In [1]:
import numpy as np
import pandas as pd
import matplotlib as plt
import os

import nltk
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

import sklearn.feature_extraction as fe
from sklearn.naive_bayes import MultinomialNB

[nltk_data] Downloading package wordnet to /usr/share/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
%pwd
%cd /
%cd usr/share/nltk_data/corpora
! unzip "/usr/share/nltk_data/corpora/wordnet.zip"

/
/usr/share/nltk_data/corpora
Archive:  /usr/share/nltk_data/corpora/wordnet.zip
   creating: wordnet/
  inflating: wordnet/lexnames        
  inflating: wordnet/data.verb       
  inflating: wordnet/index.adv       
  inflating: wordnet/adv.exc         
  inflating: wordnet/index.verb      
  inflating: wordnet/cntlist.rev     
  inflating: wordnet/data.adj        
  inflating: wordnet/index.adj       
  inflating: wordnet/LICENSE         
  inflating: wordnet/citation.bib    
  inflating: wordnet/noun.exc        
  inflating: wordnet/verb.exc        
  inflating: wordnet/README          
  inflating: wordnet/index.sense     
  inflating: wordnet/data.noun       
  inflating: wordnet/data.adv        
  inflating: wordnet/index.noun      
  inflating: wordnet/adj.exc         


In [3]:
train_data = pd.read_csv('/kaggle/input/nlp-getting-started/train.csv')
test_data = pd.read_csv('/kaggle/input/nlp-getting-started/test.csv')

print(f'Train data:\n{train_data.head()}')
print()
print(f'Test data:\n{test_data.head()}')

Train data:
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  

Test data:
   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiw

In [4]:
stop_words = nltk.corpus.stopwords.words('english')

count_vec = fe.text.CountVectorizer(stop_words=stop_words)
tfidf_transformer = fe.text.TfidfTransformer()

lemmatizer = WordNetLemmatizer()

In [5]:
lemm_train_data = [lemmatizer.lemmatize(word) for word in train_data['text']]
lemm_test_data = [lemmatizer.lemmatize(word) for word in test_data['text']]


train_tok = count_vec.fit_transform(lemm_train_data)
tf_train_tok = tfidf_transformer.fit_transform(train_tok)

print(tf_train_tok.shape)


test_tok = count_vec.transform(lemm_test_data)
tf_test_tok = tfidf_transformer.transform(test_tok)

print(tf_test_tok.shape)

(7613, 21498)
(3263, 21498)


In [6]:
feature_names_tf = tfidf_transformer.get_feature_names_out()
tfidf_scores = tf_train_tok.toarray()[0]

dictionary = count_vec.vocabulary_
dictionary = dict(sorted(dictionary.items(), key=lambda x: x[1]))

In [7]:
clf = MultinomialNB().fit(tf_train_tok, train_data['target'])

In [8]:
predicted = clf.predict(tf_test_tok)

sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")
sample_submission.head()

Unnamed: 0,id,target
0,0,0
1,2,0
2,3,0
3,9,0
4,11,0


In [9]:
sample_submission["target"] = predicted
sample_submission.head()

Unnamed: 0,id,target
0,0,1
1,2,0
2,3,1
3,9,1
4,11,1


In [10]:
sample_submission.to_csv("/kaggle/working/submission.csv", index=False)