In [208]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.naive_bayes import MultinomialNB as MNB
from sklearn.model_selection import train_test_split
import os
from sklearn.linear_model import LogisticRegression as LR


In [58]:
from nltk.corpus import stopwords
from nltk import TweetTokenizer, pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
import string
lem=WordNetLemmatizer()
tokenizer= TweetTokenizer()

In [59]:
train_data1 = pd.read_csv("train.csv")
test_data1 = pd.read_csv("test.csv")
train_data = train_data1.copy()
test_data = test_data1.copy()
train_data.head(),test_data.head()

(   id keyword location                                               text  \
 0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
 1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
 2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
 3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
 4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   
 
    target  
 0       1  
 1       1  
 2       1  
 3       1  
 4       1  ,
    id keyword location                                               text
 0   0     NaN      NaN                 Just happened a terrible car crash
 1   2     NaN      NaN  Heard about #earthquake is different cities, s...
 2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
 3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
 4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan)

In [241]:
awe = train_data['keyword'].value_counts()
awe

fatalities               45
armageddon               42
deluge                   42
harm                     41
damage                   41
                         ..
forest%20fire            19
epicentre                12
threat                   11
inundation               10
radiation%20emergency     9
Name: keyword, Length: 221, dtype: int64

In [60]:
text = list(train_data.text.str.lower()) ## lower case the whole data
text[:4]

['our deeds are the reason of this #earthquake may allah forgive us all',
 'forest fire near la ronge sask. canada',
 "all residents asked to 'shelter in place' are being notified by officers. no other evacuation or shelter in place orders are expected",
 '13,000 people receive #wildfires evacuation orders in california ']

In [61]:
'''text[:3]
text1 = " ".join(text)
#text1 = text1.split(" ")## all the raw data in strings
text1'''

'text[:3]\ntext1 = " ".join(text)\n#text1 = text1.split(" ")## all the raw data in strings\ntext1'

In [62]:
stops = list(set(stopwords.words("english") + list(string.punctuation)))
stops[:6]

['once', 'my', 'to', '\\', 'himself', 'that']

In [63]:
def get_simple_pos(word):
    pos = pos_tag([word])[0][1]
    if pos.startswith('J'):
        return wordnet.ADJ
    elif pos.startswith('V'):
        return wordnet.VERB
    elif pos.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

In [64]:

def clean_text(words):
    op_words = []
    words = tokenizer.tokenize(words)
    for w in words:
        w = w.lower()
        if w not in stops and w.isalpha() and len(w)>2 and not w.isnumeric() and "http" not in w and not w.startswith("@"):
            pos = pos_tag([w])
            clean_word = lem.lemmatize(w,pos = get_simple_pos(pos[0][1]))
            op_words.append(clean_word.lower())
        if w.startswith("#"):
            op_words.append(w)
    return op_words                

In [160]:
df = train_data['text']
df1 = test_data['text']
df[:2],df1[:2]

(0    Our Deeds are the Reason of this #earthquake M...
 1               Forest fire near La Ronge Sask. Canada
 Name: text, dtype: object,
 0                   Just happened a terrible car crash
 1    Heard about #earthquake is different cities, s...
 Name: text, dtype: object)

In [161]:
train_data_ = df.apply(clean_text)

In [162]:
test_data_ = df1.apply(clean_text)

In [163]:
print(train_data_[:2])
train_data_1 = [" ".join(doc) for doc in train_data_]
test_data_1 = [" ".join(doc) for doc in test_data_]
train_data_1[:3],test_data_1[:3]

0    [deed, reason, #earthquake, may, allah, forgive]
1           [forest, fire, near, ronge, sask, canada]
Name: text, dtype: object


(['deed reason #earthquake may allah forgive',
  'forest fire near ronge sask canada',
  'resident asked shelter place notified officer evacuation shelter place order expected'],
 ['happened terrible car crash',
  'heard #earthquake different city stay safe everyone',
  'forest fire spot pond goose fleeing across street cannot save'])

In [167]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [168]:
cv = TfidfVectorizer(max_features=2000,ngram_range=(1,2), tokenizer = tokenizer.tokenize)

In [219]:
x_train = cv.fit_transform(train_data_1).todense()
x_test = cv.transform(test_data_1).todense()

In [220]:
feat = cv.get_feature_names()

In [221]:
y_train = np.array(train_data['target'])
y_train[:5],type(y_train)

(array([1, 1, 1, 1, 1], dtype=int64), numpy.ndarray)

In [222]:
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
rfc.fit(x_train,y_train)
rfc.score(x_train,y_train)

0.9747799816104032

In [223]:
mnb=MNB()
mnb.fit(x_train,y_train)
mnb.score(x_train,y_train)

0.8284513332457638

In [224]:
## PCa
from sklearn.decomposition import PCA
pca = PCA()
pca.fit(x_train)
pca.explained_variance_

array([5.17332994e-03, 4.52157250e-03, 4.02207678e-03, ...,
       6.58333884e-37, 2.03036053e-37, 1.62463505e-38])

In [225]:
k = 0
total_var = sum(pca.explained_variance_)
curr_var = 0
while curr_var / total_var < 0.50:
    k += 1
    curr_var += pca.explained_variance_[k]
k    

292

In [226]:
final_pca = PCA(n_components = 2000, whiten = True)
X_train_pca = pca.fit_transform(x_train)
X_test_pca = pca.transform(x_test)

In [231]:
from sklearn.svm import SVC
svc = SVC(C = 1000)
svc.fit(X_train_pca, y_train)
svc.score(X_train_pca,y_train)

0.9747799816104032

In [228]:
tit = LR()
tit.fit(x_train,y_train)
tit.score(x_train,y_train)

0.8477604098252989

In [229]:
as1 = tit.predict(x_test)

In [230]:
as3 = mnb.predict(x_test)

In [232]:
as4 = svc.predict(X_test_pca)

In [175]:
as2 = rfc.predict(x_test)

In [233]:
as2,as3,as4

(array([1, 1, 1, ..., 1, 1, 0], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64),
 array([1, 1, 1, ..., 1, 1, 1], dtype=int64))

In [176]:
np.savetxt("kaggle.csv",as2, fmt="%s")

In [178]:
len(as2)

3263

In [179]:
index = np.arange(0,3263,1)
index[:2]

array([0, 1])

In [234]:
dict1={
    "id" :test_data["id"],
    "target" :as4
}

In [235]:
r= pd.DataFrame(dict1,columns=['id','target'])
r.reset_index(drop=True)
r.head()

Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1


In [236]:
r.to_csv("kaggle1.csv",sep=',',index=False)