#B.E. - B15 Tanvi Nirmal

#**Experiment No. 04**
#**Consider a suitable text dataset. Remove stop words, apply stemming and feature selection techniques to represent documents as vectors. Classify documents and evaluate precision, recall.**

In [110]:
import pandas as pd
import numpy as np

In [111]:
#dataset available at https://www.kaggle.com/c/nlp-getting-started/data
#contains dataset for real and fake tweets on disasters

dtf_train = pd.read_csv('/content/train.csv')
print(dtf_train.shape)
dtf_train.head

(7613, 5)


<bound method NDFrame.head of          id keyword  ...                                               text target
0         1     NaN  ...  Our Deeds are the Reason of this #earthquake M...      1
1         4     NaN  ...             Forest fire near La Ronge Sask. Canada      1
2         5     NaN  ...  All residents asked to 'shelter in place' are ...      1
3         6     NaN  ...  13,000 people receive #wildfires evacuation or...      1
4         7     NaN  ...  Just got sent this photo from Ruby #Alaska as ...      1
...     ...     ...  ...                                                ...    ...
7608  10869     NaN  ...  Two giant cranes holding a bridge collapse int...      1
7609  10870     NaN  ...  @aria_ahrary @TheTawniest The out of control w...      1
7610  10871     NaN  ...  M1.94 [01:04 UTC]?5km S of Volcano Hawaii. htt...      1
7611  10872     NaN  ...  Police investigating after an e-bike collided ...      1
7612  10873     NaN  ...  The Latest: More Homes Razed by

#**Data Preprocessing**

In [112]:
dtf_train.isnull().sum()

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64

In [113]:
#As most of the data in keyword and location columns is null and id is not needed, we can drop them.
dtf_train = dtf_train.drop(['id','location','keyword'], axis=1)

#Performing basic NLP Techniques
1. Removing unwanted words 
> *Using re to remove '#, =>, numbers, or ... etc' letters which are not required*
2. Tokenizing data
3. Transforming words to lowercase
4. Stemming and removing stopwords
> *Stop words : commonly words in english that may give misleading results.*
> *Stemming is the process of reducing inflected (or sometimes derived) words to their word stem, base or root form.*

In [114]:
#importing necessary libraries for data processing
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords               
from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [115]:
def remove_unwanted(text):
    un = re.compile(r'[^a-zA-Z]')
    return re.sub(un,' ', text)

dtf_train['clean_text'] = dtf_train['text'].apply(lambda x: remove_unwanted(x))  #Cleaning
dtf_train['tokenized'] = dtf_train['clean_text'].apply(word_tokenize)            #Tokenization
dtf_train['lower'] = dtf_train['tokenized'].apply(lambda x: [word.lower() for word in x])

pst = PorterStemmer()
dtf_train['no_stopwords'] = dtf_train['lower'].apply(lambda x: [pst.stem(word) for word in x])
dtf_train['no_stopwords'] = dtf_train['no_stopwords'].apply(lambda x: [word for word in x if word not in set(stopwords.words('english'))])

In [116]:
dtf_train.head()

Unnamed: 0,text,target,clean_text,tokenized,lower,no_stopwords
0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake M...,"[Our, Deeds, are, the, Reason, of, this, earth...","[our, deeds, are, the, reason, of, this, earth...","[deed, reason, thi, earthquak, may, allah, for..."
1,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]","[forest, fire, near, la, rong, sask, canada]"
2,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are ...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...","[resid, ask, shelter, place, notifi, offic, ev..."
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation or...,"[people, receive, wildfires, evacuation, order...","[people, receive, wildfires, evacuation, order...","[peopl, receiv, wildfir, evacu, order, califor..."
4,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, Ala...","[just, got, sent, this, photo, from, ruby, ala...","[got, sent, thi, photo, rubi, alaska, smoke, w..."


In [117]:
dtf_train['no_stopwords'] = [' '.join(map(str, l)) for l in dtf_train['no_stopwords']]

In [118]:
dtf_train.head()

Unnamed: 0,text,target,clean_text,tokenized,lower,no_stopwords
0,Our Deeds are the Reason of this #earthquake M...,1,Our Deeds are the Reason of this earthquake M...,"[Our, Deeds, are, the, Reason, of, this, earth...","[our, deeds, are, the, reason, of, this, earth...",deed reason thi earthquak may allah forgiv us
1,Forest fire near La Ronge Sask. Canada,1,Forest fire near La Ronge Sask Canada,"[Forest, fire, near, La, Ronge, Sask, Canada]","[forest, fire, near, la, ronge, sask, canada]",forest fire near la rong sask canada
2,All residents asked to 'shelter in place' are ...,1,All residents asked to shelter in place are ...,"[All, residents, asked, to, shelter, in, place...","[all, residents, asked, to, shelter, in, place...",resid ask shelter place notifi offic evacu she...
3,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation or...,"[people, receive, wildfires, evacuation, order...","[people, receive, wildfires, evacuation, order...",peopl receiv wildfir evacu order california
4,Just got sent this photo from Ruby #Alaska as ...,1,Just got sent this photo from Ruby Alaska as ...,"[Just, got, sent, this, photo, from, Ruby, Ala...","[just, got, sent, this, photo, from, ruby, ala...",got sent thi photo rubi alaska smoke wildfir p...


In [119]:
#Creating a dictionary where key refers to words and value refers to the word count
uniqueWordFrequents = {}
for t in dtf_train['no_stopwords']:
    for word in t.split():
        if(word in uniqueWordFrequents.keys()):
            uniqueWordFrequents[word] += 1
        else:
            uniqueWordFrequents[word] = 1
            
#Convert dictionary to dataFrame
uniqueWordFrequents = pd.DataFrame.from_dict(uniqueWordFrequents,orient='index',columns=['Word Frequent'])
uniqueWordFrequents.sort_values(by=['Word Frequent'], inplace=True, ascending=False)
uniqueWordFrequents.head(10)

Unnamed: 0,Word Frequent
co,4746
http,4721
thi,483
like,411
wa,395
fire,363
amp,344
get,311
ha,261
bomb,239


In [120]:
#Filtering out most frequent words
uniqueWordFrequents = uniqueWordFrequents[uniqueWordFrequents['Word Frequent'] >= 20]
uniqueWordFrequents

Unnamed: 0,Word Frequent
co,4746
http,4721
thi,483
like,411
wa,395
...,...
captur,20
polit,20
creat,20
radio,20


In [121]:
#Count Vectorization - creates a sparse matrix
from sklearn.feature_extraction.text import CountVectorizer

countVec = CountVectorizer(max_features = uniqueWordFrequents.shape[0])
bagOfWords = countVec.fit_transform(dtf_train['no_stopwords']).toarray()

In [122]:
bagOfWords

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0]])

In [123]:
from sklearn.model_selection import train_test_split

X = bagOfWords
y = dtf_train['target']
print("X shape = ",X.shape)
print("y shape = ",y.shape)

X_train , X_test , y_train , y_test = train_test_split(X,y,test_size=0.20, random_state=70, shuffle =True)

X shape =  (7613, 800)
y shape =  (7613,)


#**Feature Selection**

In [124]:
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn import svm

In [125]:
#Trying various feature selection algorithms
LR = LogisticRegression()
SVM = svm.SVC(kernel='linear')
MNB = MultinomialNB()

LR.fit(X_train,y_train)
SVM.fit(X_train,y_train)
MNB.fit(X_train,y_train)

MultinomialNB()

In [126]:
predictLR = LR.predict(X_test)
predictSVM = SVM.predict(X_test)
predictMNB = MNB.predict(X_test)

#**Performance Analysis**

In [127]:
from sklearn.metrics import classification_report

print("Logistic Regression : \n\n"+classification_report(y_test,predictLR))
print("\n\nSupport Vector Machine : \n\n"+classification_report(y_test,predictSVM))
print("\n\nMultinomial Naive Bayes : \n\n"+classification_report(y_test,predictMNB))

Logistic Regression : 

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       875
           1       0.80      0.73      0.76       648

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.80      1523
weighted avg       0.80      0.80      0.80      1523



Support Vector Machine : 

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       875
           1       0.80      0.72      0.75       648

    accuracy                           0.80      1523
   macro avg       0.80      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



Multinomial Naive Bayes : 

              precision    recall  f1-score   support

           0       0.80      0.82      0.81       875
           1       0.75      0.73      0.74       648

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.77  