#Tweet Disaster Detection

In [188]:
import pandas as pd # data frame
import numpy as np # array processing
from nltk.tokenize import word_tokenize # Tokenizing words
from nltk.corpus import stopwords  # stopwords
from nltk.stem import WordNetLemmatizer  #Lemmetization
lem = WordNetLemmatizer() #Lemmetization object
from sklearn.model_selection import train_test_split # Test train split
from sklearn.linear_model import LogisticRegression # classification model feature Base methord
from sklearn.svm import LinearSVC #Classification model feature Base Methord
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report,confusion_matrix # Verification metrics
from wordcloud import WordCloud # wordcloud
from sklearn.feature_extraction.text import TfidfVectorizer #word to vector
import matplotlib.pyplot as plt #plot
import seaborn as sns #plot 
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
stopWords = set(stopwords.words('english')) #Downloading Stopwords in stopWords

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [189]:
df=pd.read_csv('https://raw.githubusercontent.com/laxmimerit/twitter-disaster-prediction-dataset/master/train.csv')
df.head()




Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [190]:
df.shape

(7613, 5)

In [191]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


#Text Preprocessing

In [192]:
contractions = {
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he'll've": "he will have",
"he's": "he is",
"how'd": "how did",
"how'd'y": "how do you",
"how'll": "how will",
"how's": "how does",
"i'd": "i would",
"i'd've": "i would have",
"i'll": "i will",
"i'll've": "i will have",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'd've": "it would have",
"it'll": "it will",
"it'll've": "it will have",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"mightn't've": "might not have",
"must've": "must have",
"mustn't": "must not",
"mustn't've": "must not have",
"needn't": "need not",
"needn't've": "need not have",
"o'clock": "of the clock",
"oughtn't": "ought not",
"oughtn't've": "ought not have",
"shan't": "shall not",
"sha'n't": "shall not",
"shan't've": "shall not have",
"she'd": "she would",
"she'd've": "she would have",
"she'll": "she will",
"she'll've": "she will have",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"shouldn't've": "should not have",
"so've": "so have",
"so's": "so is",
"that'd": "that would",
"that'd've": "that would have",
"that's": "that is",
"there'd": "there would",
"there'd've": "there would have",
"there's": "there is",
"they'd": "they would",
"they'd've": "they would have",
"they'll": "they will",
"they'll've": "they will have",
"they're": "they are",
"they've": "they have",
"to've": "to have",
"wasn't": "was not",
" u ": " you ",
" ur ": " your ",
" n ": " and ",
"won't": "would not",
"wouldn't": "would not",
'dis': 'this',
'bak': 'back',
'brng': 'bring'}

In [193]:
def cont_to_exp(x):
  if x in contractions.keys():
    value=contractions[x]
  else:
    value=x
  return value

In [194]:
def preprocess(x):

  x = x.lower()
  x = ' '.join(list((cont_to_exp(t) for t in x.split())))
  x = re.sub('[^a-zA-Z]', ' ',x)
  x = re.sub('(http|https|ftp|ssh)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?','', x)
  x = re.sub('([a-z0-9+._-]+@[a-z0-9+._-]+\.[a-z0-9+_-]+)','',x)
  x = re.sub('[\W+]', ' ', x)
  x = ' '.join(list((lem.lemmatize(t) for t in x.split() if t not in stopWords)))
  x = ' '.join(list((t for t in x.split() if len(t)>1)))
  return x

In [195]:
df['pro_text']=df['text'].apply(lambda x :  preprocess(x))

In [196]:
X1=df['pro_text']
y1=df['target']

#TFIDF Vectoriser

In [197]:
tf=TfidfVectorizer(max_features=3000)
text_tf= tf.fit_transform(X1)

In [198]:
X, X_test, y, y_test = train_test_split(text_tf, y1, test_size=0.2, random_state=0)

#Support Vector Classifier

In [199]:
svc=LinearSVC()
svc.fit(X,y)
y_pred_svc = svc.predict(X_test)

In [200]:
print(classification_report(y_test,y_pred_svc))

              precision    recall  f1-score   support

           0       0.81      0.86      0.83       886
           1       0.78      0.72      0.75       637

    accuracy                           0.80      1523
   macro avg       0.79      0.79      0.79      1523
weighted avg       0.80      0.80      0.80      1523



#Logistic Regression

In [201]:
lr=LogisticRegression()
lr.fit(X,y)
y_pred_lr=lr.predict(X_test)

In [202]:
print(classification_report(y_test,y_pred_lr))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       886
           1       0.82      0.67      0.74       637

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.79      1523
weighted avg       0.80      0.80      0.80      1523



#Random Forest Classifier

In [203]:
rfc = RandomForestClassifier(criterion='entropy', n_estimators=200).fit(X, y)
y_pred_rfc=rfc.predict(X_test)

In [204]:
print(classification_report(y_test,y_pred_rfc))

              precision    recall  f1-score   support

           0       0.77      0.91      0.84       886
           1       0.83      0.63      0.72       637

    accuracy                           0.79      1523
   macro avg       0.80      0.77      0.78      1523
weighted avg       0.80      0.79      0.79      1523



# Word2Vec

In [205]:
import spacy

In [206]:
# !python -m spacy download en_core_web_lg

In [207]:
# spacy.cli.download("en_core_web_lg")
nlp=spacy.load('en_core_web_lg')

In [208]:
def get_vec(x):
  doc = nlp(x)
  vec = doc.vector
  return vec

In [209]:
df['vec']=df['text'].apply(lambda x : get_vec(str(x)))

In [210]:
df['vec']

0       [-0.27897874, 0.103925645, -0.09989621, -0.033...
1       [-0.020768244, 0.053068247, -0.15192825, -0.27...
2       [-0.015307656, 0.06607368, -0.09196319, -0.087...
3       [-0.16429922, 0.2463819, 0.07442567, -0.197739...
4       [-0.10624417, 0.06461317, 0.05314888, 0.088013...
                              ...                        
7608    [0.12572163, -0.17250119, -0.21099047, 0.06948...
7609    [-0.028085582, 0.12006733, -0.0071035903, -0.1...
7610    [0.09417391, 0.07030873, 0.006694543, -0.23429...
7611    [-0.019305991, 0.18637651, -0.21350445, 0.0527...
7612    [-0.08053643, 0.21419215, -0.022203717, -0.074...
Name: vec, Length: 7613, dtype: object

To increase the process speed 

In [211]:
X=df['vec'].to_numpy()
X.shape

(7613,)

reshape to one column and multiple rows

In [212]:
X=X.reshape(-1,1)
X.shape

(7613, 1)

In [213]:
X=np.concatenate(np.concatenate(X,axis=0),axis=0).reshape(-1,300)
X

array([[-0.27897874,  0.10392565, -0.09989621, ...,  0.04564636,
        -0.06325858, -0.05639894],
       [-0.02076824,  0.05306825, -0.15192825, ..., -0.05474687,
        -0.2549715 ,  0.11622399],
       [-0.01530766,  0.06607368, -0.09196319, ..., -0.0348906 ,
         0.07479551,  0.0337687 ],
       ...,
       [ 0.09417391,  0.07030873,  0.00669454, ...,  0.05192918,
        -0.00719345,  0.03631673],
       [-0.01930599,  0.18637651, -0.21350445, ..., -0.07282014,
         0.03912553,  0.07439488],
       [-0.08053643,  0.21419215, -0.02220372, ..., -0.24513498,
        -0.03821203,  0.16831529]], dtype=float32)

In [214]:
y=df['target']

In [215]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0,stratify=y)
X_train.shape,X_test.shape

((6090, 300), (1523, 300))

#Support Vector Classification

In [216]:
wsvc=LinearSVC()

In [217]:
wsvc.fit(X_train,y_train)
y_pred_wsvc= wsvc.predict(X_test)

In [218]:
print(classification_report(y_test,y_pred_wsvc))

              precision    recall  f1-score   support

           0       0.81      0.88      0.84       869
           1       0.82      0.72      0.76       654

    accuracy                           0.81      1523
   macro avg       0.81      0.80      0.80      1523
weighted avg       0.81      0.81      0.81      1523



#Logistic Regression

In [219]:
wlr=LogisticRegression()
wlr.fit(X_train,y_train)
y_pred_wlr=wlr.predict(X_test)

In [220]:
print(classification_report(y_test,y_pred_wlr))

              precision    recall  f1-score   support

           0       0.82      0.88      0.85       869
           1       0.82      0.74      0.78       654

    accuracy                           0.82      1523
   macro avg       0.82      0.81      0.81      1523
weighted avg       0.82      0.82      0.82      1523



#Random Forest Classifier

In [221]:
wrfc = RandomForestClassifier(criterion='entropy', n_estimators=200).fit(X_train, y_train)
y_pred_wrfc=wrfc.predict(X_test)

In [222]:
print(classification_report(y_test,y_pred_wrfc))

              precision    recall  f1-score   support

           0       0.79      0.90      0.84       869
           1       0.83      0.68      0.75       654

    accuracy                           0.80      1523
   macro avg       0.81      0.79      0.79      1523
weighted avg       0.81      0.80      0.80      1523



# Processed Text

In [223]:
df['vec1']=df['pro_text'].apply(lambda x : get_vec(str(x)))

In [224]:
Y1=df['target']

In [225]:
X1=df['vec1'].to_numpy()
X1=X1.reshape(-1,1)
X1.shape

(7613, 1)

In [226]:
X1=np.concatenate(np.concatenate(X1,axis=0),axis=0).reshape(-1,300)
X1.shape

(7613, 300)

In [227]:
X_train,X_test,y_train,y_test=train_test_split(X1,Y1,test_size=0.2,random_state=0,stratify=Y1)
X_train.shape,X_test.shape

((6090, 300), (1523, 300))

#Support Vector Classification

In [228]:
wsvc1=LinearSVC()

In [229]:
wsvc1.fit(X_train,y_train)
y_pred_wsvc1= wsvc1.predict(X_test)



In [230]:
print(classification_report(y_test,y_pred_wsvc1))

              precision    recall  f1-score   support

           0       0.80      0.85      0.82       869
           1       0.78      0.71      0.75       654

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.79      1523
weighted avg       0.79      0.79      0.79      1523



#Logistic Regression

In [231]:
wlr1=LogisticRegression()
wlr1.fit(X_train,y_train)
y_pred_wlr1=wlr1.predict(X_test)

In [232]:
print(classification_report(y_test,y_pred_wlr1))

              precision    recall  f1-score   support

           0       0.80      0.85      0.83       869
           1       0.79      0.72      0.75       654

    accuracy                           0.79      1523
   macro avg       0.79      0.78      0.79      1523
weighted avg       0.79      0.79      0.79      1523



#Random Forest Classifier

In [233]:
wrfc1 = RandomForestClassifier(criterion='entropy', n_estimators=200).fit(X_train, y_train)
y_pred_wrfc1=wrfc1.predict(X_test)

In [234]:
print(classification_report(y_test,y_pred_wrfc1))

              precision    recall  f1-score   support

           0       0.78      0.89      0.83       869
           1       0.82      0.67      0.74       654

    accuracy                           0.80      1523
   macro avg       0.80      0.78      0.79      1523
weighted avg       0.80      0.80      0.79      1523

