# Predict which tweets are about real disasters

Name: Nivetha M<br>Roll No: 215221926

## 1. Import and read data

In [1]:
import numpy as np
import pandas as pd
import nltk
import re
import sys
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import warnings
warnings.filterwarnings('ignore')

In [2]:
print('Python {}'.format(sys.version))
print('Numpy {}'.format(np.__version__))
print('Panda {}'.format(pd.__version__))
print('NLTK {}'.format(nltk.__version__))
print('Seaborn {}'.format(sns.__version__))

Python 3.7.12 | packaged by conda-forge | (default, Oct 26 2021, 06:08:53) 
[GCC 9.4.0]
Numpy 1.20.3
Panda 1.3.5
NLTK 3.2.4
Seaborn 0.11.2


### 1.1 Load Dataset

In [2]:
df_train = pd.read_csv('train.csv')

In [3]:
print(df_train.info())
print('===================================')
print(df_train.head())
print('===================================')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB
None
   id keyword location                                               text  \
0   1     NaN      NaN  Our Deeds are the Reason of this #earthquake M...   
1   4     NaN      NaN             Forest fire near La Ronge Sask. Canada   
2   5     NaN      NaN  All residents asked to 'shelter in place' are ...   
3   6     NaN      NaN  13,000 people receive #wildfires evacuation or...   
4   7     NaN      NaN  Just got sent this photo from Ruby #Alaska as ...   

   target  
0       1  
1       1  
2       1  
3       1  
4       1  


**Since the id, keyword and location are not important so to drop them**

In [4]:
df_train.drop(['id','keyword','location'], axis = 1, inplace=True)

### 1.2 Average Tweet Length

In [5]:
def avg_word(sentence):
    words = sentence.split()
    return (sum(len(word) for word in words)/len(words))

In [6]:
df_train['avg_word'] = df_train['text'].apply(lambda x: avg_word(x))

In [7]:
print('Disaster Tweets')
print('=================================')
Disaster = df_train[df_train.target==1]
Disaster.head()

Disaster Tweets


Unnamed: 0,text,target,avg_word
0,Our Deeds are the Reason of this #earthquake M...,1,4.384615
1,Forest fire near La Ronge Sask. Canada,1,4.571429
2,All residents asked to 'shelter in place' are ...,1,5.090909
3,"13,000 people receive #wildfires evacuation or...",1,7.125
4,Just got sent this photo from Ruby #Alaska as ...,1,4.5


In [8]:
print('Non-Disaster Tweets')
print('=================================')
Non_Disaster = df_train[df_train.target==0]
Non_Disaster.head()

Non-Disaster Tweets


Unnamed: 0,text,target,avg_word
15,What's up man?,0,4.0
16,I love fruits,0,3.666667
17,Summer is lovely,0,4.666667
18,My car is so fast,0,2.6
19,What a goooooooaaaaaal!!!!!!,0,8.666667


### 1.3 Class Distribution

In [9]:
classes = df_train.loc[:,'target']
print(classes.value_counts())

0    4342
1    3271
Name: target, dtype: int64


## 2. Preprocessing

### 2.1 Lowercase

In [10]:
df_train['text'] = df_train['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))
df_train['text'].head()

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

### 2.2 Replace URLs

In [11]:
df_train['text'] = df_train['text'].str.replace(r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?', ' ')
df_train['text'].head()

0    our deeds are the reason of this #earthquake m...
1               forest fire near la ronge sask. canada
2    all residents asked to 'shelter in place' are ...
3    13,000 people receive #wildfires evacuation or...
4    just got sent this photo from ruby #alaska as ...
Name: text, dtype: object

### 2.3 Special Characters Removal

In [12]:
df_train['text']= df_train['text'].str.replace('rt ',"").str.replace('@','').str.replace('#','').str.replace('[^\w\s]','').str.replace('[1-9]','')
df_train['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    000 people receive wildfires evacuation orders...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

### 2.4 Removal of Numbers

In [13]:
df_train['text'] = df_train['text'].str.replace(r'\d+(\.\d+)?','')
df_train['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3     people receive wildfires evacuation orders in...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

### 2.5 Removal punctuation, leading, trailing and in between whitespace

In [14]:
df_train['text'] = df_train['text'].str.replace(r'[^\w\d\s]',' ')
df_train['text'] = df_train['text'].str.replace(r'^\s+|\s+?$', '')
df_train['text'] = df_train['text'].str.replace(r'\s+',' ')
df_train['text'].head()

0    our deeds are the reason of this earthquake ma...
1                forest fire near la ronge sask canada
2    all residents asked to shelter in place are be...
3    people receive wildfires evacuation orders in ...
4    just got sent this photo from ruby alaska as s...
Name: text, dtype: object

### 2.6 Removing Stopwords

In [15]:
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

df_train['text'] = df_train['text'].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))
df_train['text'].head()

0         deeds reason earthquake may allah forgive us
1                forest fire near la ronge sask canada
2    residents asked shelter place notified officer...
3    people receive wildfires evacuation orders cal...
4    got sent photo ruby alaska smoke wildfires pou...
Name: text, dtype: object

### 2.7 Stemming

In [16]:
from nltk.stem import PorterStemmer

st = PorterStemmer()
df_train['text']=df_train['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
df_train['text'].head()

0            deed reason earthquak may allah forgiv us
1                 forest fire near la rong sask canada
2    resid ask shelter place notifi offic evacu she...
3          peopl receiv wildfir evacu order california
4    got sent photo rubi alaska smoke wildfir pour ...
Name: text, dtype: object

### 2.8 Convert a Collection of Text Documents to a Matrix of Token Counts

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(df_train.text).toarray()
y = df_train.iloc[:, 1].values

In [18]:
print(X)
print('=============================')
print(y)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 1 ... 0 0 0]]
[1 1 1 ... 1 1 1]


## 3. Modeling

In [19]:
from sklearn.model_selection import train_test_split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [21]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix

### 3.1 Define Models to Train

**Here all selected models are with their default parameters**

In [22]:
names = ["K Nearest Neighbors", "Decision Tree", "Random Forest", "Logistic Regression", "SGD Classifier",
         "Naive Bayes", "SVM Linear"]

classifiers = [
    KNeighborsClassifier(),
    DecisionTreeClassifier(),
    RandomForestClassifier(),
    LogisticRegression(),
    SGDClassifier(max_iter = 100),
    MultinomialNB(),
    SVC(kernel = 'linear')
]

models = zip(names, classifiers)

### 3.2 Comparing Accuracy of Different Models

In [23]:
for name, model in models:
    nltk_model = model
    nltk_model.fit(X_train,y_train)
    accuracy = nltk_model.score(X_test, y_test)*100
    print("{} Accuracy: {}".format(name, accuracy))

K Nearest Neighbors Accuracy: 72.6854891661195
Decision Tree Accuracy: 74.32698621142481
Random Forest Accuracy: 78.26657912015759
Logistic Regression Accuracy: 78.72619829284308
SGD Classifier Accuracy: 77.8726198292843
Naive Bayes Accuracy: 77.74130006565989
SVM Linear Accuracy: 77.34734077478662


### 3.3 Selected Model

In [24]:
selected_classifier =  RandomForestClassifier()
selected_classifier.fit(X_train, y_train)

RandomForestClassifier()

In [25]:
prediction = selected_classifier.predict(X_test)

### 3.4 Print Classification Report and Confusion Matrix

In [27]:
print(classification_report(y_test, prediction))

pd.DataFrame(
    confusion_matrix(y_test, prediction),
    index = [['actual', 'actual'], ['Non_Disaster', 'Disaster']],
    columns = [['predicted', 'predicted'], ['Non_Disaster', 'Disaster']])

              precision    recall  f1-score   support

           0       0.79      0.85      0.82       886
           1       0.76      0.69      0.73       637

    accuracy                           0.78      1523
   macro avg       0.78      0.77      0.77      1523
weighted avg       0.78      0.78      0.78      1523



Unnamed: 0_level_0,Unnamed: 1_level_0,predicted,predicted
Unnamed: 0_level_1,Unnamed: 1_level_1,Non_Disaster,Disaster
actual,Non_Disaster,750,136
actual,Disaster,195,442


## 4. Test Dataset

### 4.1 Preprocessing of Dataset

In [28]:
df_test = pd.read_csv('test.csv')

In [29]:
print(df_test.head())

   id keyword location                                               text
0   0     NaN      NaN                 Just happened a terrible car crash
1   2     NaN      NaN  Heard about #earthquake is different cities, s...
2   3     NaN      NaN  there is a forest fire at spot pond, geese are...
3   9     NaN      NaN           Apocalypse lighting. #Spokane #wildfires
4  11     NaN      NaN      Typhoon Soudelor kills 28 in China and Taiwan


In [30]:
df_test.drop(['id','keyword','location'], axis = 1, inplace=True)

In [31]:
#---Lowercase-------
df_test['text'] = df_test['text'].apply(lambda x: " ".join(x.lower() for x in x.split()))

#---Replace URLs----
df_train['text'] = df_test['text'].str.replace(r'(http|ftp|https):\/\/[\w\-_]+(\.[\w\-_]+)+([\w\-\.,@?^=%&amp;:/~\+#]*[\w\-\@?^=%&amp;/~\+#])?', ' ')

#---Special Characters Removal---
df_test['text']= df_test['text'].str.replace('rt ',"").str.replace('@','').str.replace('#','').str.replace('[^\w\s]','').str.replace('[1-9]','')

#---Removal of Numbers------
df_train['text'] = df_train['text'].str.replace(r'\d+(\.\d+)?','')

#----Removal punctuation, leadign, trailing and in between whitespace----
df_test['text'] = df_test['text'].str.replace(r'[^\w\d\s]',' ')
df_test['text'] = df_test['text'].str.replace(r'^\s+|\s+?$', '')
df_test['text'] = df_test['text'].str.replace(r'\s+',' ')

#-----Removing Stopwords-------
df_test['text'] = df_test['text'].apply(lambda x: ' '.join(term for term in x.split() if term not in stop_words))

#-----Stemming--------------------
df_test['text']=df_test['text'].apply(lambda x: " ".join([st.stem(word) for word in x.split()]))
df_test['text'].head()

0                             happen terribl car crash
1        heard earthquak differ citi stay safe everyon
2    forest fire spot pond gees flee across street ...
3                       apocalyps light spokan wildfir
4                   typhoon soudelor kill china taiwan
Name: text, dtype: object

In [32]:
df_test = cv.fit_transform(df_test.text).toarray()

In [33]:
final_predictions = selected_classifier.predict(df_test)