## Imports and Downloads

In [10]:
import pandas as pd
import nltk
import string

In [17]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\arksi\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\arksi\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [21]:
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\arksi\AppData\Roaming\nltk_data...


True

---
## 1. Read the dataset 

In [41]:
df = pd.read_csv('./disaster_tweets_data.csv') 

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


---
## 2. Remove handle null values (if any).

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   tweets  7613 non-null   object
 1   target  7613 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 119.1+ KB


**Observation:** There is no null values in the dataset

---
## 3. Preprocess the disaster tweets data based on the following parameter:

### 1. Tokenizing words

In [44]:
def tokenize_it(text) : return nltk.word_tokenize(text)
df['tweets'] = df['tweets'].apply(tokenize_it)
df.head()

Unnamed: 0,tweets,target
0,"[Our, Deeds, are, the, Reason, of, this, #, ea...",1
1,"[Forest, fire, near, La, Ronge, Sask, ., Canada]",1
2,"[All, residents, asked, to, 'shelter, in, plac...",1
3,"[13,000, people, receive, #, wildfires, evacua...",1
4,"[Just, got, sent, this, photo, from, Ruby, #, ...",1


### 2. Convert words to lower case

In [45]:
def to_lower(tokens) : return [word.lower() for word in tokens]
df['tweets'] = df['tweets'].apply(to_lower)

In [46]:
df.head()

Unnamed: 0,tweets,target
0,"[our, deeds, are, the, reason, of, this, #, ea...",1
1,"[forest, fire, near, la, ronge, sask, ., canada]",1
2,"[all, residents, asked, to, 'shelter, in, plac...",1
3,"[13,000, people, receive, #, wildfires, evacua...",1
4,"[just, got, sent, this, photo, from, ruby, #, ...",1


### 3. Removing Punctuations

In [47]:
def remove_punc(tokens) : return [token for token in tokens if token not in string.punctuation] 
df['tweets'] = df['tweets'].apply(remove_punc)

In [48]:
df.head()

Unnamed: 0,tweets,target
0,"[our, deeds, are, the, reason, of, this, earth...",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[all, residents, asked, to, 'shelter, in, plac...",1
3,"[13,000, people, receive, wildfires, evacuatio...",1
4,"[just, got, sent, this, photo, from, ruby, ala...",1


### 4. Removing Stop words

In [49]:
from nltk.corpus import stopwords 
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens) : return [token for token in tokens if token not in stop_words] 

df['tweets'] = df['tweets'].apply(remove_stopwords)

In [50]:
df.head()

Unnamed: 0,tweets,target
0,"[deeds, reason, earthquake, may, allah, forgiv...",1
1,"[forest, fire, near, la, ronge, sask, canada]",1
2,"[residents, asked, 'shelter, place, notified, ...",1
3,"[13,000, people, receive, wildfires, evacuatio...",1
4,"[got, sent, photo, ruby, alaska, smoke, wildfi...",1


### 5. Stemming or lemmatizing the words 

#### 1. Stemming

In [51]:
from nltk.stem import PorterStemmer 
stemmer = PorterStemmer() 

def stemming_tokens(tokens): return [stemmer.stem(token) for token in tokens]

df['tweets'] = df['tweets'].apply(stemming_tokens) 

In [52]:
df.head()

Unnamed: 0,tweets,target
0,"[deed, reason, earthquak, may, allah, forgiv, us]",1
1,"[forest, fire, near, la, rong, sask, canada]",1
2,"[resid, ask, 'shelter, place, notifi, offic, e...",1
3,"[13,000, peopl, receiv, wildfir, evacu, order,...",1
4,"[got, sent, photo, rubi, alaska, smoke, wildfi...",1


#### 2. Lemmatizing

In [53]:
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\arksi\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [54]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer() 

def lemmatizing_tokens(tokens): return [lemmatizer.lemmatize(token) for token in tokens] 

df['tweets'] = df['tweets'].apply(lemmatizing_tokens) 

In [55]:
df.head()

Unnamed: 0,tweets,target
0,"[deed, reason, earthquak, may, allah, forgiv, u]",1
1,"[forest, fire, near, la, rong, sask, canada]",1
2,"[resid, ask, 'shelter, place, notifi, offic, e...",1
3,"[13,000, peopl, receiv, wildfir, evacu, order,...",1
4,"[got, sent, photo, rubi, alaska, smoke, wildfi...",1


---
## 4. Transform the words into vectors using Count Vectorizer or TF-IDF Vectorizer


In [58]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english')

vectors = vectorizer.fit_transform(df['tweets'].apply(' '.join)) 

---
## 5. Select x(independent feature) as tweets after preprocessing and target as y(dependent feature).

In [60]:
x = vectors
y = df['target']

--- 
## 6. Split data into training and test data.

In [61]:
from sklearn.model_selection import train_test_split 
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

---
## 7. Apply the following models on the training dataset 
    1. Multinomial Naïve Bayes Classification
    2. Logistic Regression 
    3. KNN Classification

In [65]:
from sklearn.naive_bayes import MultinomialNB
MultinomialNB_model = MultinomialNB() 
MultinomialNB_model.fit(X=x_train, y=y_train) 

MultinomialNB()

In [66]:
from sklearn.linear_model import LogisticRegression 
LogisticRegression_model = LogisticRegression()
LogisticRegression_model.fit(x_train, y_train)

LogisticRegression()

In [67]:
from sklearn.neighbors import KNeighborsClassifier 
KNeighborsClassifier_model = KNeighborsClassifier() 
KNeighborsClassifier_model.fit(x_train, y_train)

KNeighborsClassifier()

--- 
## 8. Predict the target for test data
    1. Multinomial Naïve Bayes Classification
    2. Logistic Regression 
    3. KNN Classification

In [68]:
y_pred_MultinomialNB = MultinomialNB_model.predict(x_test)
y_pred_LogisticRegression = LogisticRegression_model.predict(x_test)
y_pred_KNeighborsClassifier = KNeighborsClassifier_model.predict(x_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


--- 
## 9. Compute Confusion matrix and classification report for each of these models 
    1. Multinomial Naïve Bayes Classification
    2. Logistic Regression 
    3. KNN Classification


In [69]:
from sklearn.metrics import confusion_matrix, classification_report
def report(y_pred) : 
    print("confusion_matrix :")
    print(confusion_matrix(y_test, y_pred) )
    print("\nClassification Report: ")
    print(classification_report(y_test, y_pred))
    print('')

In [70]:
print("REPORT of Multinomial Naïve Bayes Classification Model") 
report(y_pred_MultinomialNB)

REPORT of Multinomial Naïve Bayes Classification Model
confusion_matrix :
[[811  82]
 [224 406]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.78      0.91      0.84       893
           1       0.83      0.64      0.73       630

    accuracy                           0.80      1523
   macro avg       0.81      0.78      0.78      1523
weighted avg       0.80      0.80      0.79      1523



In [71]:
print('REPORT of Logistic Regression Model') 
report(y_pred_LogisticRegression)

REPORT of Logistic Regression Model
confusion_matrix :
[[819  74]
 [239 391]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.77      0.92      0.84       893
           1       0.84      0.62      0.71       630

    accuracy                           0.79      1523
   macro avg       0.81      0.77      0.78      1523
weighted avg       0.80      0.79      0.79      1523



In [72]:
print("REPORT of KNN Classification Model") 
report(y_pred_KNeighborsClassifier)

REPORT of KNN Classification Model
confusion_matrix :
[[820  73]
 [293 337]]

Classification Report: 
              precision    recall  f1-score   support

           0       0.74      0.92      0.82       893
           1       0.82      0.53      0.65       630

    accuracy                           0.76      1523
   macro avg       0.78      0.73      0.73      1523
weighted avg       0.77      0.76      0.75      1523



--- 
## 10. Report the model with the best accuracy.

Best Accuracy is given by **Multinomial Naïve Bayes Classification Model** which is **80 %**