In [1]:
# Importing libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 

import nltk
from sklearn.feature_extraction.text import CountVectorizer,TfidfTransformer

import warnings
warnings.filterwarnings(action='ignore',category=FutureWarning)
%matplotlib inline

In [2]:
# Importing the dataset

dataset = pd.read_csv('C:/Users/vaibh/Desktop/360 Digitmg/Naive Bayes/Assignment/Disaster_tweets_NB.csv')

In [3]:
# Checking rows and shape in the dataset
dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
dataset.shape

(7613, 5)

In [5]:
# Features in the dataset
dataset.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [6]:
# Info of the dataset
dataset.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
# Using relevant features in the dataset
X = dataset.text
Y = dataset.target

In [8]:
# Checking for null values
X.isnull().sum(),Y.isnull().sum()

(0, 0)

There are no null values

In [9]:
# Cleaning the data 
import re 

def clean_data(data):
    data = re.sub('@\w+', ' ',data) # Remove @
    data = re.sub(r'http\S+',' ',data) # Remove links
    data = re.sub('[^a-zA-Z]+',' ',data).lower() # Keep only words 
    return data

In [10]:
# Applying clean_data function
X = X.apply(clean_data)

In [11]:
from nltk.corpus import stopwords
stop_word = stopwords.words('english')

# Removing stopwords from dataset and words with length less than 4
def clean_stop_len(data):
    new_text = []
    for i in data:
        temp = []
        for j in i.split():
            if len(j) not in [0,1,2,3]:
                if j not in stop_word:
                    temp.append(j)
        new_temp = " ".join(temp)
        new_text.append(new_temp)
    return new_text

In [12]:
# Applying the clean_stop_len function 
X = pd.DataFrame(clean_stop_len(X),columns = ['text'])

In [13]:
data = pd.concat([X,Y],axis = 1)

In [14]:
# removing empty rows
data = data.loc[data['text'] != "",:]

In [15]:
# Splitting the data into train test 

from sklearn.model_selection import train_test_split

data_train, data_test = train_test_split(data, test_size = 0.2, random_state=12)

In [16]:
# Converting data to bag of words

def data2bow(data):
    return [j for i in data for j in i.split()]

In [17]:
# Using Word Count 
cv = CountVectorizer(analyzer=data2bow).fit(data.text)

data_cv = cv.transform(data.text)
train_cv = cv.transform(data_train.text)
test_cv = cv.transform(data_test.text)

In [18]:
# Weighing the data and transforming train and test data
tfidf = TfidfTransformer()
data_tfidf = tfidf.fit(data_cv)
train_tfidf = tfidf.transform(train_cv)
test_tfidf = tfidf.transform(test_cv)

In [19]:
data_train.shape, train_tfidf.shape, data_test.shape, test_tfidf.shape

((6080, 2), (6080, 26), (1520, 2), (1520, 26))

In [20]:
# Using MultinomialNB Classifier

from sklearn.naive_bayes import BernoulliNB
bnb = BernoulliNB()

# Fitting the model to train data set
bnb.fit(train_tfidf, data_train.target)

BernoulliNB()

In [21]:
# Predicting the target using model

yhat = bnb.predict(test_tfidf)

In [22]:
# Checking Testing data accuracy score

from sklearn.metrics import accuracy_score
print('Accuracy Score: {} %'.format(accuracy_score(data_test.target,yhat)*100))

Accuracy Score: 58.09210526315789 %


In [23]:
# Confusion Matrix for testing data

from sklearn.metrics import confusion_matrix
confusion_matrix(data_test.target,yhat)

array([[453, 427],
       [210, 430]], dtype=int64)

In [24]:
# Checking Training data accuracy score for overfitting

yhat_train = bnb.predict(train_tfidf)

from sklearn.metrics import accuracy_score
print('Accuracy Score: {} %'.format(accuracy_score(data_train.target,yhat_train)*100))

Accuracy Score: 57.94407894736842 %


In [25]:
# Confusion Matrix for training data

from sklearn.metrics import confusion_matrix
confusion_matrix(data_train.target,yhat_train)

array([[1796, 1655],
       [ 902, 1727]], dtype=int64)

.

In [26]:
# Using MultinomialNB Classifier

from sklearn.naive_bayes import MultinomialNB
mnb = MultinomialNB(alpha = 0.75)

# Fitting the model to train data set
mnb.fit(train_tfidf, data_train.target)

MultinomialNB(alpha=0.75)

In [27]:
# Predicting the target using model

yhat2 = mnb.predict(test_tfidf)

In [28]:
# Checking Testing data accuracy score

from sklearn.metrics import accuracy_score
print('Testing Accuracy Score: {} %'.format(accuracy_score(data_test.target,yhat2)*100))

Testing Accuracy Score: 57.89473684210527 %


In [29]:
# Confusion Matrix for testing data

from sklearn.metrics import confusion_matrix
confusion_matrix(data_test.target,yhat2)

array([[880,   0],
       [640,   0]], dtype=int64)

In [30]:
# Checking Training data accuracy score for overfitting

yhat2_train = mnb.predict(train_tfidf)

from sklearn.metrics import accuracy_score
print('Accuracy Score: {:0.02f} %'.format(accuracy_score(data_train.target,yhat2_train)*100))

Accuracy Score: 56.76 %


In [31]:
# Confusion Matrix for training data

from sklearn.metrics import confusion_matrix
confusion_matrix(data_train.target,yhat2_train)

array([[3451,    0],
       [2629,    0]], dtype=int64)

The Bernoulli and Multinomial Classifier gives the same accuracy for train-test data

In [32]:
# Model Performance
cm = pd.DataFrame(confusion_matrix(data_test.target,yhat),
                 columns=['Predicted Positive','Predicted Negative'],
                 index=['Actual Positive','Actual Negative'])

TP = cm.iloc[0,0]
TN = cm.iloc[1,1]
FP = cm.iloc[0,1]
FN = cm.iloc[1,0]

accuracy = (TP + TN) / float(TP + TN + FP + FN) ; print('accuracy : {:0.3f} %'.format(accuracy*100))
error = (FP + FN) / float(TP + TN + FP + FN) ; print('error : {:0.3f} %'.format(error*100))

precision = TP / float(TP + FP) ; print('precision : {:0.3f} %'.format(precision*100))
recall = TP / float(TP + FN) ; print('recall : {:0.3f} %'.format(recall*100))
specificity = TN / (TN + FP) ; print('specificity : {:0.3f} %'.format(specificity*100))

accuracy : 58.092 %
error : 41.908 %
precision : 51.477 %
recall : 68.326 %
specificity : 50.175 %
