In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')


In [4]:
df = pd.read_csv('train.csv')

df.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
# Importing libraries

import re
from nltk.stem.snowball import SnowballStemmer
from tqdm import tqdm

# Create an instance for SnowballStemmer
ss = SnowballStemmer('english')

In [6]:
# Defining a function to convert short words like couldn't to full word could not
def short_form(full_form):
    
    full_form = full_form.lower()      
    
    full_form = re.sub(r"won't", "will not", full_form)
    full_form = re.sub(r"wouldn't", "would not", full_form)
    full_form = re.sub(r"can't", "can not", full_form)
    full_form = re.sub(r"don't", "don not", full_form)
    full_form = re.sub(r"shouldn't", "should not", full_form)
    full_form = re.sub(r"couldn't", "could not", full_form)
    full_form = re.sub(r"\'re", " are", full_form)
    full_form = re.sub(r"\'s", " is", full_form)
    full_form = re.sub(r"\'d", " would", full_form)
    full_form = re.sub(r"\'ll", " will", full_form)
    full_form = re.sub(r"\'ve", " have", full_form)
    full_form = re.sub(r"\'m", " am", full_form)
  
    return full_form

# To remove URL
def url(ur):
    ur = re.sub(r"http\S+", '', ur)
    return ur

# Defining a function to remove punctuations, numbers, stopwords and get stem of words
def punc(pun):
    pun = re.sub('[^a-zA-Z]', ' ', pun)
    pun = pun.lower()
    #pun = pun.split()
    #pun = [ss.stem(sw) for sw in pun if sw not in stopwords.words('english')]
    #pun = ' '.join(pun)
    return pun

In [7]:
import nltk
nltk.download("stopwords")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Sandeep\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

# Working on data

In [8]:
import copy

In [9]:
dfe = copy.deepcopy(df)

In [10]:
from tqdm import tqdm

text_train_clean = []

for i, s in enumerate(tqdm(dfe['text'].values)):
    
    u = url(s)
    sf = short_form(u)
    pu = punc(sf)
    text_train_clean.append(pu)

100%|███████████████████████████████████| 7613/7613 [00:00<00:00, 21443.85it/s]


In [11]:
print('\n-> Preprocessed text data:\n')

dfe['text'] = text_train_clean


-> Preprocessed text data:



In [12]:
dfc = copy.deepcopy(dfe)

# dfc.to_csv('train_clean.csv', index = False)

In [14]:
# Import CountVectorizer library
from sklearn.feature_extraction.text import CountVectorizer

# Create an instance
# Bi-gram
cv_t = CountVectorizer(ngram_range = (1, 2))

# Fit and transform train data
tr_t_b = cv_t.fit_transform(dfe['text'])

In [15]:
# Import normalize library
from sklearn.preprocessing import normalize

# Normalize train data
tr_t_n = normalize(tr_t_b)

In [18]:
x_tr_e = tr_t_n

y = dfe['target']

# Naive Bayes

In [21]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import GridSearchCV

nbe = MultinomialNB()

param = {'alpha': [0.00001, 0.0001, 0.001, 0.01, 0.1, 0.15, 10, 100, 1000, 10000]}


clf_nbe = GridSearchCV(estimator = nbe, param_grid = param, scoring = 'accuracy', cv = 4)

clf_nbe.fit(x_tr_e, y)

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=MultinomialNB(alpha=1.0, class_prior=None,
                                     fit_prior=True),
             iid='warn', n_jobs=None,
             param_grid={'alpha': [1e-05, 0.0001, 0.001, 0.01, 0.1, 0.15, 10,
                                   100, 1000, 10000]},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='accuracy', verbose=0)

In [22]:
print('\n-> Best score:', clf_nbe.best_score_, '\n')
print('*'*50, '\n')

print('\n-> Best estimators:', clf_nbe.best_estimator_)


-> Best score: 0.7426769998686458 

************************************************** 


-> Best estimators: MultinomialNB(alpha=0.15, class_prior=None, fit_prior=True)


In [23]:
import pickle

cv = pickle.dump(cv_t, open('twt_transform.pkl', 'wb'))

clf = pickle.dump(clf_nbe, open('twt_nb_model.pkl', 'wb'))