In [1]:
import re
import nltk
import pandas as pd
import numpy as np
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier

In [2]:
train_dataset = pd.read_csv('../Datasets/train.csv')
test_dataset = pd.read_csv('../Datasets/test.csv')

In [3]:
train_dataset.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train_dataset.drop(['keyword','id','location'], axis=1, inplace=True)

In [5]:
train_dataset.head()

Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
stopwords  = stopwords.words('English')

In [7]:
ps = PorterStemmer()
lm = WordNetLemmatizer()

# Data Cleaning

In [8]:
#train_dataset.dropna(inplace=True)

In [9]:
#train_dataset.isnull().sum()

In [10]:
def cleanData(value):
  value = re.sub("[^a-zA-Z]", " ", value)
  value = re.sub(" +", " ", value)
  return value

In [11]:
def stemSentences(sentence):
  sentence = cleanData(sentence)
  wordsList = list()
  sentence = sentence.lower()
  wordsList = sentence.split(" ")
  stemmedWords = [ps.stem(x) for x in wordsList if not x in stopwords]
  return " ".join(stemmedWords)

In [12]:
def lemamtizeSentence(sentence):
  sentence = cleanData(sentence)
  wordsList = list()
  sentence = sentence.lower()
  wordsList = sentence.split(" ")
  stemmedWords = [lm.lemmatize(x) for x in wordsList if not x in stopwords]
  return " ".join(stemmedWords)

In [13]:
def stemDataset(dataset):
  dataset['Stemmed Text'] = dataset['text'].apply(stemSentences)
  dataset['Lemmatized Text'] = dataset['text'].apply(lemamtizeSentence)
  return dataset

In [14]:
dataset = stemDataset(train_dataset)

In [15]:
dataset.head()

Unnamed: 0,text,target,Stemmed Text,Lemmatized Text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us,deed reason earthquake may allah forgive u
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...,resident asked shelter place notified officer ...
3,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california,people receive wildfire evacuation order cali...
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...,got sent photo ruby alaska smoke wildfire pour...


## Word2Vec 

In [16]:
import gensim

In [17]:
dataset.head()

Unnamed: 0,text,target,Stemmed Text,Lemmatized Text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us,deed reason earthquake may allah forgive u
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...,resident asked shelter place notified officer ...
3,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california,people receive wildfire evacuation order cali...
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...,got sent photo ruby alaska smoke wildfire pour...


In [18]:
w2v_data = dataset['text'].apply(gensim.utils.simple_preprocess)
w2v_data

0       [our, deeds, are, the, reason, of, this, earth...
1           [forest, fire, near, la, ronge, sask, canada]
2       [all, residents, asked, to, shelter, in, place...
3       [people, receive, wildfires, evacuation, order...
4       [just, got, sent, this, photo, from, ruby, ala...
                              ...                        
7608    [two, giant, cranes, holding, bridge, collapse...
7609    [aria_ahrary, thetawniest, the, out, of, contr...
7610    [utc, km, of, volcano, hawaii, http, co, zdtoy...
7611    [police, investigating, after, an, bike, colli...
7612    [the, latest, more, homes, razed, by, northern...
Name: text, Length: 7613, dtype: object

In [19]:
dataset['W2V text'] = w2v_data.apply(lambda x : " ".join(x))

In [20]:
dataset.head()

Unnamed: 0,text,target,Stemmed Text,Lemmatized Text,W2V text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us,deed reason earthquake may allah forgive u,our deeds are the reason of this earthquake ma...
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...,resident asked shelter place notified officer ...,all residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california,people receive wildfire evacuation order cali...,people receive wildfires evacuation orders in ...
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...,got sent photo ruby alaska smoke wildfire pour...,just got sent this photo from ruby alaska as s...


# Model Creation

## 1.) TFIDF - Naive Bayes - On Lemmatized Text

In [16]:
tfidf = TfidfVectorizer(ngram_range=(1,1))
tokenized = tfidf.fit_transform(dataset['Lemmatized Text']).toarray()
tokenized

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [17]:
#X_tfidf = tokenized.copy()
#y_tfidf = train_dataset['target']

In [18]:
X_train, X_Val, y_train, y_val  = train_test_split(tokenized, train_dataset['target'], test_size=.3, random_state=42)

In [19]:
mnb_tfidf = MultinomialNB(alpha=1)
mnb_tfidf.fit(X_train, y_train)

train_pred = mnb_tfidf.predict(X_train)
val_pred = mnb_tfidf.predict(X_Val)

print('On Training Data')
print(accuracy_score(y_train, train_pred))
print(classification_report(y_train, train_pred))
print(confusion_matrix(y_train, train_pred))

print('============================================')
print()

print('On Validation Data')
print(accuracy_score(y_val, val_pred))
print(classification_report(y_val, val_pred))
print(confusion_matrix(y_val, val_pred))

On Training Data
0.9097391630699944
              precision    recall  f1-score   support

           0       0.88      0.98      0.92      3024
           1       0.97      0.82      0.89      2305

    accuracy                           0.91      5329
   macro avg       0.92      0.90      0.91      5329
weighted avg       0.92      0.91      0.91      5329

[[2963   61]
 [ 420 1885]]

On Validation Data
0.7985989492119089
              precision    recall  f1-score   support

           0       0.78      0.91      0.84      1318
           1       0.84      0.65      0.73       966

    accuracy                           0.80      2284
   macro avg       0.81      0.78      0.78      2284
weighted avg       0.80      0.80      0.79      2284

[[1200  118]
 [ 342  624]]


## 2.) Count Vectorizer - Naive Bayes - On gensim processed text 

In [25]:
bow = CountVectorizer(ngram_range = (1,1), binary=True, analyzer='word')
tokenized = bow.fit_transform(dataset['W2V text']).toarray()
tokenized

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [21]:
#bow.get_feature_names()

In [22]:
#X_bow = tokenized.copy()
#y_bow = train_dataset['target']

In [26]:
X_train_bow, X_Val_bow, y_train_bow, y_val_bow  = train_test_split(tokenized, train_dataset['target'], test_size=.3, random_state=42)

In [22]:
X_train_bow.shape

(5329, 22051)

In [27]:
mnb_bow = MultinomialNB(alpha=1)
mnb_bow.fit(X_train_bow, y_train_bow)

train_pred = mnb_bow.predict(X_train_bow)
val_pred = mnb_bow.predict(X_Val_bow)

print('On Training Data')
print(accuracy_score(y_train_bow, train_pred))
print(classification_report(y_train_bow, train_pred))
print(confusion_matrix(y_train_bow, train_pred))

print('============================================')
print()

print('On Validation Data')
print(accuracy_score(y_val_bow, val_pred))
print(classification_report(y_val_bow, val_pred))
print(confusion_matrix(y_val_bow, val_pred))

On Training Data
0.9172452617751924
              precision    recall  f1-score   support

           0       0.90      0.97      0.93      3024
           1       0.95      0.85      0.90      2305

    accuracy                           0.92      5329
   macro avg       0.92      0.91      0.91      5329
weighted avg       0.92      0.92      0.92      5329

[[2925   99]
 [ 342 1963]]

On Validation Data
0.808231173380035
              precision    recall  f1-score   support

           0       0.81      0.87      0.84      1318
           1       0.80      0.73      0.76       966

    accuracy                           0.81      2284
   macro avg       0.81      0.80      0.80      2284
weighted avg       0.81      0.81      0.81      2284

[[1142  176]
 [ 262  704]]


## 3.) TFIDF - Naive Bayes - On gensim processed text 

In [54]:
# w2v = Word2Vec(window=10, min_count=2, workers=10)
# w2v.build_vocab(w2v_data, progress_per=1000)

In [42]:
tfidf_w2v = TfidfVectorizer(ngram_range=(1,1))
vectorized_w2v_data = tfidf_w2v.fit_transform(dataset['W2V text']).toarray()
vectorized_w2v_data

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [43]:
X_train_w2v, X_Val_w2v, y_train_w2v, y_val_w2v  = train_test_split(vectorized_w2v_data, train_dataset['target'], test_size=.3, random_state=42)

In [44]:
mnb_w2v = MultinomialNB(alpha=1)
mnb_w2v.fit(X_train_w2v, y_train_w2v)

train_pred = mnb_w2v.predict(X_train_w2v)
val_pred = mnb_w2v.predict(X_Val_w2v)

print('On Training Data')
print(accuracy_score(y_train_w2v, train_pred))
print(classification_report(y_train_w2v, train_pred))
print(confusion_matrix(y_train_w2v, train_pred))

print('============================================')
print()

print('On Validation Data')
print(accuracy_score(y_val_w2v, val_pred))
print(classification_report(y_val_w2v, val_pred))
print(confusion_matrix(y_val_w2v, val_pred))

On Training Data
0.8911615687746294
              precision    recall  f1-score   support

           0       0.85      0.99      0.91      3024
           1       0.98      0.77      0.86      2305

    accuracy                           0.89      5329
   macro avg       0.91      0.88      0.89      5329
weighted avg       0.90      0.89      0.89      5329

[[2980   44]
 [ 536 1769]]

On Validation Data
0.8029772329246935
              precision    recall  f1-score   support

           0       0.77      0.94      0.85      1318
           1       0.88      0.62      0.73       966

    accuracy                           0.80      2284
   macro avg       0.82      0.78      0.79      2284
weighted avg       0.82      0.80      0.80      2284

[[1236   82]
 [ 368  598]]


## 4.) Count Vectorizer - XGBoost Classifier - On gensim processed text 

In [23]:
from xgboost import XGBClassifier

In [24]:
xgb = XGBClassifier()
xgb.fit(X_train_bow, y_train_bow)

train_pred = xgb.predict(X_train_bow)
val_pred = xgb.predict(X_Val_bow)

print('On Training Data')
print(accuracy_score(y_train_bow, train_pred))
print(classification_report(y_train_bow, train_pred))
print(confusion_matrix(y_train_bow, train_pred))

print('============================================')
print()

print('On Validation Data')
print(accuracy_score(y_val_bow, val_pred))
print(classification_report(y_val_bow, val_pred))
print(confusion_matrix(y_val_bow, val_pred))

#### Hyper-parameter tuning xgb model

In [25]:
from sklearn.model_selection import KFold, RepeatedStratifiedKFold

In [26]:
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=42)

In [27]:
xgb.get_params()

{'objective': 'binary:logistic',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'enable_categorical': False,
 'gamma': None,
 'gpu_id': None,
 'importance_type': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'predictor': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [27]:
params_grid = {
 'colsample_bylevel': [0.5],
 'colsample_bytree': [0.5],
 'learning_rate': [0.05],
 'max_depth': [11],
 'min_child_weight': [5],
 'n_estimators': [50],
 'n_jobs': [6],
 'random_state': [42],
 'reg_alpha': [0],
 'reg_lambda': [1],
 'subsample': [0.8],
}

In [76]:
sklearn.metrics.SCORERS.keys()

dict_keys(['explained_variance', 'r2', 'max_error', 'neg_median_absolute_error', 'neg_mean_absolute_error', 'neg_mean_absolute_percentage_error', 'neg_mean_squared_error', 'neg_mean_squared_log_error', 'neg_root_mean_squared_error', 'neg_mean_poisson_deviance', 'neg_mean_gamma_deviance', 'accuracy', 'top_k_accuracy', 'roc_auc', 'roc_auc_ovr', 'roc_auc_ovo', 'roc_auc_ovr_weighted', 'roc_auc_ovo_weighted', 'balanced_accuracy', 'average_precision', 'neg_log_loss', 'neg_brier_score', 'adjusted_rand_score', 'rand_score', 'homogeneity_score', 'completeness_score', 'v_measure_score', 'mutual_info_score', 'adjusted_mutual_info_score', 'normalized_mutual_info_score', 'fowlkes_mallows_score', 'precision', 'precision_macro', 'precision_micro', 'precision_samples', 'precision_weighted', 'recall', 'recall_macro', 'recall_micro', 'recall_samples', 'recall_weighted', 'f1', 'f1_macro', 'f1_micro', 'f1_samples', 'f1_weighted', 'jaccard', 'jaccard_macro', 'jaccard_micro', 'jaccard_samples', 'jaccard_wei

In [29]:
grid_search = GridSearchCV(estimator=xgb, param_grid=params_grid, scoring='accuracy', n_jobs=10, cv=cv)

In [30]:
grid_search.fit(X_train_bow, y_train_bow)

train_pred = grid_search.predict(X_train_bow)
val_pred = grid_search.predict(X_Val_bow)

print('On Training Data')
print(accuracy_score(y_train_bow, train_pred))
print(classification_report(y_train_bow, train_pred))
print(confusion_matrix(y_train_bow, train_pred))

print('============================================')
print()

print('On Validation Data')
print(accuracy_score(y_val_bow, val_pred))
print(classification_report(y_val_bow, val_pred))
print(confusion_matrix(y_val_bow, val_pred))

## 3.) Logistic Regression

In [21]:
dataset.head()

Unnamed: 0,text,target,Stemmed Text,Lemmatized Text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us,deed reason earthquake may allah forgive u
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...,resident asked shelter place notified officer ...
3,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california,people receive wildfire evacuation order cali...
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...,got sent photo ruby alaska smoke wildfire pour...


In [22]:
X_logReg = dataset['Lemmatized Text']
y_logReg = dataset['target']

In [23]:
tfidf_logReg = TfidfVectorizer()
X_logReg_vectorized = tfidf_logReg.fit_transform(X_logReg).toarray()

In [24]:
X_train_logReg , X_Val_logReg, y_train_logReg , y_Val_logReg = train_test_split(X_logReg_vectorized, y_logReg, test_size=.2, random_state=42)

In [25]:
logReg = LogisticRegression(solver='sag', C=5)

In [26]:
cv_loss = np.mean(cross_val_score(logReg, X_train_logReg, y_train_logReg, cv=10, scoring='neg_log_loss'))
print('CV Log_loss score is {}'.format(cv_loss))

cv_score = np.mean(cross_val_score(logReg, X_train_logReg, y_train_logReg, cv=10, scoring='accuracy'))
print('CV Accuracy score is {}'.format(cv_score))

CV Log_loss score is -0.44988765057825


In [None]:
logReg.fit(X_train_logReg, y_train_logReg)

y_pred_val = logReg.predict(X_Val_logReg)
y_pred_val_prob = logReg.predict_proba(X_Val_logReg)
auc_score = roc_auc_score(y_Val_logReg, y_pred_val)

print("CV ROC_AUC score {}".format(auc_score))
print('Accuracy: ', accuracy_score(y_Val_logReg, y_pred_val))
    
print(confusion_matrix(y_Val_logReg, y_pred_val))
print(classification_report(y_Val_logReg, y_pred_val))

CV ROC_AUC score 0.7708823290892872

Accuracy:  0.7852921864740644
[[759 115]
 [212 437]]
              precision    recall  f1-score   support

           0       0.78      0.87      0.82       874
           1       0.79      0.67      0.73       649

    accuracy                           0.79      1523
   macro avg       0.79      0.77      0.78      1523
weighted avg       0.79      0.79      0.78      1523



# Using LSTM

In [93]:
dataset.head()

Unnamed: 0,text,target,Stemmed Text,Lemmatized Text,W2V text
0,Our Deeds are the Reason of this #earthquake M...,1,deed reason earthquak may allah forgiv us,deed reason earthquake may allah forgive u,our deeds are the reason of this earthquake ma...
1,Forest fire near La Ronge Sask. Canada,1,forest fire near la rong sask canada,forest fire near la ronge sask canada,forest fire near la ronge sask canada
2,All residents asked to 'shelter in place' are ...,1,resid ask shelter place notifi offic evacu she...,resident asked shelter place notified officer ...,all residents asked to shelter in place are be...
3,"13,000 people receive #wildfires evacuation or...",1,peopl receiv wildfir evacu order california,people receive wildfire evacuation order cali...,people receive wildfires evacuation orders in ...
4,Just got sent this photo from Ruby #Alaska as ...,1,got sent photo rubi alaska smoke wildfir pour ...,got sent photo ruby alaska smoke wildfire pour...,just got sent this photo from ruby alaska as s...


In [94]:
dataset.shape

(7613, 5)

In [366]:
dataset['Lemmatized Text']

0              deed reason earthquake may allah forgive u
1                   forest fire near la ronge sask canada
2       resident asked shelter place notified officer ...
3        people receive wildfire evacuation order cali...
4       got sent photo ruby alaska smoke wildfire pour...
                              ...                        
7608    two giant crane holding bridge collapse nearby...
7609     aria ahrary thetawniest control wild fire cal...
7610             utc km volcano hawaii http co zdtoyd ebj
7611    police investigating e bike collided car littl...
7612    latest home razed northern california wildfire...
Name: Lemmatized Text, Length: 7613, dtype: object

In [367]:
dataset['W2V text']

0       our deeds are the reason of this earthquake ma...
1                   forest fire near la ronge sask canada
2       all residents asked to shelter in place are be...
3       people receive wildfires evacuation orders in ...
4       just got sent this photo from ruby alaska as s...
                              ...                        
7608    two giant cranes holding bridge collapse into ...
7609    aria_ahrary thetawniest the out of control wil...
7610          utc km of volcano hawaii http co zdtoyd ebj
7611    police investigating after an bike collided wi...
7612    the latest more homes razed by northern califo...
Name: W2V text, Length: 7613, dtype: object

### Getting max len of sentences

In [96]:
def maxWordList(sentence):
  wordList = sentence.split()
  count = len(wordList)
  return count

In [97]:
counts = dataset['W2V text'].apply(maxWordList)

In [98]:
max(counts)

30

## LSTM - START

In [721]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional

In [722]:
X_LSTM = dataset['W2V text']
y_LSTM = dataset['target']

In [723]:
voc_size = 15000

### One_hot encoding

In [724]:
onehot_repr=[one_hot(sentence,voc_size)for sentence in X_LSTM] 
#onehot_repr

### Embedding Representation


In [725]:
sent_length=35

In [726]:
embedded_docs = pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

[[    0     0     0 ... 12580  1570   435]
 [    0     0     0 ... 11933   313  7896]
 [    0     0     0 ...  2735 11522  1851]
 ...
 [    0     0     0 ...  5289  5597  4642]
 [    0     0     0 ...  4729  7903  2873]
 [    0     0     0 ...  5289  4842  6940]]


In [727]:
embedded_docs[0]

array([    0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0,     0,     0,     0,     0,     0,
           0,     0,     0,     0, 11360,  2333, 11522,  2960, 11826,
       14741,  4732,  7349,  8534, 12952, 12580,  1570,   435])

### Creating model

In [728]:
embedding_vector_features=500

In [729]:
# model=Sequential()
# model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
# model.add(LSTM(100))
# model.add(Dense(1,activation='sigmoid'))
# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# print(model.summary())

# model=Sequential()
# model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
# model.add(Dropout(0.3))
# model.add(LSTM(100))
# model.add(Dropout(0.3))
# model.add(Dense(1,activation='sigmoid'))
# model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])

model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features,input_length=sent_length))
model.add(Bidirectional(LSTM(100)))
model.add(Dropout(0.3))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
#print(model1.summary())

In [730]:
len(embedded_docs),y_LSTM.shape

(7613, (7613,))

In [731]:
X_LSTM_final = np.array(embedded_docs)
y_LSTM_final = np.array(y_LSTM)

In [732]:
X_LSTM_final.shape, y_LSTM_final.shape

((7613, 35), (7613,))

In [733]:
X_train_LSTM, X_test_LSTM, y_train_LSTM, y_test_LSTM = train_test_split(X_LSTM_final, y_LSTM_final, test_size=.3, random_state=42)

### Model training

In [734]:
model.fit(X_train_LSTM,y_train_LSTM,epochs=5,batch_size=64)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x1c995898b50>

In [735]:
print('Train Data')
model.evaluate(X_train_LSTM, y_train_LSTM)
print()
print('Validation Data')
model.evaluate(X_test_LSTM, y_test_LSTM)

Train Data

Validation Data


[0.9629634022712708, 0.7548161149024963]

In [736]:
y_pred=model.predict(X_test_LSTM)
classes_x=np.argmax(y_pred,axis=1)

print(classification_report(y_test_LSTM, classes_x))
print(accuracy_score(y_test_LSTM, classes_x))

              precision    recall  f1-score   support

           0       0.58      1.00      0.73      1318
           1       0.00      0.00      0.00       966

    accuracy                           0.58      2284
   macro avg       0.29      0.50      0.37      2284
weighted avg       0.33      0.58      0.42      2284

0.5770577933450087


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [737]:
pn

NameError: name 'pn' is not defined

# Testing the model's performance

In [None]:
test_dataset.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
def processTestData(dataset, vectorizer):
  #dataset.drop(['id','keyword','location'], axis=1, inplace=True)
  transformed_data = vectorizer.transform(dataset['text']).toarray()
  return transformed_data

In [None]:
def getPredictions(dataset, vectorizer, model):
  transformed_data = processTestData(dataset, vectorizer)
  test_pred = model.predict(transformed_data)
  return test_pred

In [None]:
def predict(dataset, vectorizer, model, filename):
  # Getting predictions on test data.
  test_pred = getPredictions(dataset, vectorizer, model)

  #Creating predictions dataframe.
  predictions_df = pd.DataFrame(test_pred, columns=['target'])
  predictions_df = pd.concat([test_dataset['id'], predictions_df], axis=1)

  #Saving the predictions.
  predictions_df.to_csv('../Datasets/{}.csv'.format(filename), index=False)

In [None]:
#Getting predictions for tfidf
predict(test_dataset, tfidf, mnb_tfidf, 'predictions_tfidf')

In [None]:
#Getting predictions for bow
#predict(test_dataset, bow, mnb_bow, 'predictions_bow')

In [None]:
#Getting predictions for bow with w2v data.
predict(test_dataset, bow, mnb_bow, 'predictions_bow_w2v')

# Saving Model

In [None]:
import joblib

In [None]:
joblib.dump(value=mnb_bow, filename='../Model/mnb_bow_w2v.pkl')

['../Model/mnb_bow_w2v.pkl']