In [1]:
# Bag Of Words:

# We define a fixed length vector where each entry corresponds to a word in our pre-defined dictionary of words. The size of the vector equals the size of the dictionary. 
# Then, for representing a text using this vector, we count how many times each word of our dictionary appears in the text and we put this number in the corresponding vector entry.




# Tf-IDF:

# Term frequency-inverse document frequency (TF-IDF) scores a word by how frequently it occurs in a document, scaled down by how common the word is across the whole corpus, so that the score reflects the word's importance to that document.




# Word to Vector:

# Converting words to vectors, or word vectorization, is a natural language processing (NLP) process. 
# The process uses language models to map words into a vector space. A vector space represents each word by a vector of real numbers. It also allows words with similar meanings to have similar representations.




# Count Vectorizer: 

# It will fit and learn the word vocabulary and try to create a document term matrix in which the individual cells denote the frequency of that word in a particular document, which is also known as term frequency, and the columns are dedicated to each word in the corpus.




# Transfer Learning:

# It reuses a prebuilt (pre-trained) model on your own data and often gives pretty good results.

In [1]:
# https://medium.com/analytics-vidhya/nlp-tutorial-for-text-classification-in-python-8f19cd17b49e

In [125]:
# For data
import pandas as pd
import numpy as np
import json

# For Plotting
import plotly.express as px
import plotly.graph_objects as go

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use('ggplot')
from matplotlib.pyplot import figure
matplotlib.rcParams['figure.figsize'] = (22,10)
plt.rcParams.update({'font.size': 18})

import seaborn as sns 
sns.set_style('darkgrid')

# For processing
import re, string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages used by this notebook; packages that are already
# present locally are only reported as up-to-date (see the log output of this cell).
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score


# For explainer
from lime import lime_text

# For word embedding
import gensim
import gensim.downloader as gensim_api

# For deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

# For bert language model
import transformers

[nltk_data] Downloading package punkt to /Users/sammy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sammy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/sammy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [126]:
# Load the pre-processed training set (path is relative to the working directory).
DATASET_CSV = 'preprocessed_clean_train_set.csv'
data = pd.read_csv(DATASET_CSV)

# Summarise the frame: number of rows, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7503 entries, 0 to 7502
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   keyword                    7447 non-null   object 
 1   location                   5021 non-null   object 
 2   text                       7503 non-null   object 
 3   target                     7503 non-null   int64  
 4   total_words                7503 non-null   int64  
 5   char_count                 7503 non-null   int64  
 6   sentence_count             7503 non-null   int64  
 7   avg_word_length            7503 non-null   float64
 8   avg_sentence_lenght        7503 non-null   float64
 9   tokenized_text             7503 non-null   object 
 10  clean_text                 7460 non-null   object 
 11  clean_total_words          7503 non-null   int64  
 12  clean_char_count           7503 non-null   int64  
 13  clean_sentence_count       7503 non-null   int64

In [127]:
# Missing values per column: absolute count and share of rows (in percent).
# Only columns whose missing share exceeds MIN_MISSING_PERCENT are listed.
# NOTE: the threshold is on the 0-100 scale, so 0.01 means 0.01 % (not 1 %);
# columns with well under 1 % missing (e.g. keyword) are therefore still shown.
MIN_MISSING_PERCENT = 0.01

total = data.isnull().sum().sort_values(ascending=False)
# Mean of the boolean null-mask == missing count / row count (kept as a fraction).
percent = data.isnull().mean().sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
missing_data['Missing Percent'] = missing_data['Missing Percent'] * 100

# head(152) keeps the display cap that the original cell applied.
missing_data.loc[missing_data['Missing Percent'] > MIN_MISSING_PERCENT].head(152)

Unnamed: 0,Total,Missing Percent
location,2482,33.080101
keyword,56,0.746368
clean_text,43,0.573104


In [128]:
# List the rows whose clean_text repeats the clean_text of an earlier row
# (the first occurrence of each text is not flagged). Rows with a missing
# clean_text show up here too. This cell only displays them; nothing is removed.
is_repeat = data.duplicated(subset="clean_text")
data.loc[is_repeat]

Unnamed: 0,keyword,location,text,target,total_words,char_count,sentence_count,avg_word_length,avg_sentence_lenght,tokenized_text,clean_text,clean_total_words,clean_char_count,clean_sentence_count,clean_avg_word_length,clean_avg_sentence_lenght
20,,,this is ridiculous....,0,3,20,5,6.666667,0.6,"['this', 'is', 'ridiculous', '...']",,1,0,1,0.000000,1.0
24,,,LOOOOOOL,0,1,8,1,8.000000,1.0,['LOOOOOOL'],,1,0,1,0.000000,1.0
89,accident,,???? it was an accident http://t.co/Oia5fxi4gM,0,6,41,2,6.833333,3.0,"['?', '?', '?', 'it', 'was', 'an', 'accident',...",accident,1,8,1,8.000000,1.0
93,accident,Alberta | Sask. | Montana,Suffield Alberta Accident https://t.co/bPTmlF4P10,1,4,46,2,11.500000,2.0,"['Suffield', 'Alberta', 'Accident', 'https://t...",accident,1,8,1,8.000000,1.0
97,accident,"Gloucestershire , UK",@flowri were you marinading it or was it an ac...,0,10,44,1,4.400000,10.0,"['@flowri', 'were', 'you', 'marinading', 'it',...",accident,1,8,1,8.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7492,,,#??? #?? #??? #??? MH370: Aircraft debris foun...,1,18,105,5,5.833333,3.6,"['#', '?', '?', '?', '#', '?', '?', '#', '?', ...",aircraft debris find la reunion miss malaysia ...,8,47,1,5.875000,8.0
7495,,,#breaking #LA Refugio oil spill may have been ...,1,13,87,2,6.692308,6.5,"['#breaking', '#LA', 'Refugio', 'oil', 'spill'...",refugio oil spill costlier big project,6,33,1,5.500000,6.0
7498,,,#WorldNews Fallen powerlines on G:link tram: U...,1,19,118,5,6.210526,3.8,"['#WorldNews', 'Fallen', 'powerlines', 'on', '...",fall pipeline link tram update fire crew evacu...,9,51,1,5.666667,9.0
7501,,,Two giant cranes holding a bridge collapse int...,1,11,73,2,6.636364,5.5,"['Two', 'giant', 'cranes', 'holding', 'a', 'br...",giant crane hold bridge collapse nearby home,7,38,1,5.428571,7.0


In [129]:
# Drop only the rows whose clean_text is missing; gaps in the other columns
# (e.g. keyword, location) are deliberately left in place.
# Reassigning (instead of inplace=True) avoids mutating the frame in place.
# The index is not reset, so the index labels keep gaps where rows were removed.
data = data.dropna(axis=0, subset=['clean_text'])

In [130]:
# Re-run the missing-value report after the rows without clean_text were dropped.
# Only columns whose missing share exceeds MIN_MISSING_PERCENT are listed.
# NOTE: the threshold is on the 0-100 scale, so 0.01 means 0.01 % (not 1 %).
MIN_MISSING_PERCENT = 0.01

total = data.isnull().sum().sort_values(ascending=False)
# Mean of the boolean null-mask == missing count / row count (kept as a fraction).
percent = data.isnull().mean().sort_values(ascending=False)

missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
missing_data['Missing Percent'] = missing_data['Missing Percent'] * 100

# head(152) keeps the display cap that the original cell applied.
missing_data.loc[missing_data['Missing Percent'] > MIN_MISSING_PERCENT].head(152)

Unnamed: 0,Total,Missing Percent
location,2458,32.949062
keyword,53,0.710456


In [131]:
# Ordered (unshuffled) hold-out split: the first TRAIN_FRACTION of the rows is
# used for training, the remainder for validation.
TRAIN_FRACTION = 0.75
# 5595 for the 7460 rows that remain once rows without clean_text are dropped.
split_at = int(len(data) * TRAIN_FRACTION)

# .iloc makes the positional slicing explicit: the index labels have gaps after
# the dropna, so label positions and row positions no longer coincide.
model_columns = ['clean_text', 'target']
train = data[model_columns].iloc[:split_at]
validation = data[model_columns].iloc[split_at:]

# NOTE(review): clean_text contains repeated texts (see the duplicate check), and
# identical texts can end up on both sides of this split, which inflates
# validation scores. Consider de-duplicating before splitting.

In [132]:
# Preview the training split (pandas truncates the display of long frames).
train

Unnamed: 0,clean_text,target
0,reason allah forgive,1
1,forest fire near la range canada,1
2,resident ask shelter place officer evacuation ...,1
3,people evacuation order california,1
4,get send photo smoke school,1
...,...,...
5625,video pick body water rescuer search hundred m...,1
5626,fear miss migrant med rescuer search survivor ...,1
5627,rescuer search hundred migrant mediterranean boat,1
5628,desire watch rescuer,0


In [133]:
# Preview the validation split (pandas truncates the display of long frames).
validation

Unnamed: 0,clean_text,target
5630,channel spa dog rescuer door,0
5631,video pick body water rescuer search hundred m...,1
5632,woman gas app guide rescuer injure county,1
5633,video pick body water rescuer search hundred m...,1
5634,video pick body water rescuer search hundred m...,1
...,...,...
7498,fall pipeline link tram update fire crew evacu...,1
7499,flip bomb evacuate stay blow,1
7500,suicide bomber kill saudi security site mosque...,1
7501,giant crane hold bridge collapse nearby home,1


In [113]:
# # splitting the data into a train and validation

# X_train, X_validation, y_train, y_validation = train_test_split(X, 
#                                                                 y, 
#                                                                 test_size = 0.25, 
#                                                                 shuffle = True,
#                                                                 random_state = 42)

In [134]:
import keras
from keras import layers
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Conv2D, MaxPool2D, Dropout, Flatten, BatchNormalization
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.metrics import confusion_matrix, classification_report
from keras import regularizers

In [135]:
import tensorflow_hub as hub 
import tensorflow as tf 

In [136]:
# Load the ELMo module from TensorFlow Hub and keep its "default" signature
# as the callable used by the rest of the notebook.
elmo_module = hub.load("https://tfhub.dev/google/elmo/3")
elmo = elmo_module.signatures["default"]

In [137]:
# Smoke test on one arbitrary example sentence.
x = ["Roasted ants are a popular snack in Columbia"]

# Run the sentence through ELMo and pick the "elmo" entry of the returned outputs.
elmo_outputs = elmo(tf.constant(x))
embeddings = elmo_outputs["elmo"]

# Inspect the shape of the embedding tensor.
embeddings.shape

TensorShape([Dimension(1), Dimension(8), Dimension(1024)])

In [160]:
def elmo_vectors(x):
    """Embed a collection of texts with ELMo and average the features over axis 1.

    Args:
        x: object exposing ``to_list()`` that yields the input strings
            (the notebook passes a ``clean_text`` column).

    Returns:
        The evaluated result of ``tf.reduce_mean(<elmo output>, 1)``.
        Presumably one 1024-dimensional vector per input text, judging by the
        (1, 8, 1024) shape printed for the sample sentence -- TODO confirm.

    Note:
        Every call opens a fresh ``tf.compat.v1.Session`` and runs the global
        variable and table initializers before evaluating the result.
    """
    sentences = tf.constant(x.to_list())
    token_features = elmo(sentences)["elmo"]

    with tf.compat.v1.Session() as session:
        init_ops = (
            tf.compat.v1.global_variables_initializer(),
            tf.compat.v1.tables_initializer(),
        )
        # Run the initializers one after the other, variables first.
        for init_op in init_ops:
            session.run(init_op)

        # Collapse axis 1 by averaging, then evaluate.
        averaged = tf.reduce_mean(token_features, 1)
        return session.run(averaged)

In [161]:
# Cut both splits into consecutive chunks of at most 100 rows each
# (the last chunk of a split may be shorter).
chunk_rows = 100
train_starts = range(0, train.shape[0], chunk_rows)
validation_starts = range(0, validation.shape[0], chunk_rows)

list_train = [train.iloc[start:start + chunk_rows] for start in train_starts]
list_validation = [validation.iloc[start:start + chunk_rows] for start in validation_starts]

In [None]:
# Extract ELMo embeddings batch by batch.
# Each call must receive the current batch, not the whole split: passing the
# full `train` / `validation` column on every iteration would embed every row
# once per batch and produce (number of batches) x (number of rows) vectors.
elmo_train = [elmo_vectors(batch['clean_text']) for batch in list_train]
elmo_validation = [elmo_vectors(batch['clean_text']) for batch in list_validation]

2022-07-05 17:04:26.600636: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 3100 num_cores: 8 environment { key: "cpu_instruction_set" value: "SSE, SSE2, SSE3, SSSE3, SSE4.1, SSE4.2" } environment { key: "eigen" value: "3.4.90" } l1_cache_size: 32768 l2_cache_size: 262144 l3_cache_size: 8388608 memory_size: 268435456 } outputs { dtype: DT_FLOAT shape { unknown_rank: true } }
2022-07-05 17:04:26.876456: W tensorflow/core/grappler/costs/op_level_cost_estimator.cc:690] Error in PredictCost() for the op: op: "Softmax" attr { key: "T" value { type: DT_FLOAT } } inputs { dtype: DT_FLOAT shape { unknown_rank: true } } device { type: "CPU" vendor: "GenuineIntel" model: "110" frequency: 3100 num_cores: 8 environment { key: "cpu_instruction_set" value: "SSE, SSE2, SS

In [None]:
# Stitch the per-batch arrays back together along the first axis
# (np.concatenate joins along axis 0 by default), one array per split.
elmo_X_train = np.concatenate(elmo_train)
elmo_X_validation = np.concatenate(elmo_validation)

# We still need to append the target column from both the train and validation dataframes.


# Both the elmo_X_train and elmo_X_validation need their target columns

In [None]:
# Saving these arrays because of the long time it took to get the ELMo vectors.
# They are stored as pickle files so the extraction does not have to be re-run.
import pickle  # no earlier import cell brings pickle into scope

# `with` closes each file even if pickle.dump raises.

# save elmo_X_train
with open("elmo_X_train.pickle", "wb") as pickle_out:
    pickle.dump(elmo_X_train, pickle_out)

# save elmo_X_validation
with open("elmo_X_validation.pickle", "wb") as pickle_out:
    pickle.dump(elmo_X_validation, pickle_out)

In [None]:
# # We can use the following code to load them back

# # load elmo_train_new
# pickle_in = open("elmo_X_train.pickle", "rb")
# elmo_train_new = pickle.load(pickle_in)

# # load elmo_test_new
# pickle_in = open("elmo_X_validation.pickle", "rb")
# elmo_test_new = pickle.load(pickle_in)