In [1]:
# Bag Of Words:

# We define a fixed length vector where each entry corresponds to a word in our pre-defined dictionary of words. The size of the vector equals the size of the dictionary. 
# Then, for representing a text using this vector, we count how many times each word of our dictionary appears in the text and we put this number in the corresponding vector entry.




# TF-IDF:

# Term frequency-inverse document frequency (TF-IDF) scores how important a word is to a document: the score rises with how often the word occurs in that document and falls with how many documents in the corpus contain it.




# Word to Vector:

# Converting words to vectors, or word vectorization, is a natural language processing (NLP) process. 
# The process uses language models to map words into a vector space, where each word is represented by a vector of real numbers. This allows words with similar meanings to have similar representations.




# Count Vectorizer: 

# It learns the vocabulary of the corpus and builds a document-term matrix: each row is a document, each column is a word from the vocabulary, and each cell holds how many times that word occurs in that document (its term frequency).




# Transfer Learning:

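# It reuses a model that was pre-trained on a large corpus and applies it (optionally fine-tuned) to your own data, which usually gives good results without training a model from scratch.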

In [2]:
# https://medium.com/analytics-vidhya/nlp-tutorial-for-text-classification-in-python-8f19cd17b49e

In [3]:
# For data
import pandas as pd
import numpy as np
import json

# For Plotting
import plotly.express as px
import plotly.graph_objects as go

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline 
plt.style.use('ggplot')
from matplotlib.pyplot import figure
matplotlib.rcParams['figure.figsize'] = (22,10)
plt.rcParams.update({'font.size': 18})

import seaborn as sns 
sns.set_style('darkgrid')

# For processing
import re, string

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import SnowballStemmer
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages this notebook relies on. The cell output shows
# that a package which is already present is reported as up-to-date.
# NOTE(review): presumably these back the tokenizer, POS tagger and WordNet
# lemmatizer imported above - confirm against the cells that use them.
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')

from sklearn import feature_extraction, model_selection, naive_bayes, pipeline, manifold, preprocessing, feature_selection
from sklearn.model_selection import train_test_split
import sklearn.metrics as metrics
from sklearn.metrics import classification_report, f1_score, accuracy_score, confusion_matrix
from sklearn.metrics import roc_curve, auc, roc_auc_score


# For explainer
from lime import lime_text

# For word embedding
import gensim
import gensim.downloader as gensim_api

# For deep learning
from tensorflow.keras import models, layers, preprocessing as kprocessing
from tensorflow.keras import backend as K

# For bert language model
import transformers

[nltk_data] Downloading package punkt to /Users/sammy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/sammy/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/sammy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
# Read the pre-processed training set from the working directory
data = pd.read_csv(filepath_or_buffer = 'preprocessed_clean_train_set.csv')

# Summarise the frame: number of rows, column names, non-null counts and dtypes
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7503 entries, 0 to 7502
Data columns (total 16 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   keyword                    7447 non-null   object 
 1   location                   5021 non-null   object 
 2   text                       7503 non-null   object 
 3   target                     7503 non-null   int64  
 4   total_words                7503 non-null   int64  
 5   char_count                 7503 non-null   int64  
 6   sentence_count             7503 non-null   int64  
 7   avg_word_length            7503 non-null   float64
 8   avg_sentence_lenght        7503 non-null   float64
 9   tokenized_text             7503 non-null   object 
 10  clean_text                 7460 non-null   object 
 11  clean_total_words          7503 non-null   int64  
 12  clean_char_count           7503 non-null   int64  
 13  clean_sentence_count       7503 non-null   int64

In [5]:
# Missing values per column: absolute count and share of rows in percent.
# A column is listed only when more than 0.01% of its rows are missing
# (the threshold below is compared against a value already expressed in percent).

total = data.isnull().sum().sort_values(ascending=False)
percent = data.isnull().mean().sort_values(ascending=False)  # fraction of rows that are null
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
missing_data['Missing Percent'] = missing_data['Missing Percent'] * 100  # fraction -> percent
missing_data.loc[missing_data['Missing Percent'] > .01][:152]  # display is capped at 152 rows

Unnamed: 0,Total,Missing Percent
location,2482,33.080101
keyword,56,0.746368
clean_text,43,0.573104


In [6]:
# Show every row whose clean_text repeats an earlier row
# (the first occurrence of each text is not flagged)

data.loc[data.duplicated(subset = ['clean_text'])]

Unnamed: 0,keyword,location,text,target,total_words,char_count,sentence_count,avg_word_length,avg_sentence_lenght,tokenized_text,clean_text,clean_total_words,clean_char_count,clean_sentence_count,clean_avg_word_length,clean_avg_sentence_lenght
20,,,this is ridiculous....,0,3,20,5,6.666667,0.6,"['this', 'is', 'ridiculous', '...']",,1,0,1,0.000000,1.0
24,,,LOOOOOOL,0,1,8,1,8.000000,1.0,['LOOOOOOL'],,1,0,1,0.000000,1.0
89,accident,,???? it was an accident http://t.co/Oia5fxi4gM,0,6,41,2,6.833333,3.0,"['?', '?', '?', 'it', 'was', 'an', 'accident',...",accident,1,8,1,8.000000,1.0
93,accident,Alberta | Sask. | Montana,Suffield Alberta Accident https://t.co/bPTmlF4P10,1,4,46,2,11.500000,2.0,"['Suffield', 'Alberta', 'Accident', 'https://t...",accident,1,8,1,8.000000,1.0
97,accident,"Gloucestershire , UK",@flowri were you marinading it or was it an ac...,0,10,44,1,4.400000,10.0,"['@flowri', 'were', 'you', 'marinading', 'it',...",accident,1,8,1,8.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7492,,,#??? #?? #??? #??? MH370: Aircraft debris foun...,1,18,105,5,5.833333,3.6,"['#', '?', '?', '?', '#', '?', '?', '#', '?', ...",aircraft debris find la reunion miss malaysia ...,8,47,1,5.875000,8.0
7495,,,#breaking #LA Refugio oil spill may have been ...,1,13,87,2,6.692308,6.5,"['#breaking', '#LA', 'Refugio', 'oil', 'spill'...",refugio oil spill costlier big project,6,33,1,5.500000,6.0
7498,,,#WorldNews Fallen powerlines on G:link tram: U...,1,19,118,5,6.210526,3.8,"['#WorldNews', 'Fallen', 'powerlines', 'on', '...",fall pipeline link tram update fire crew evacu...,9,51,1,5.666667,9.0
7501,,,Two giant cranes holding a bridge collapse int...,1,11,73,2,6.636364,5.5,"['Two', 'giant', 'cranes', 'holding', 'a', 'br...",giant crane hold bridge collapse nearby home,7,38,1,5.428571,7.0


In [7]:
# Dropping only the rows whose clean_text is missing
# (missing values in other columns, e.g. location or keyword, are kept)

data = data.dropna(axis = 0, subset = ['clean_text'])

In [8]:
# Missing values per column: absolute count and share of rows in percent.
# A column is listed only when more than 0.01% of its rows are missing
# (the threshold below is compared against a value already expressed in percent).

total = data.isnull().sum().sort_values(ascending=False)
percent = data.isnull().mean().sort_values(ascending=False)  # fraction of rows that are null
missing_data = pd.concat([total, percent], axis=1, keys=['Total', 'Missing Percent'])
missing_data['Missing Percent'] = missing_data['Missing Percent'] * 100  # fraction -> percent
missing_data.loc[missing_data['Missing Percent'] > .01][:152]  # display is capped at 152 rows

Unnamed: 0,Total,Missing Percent
location,2458,32.949062
keyword,53,0.710456


In [None]:
# Keep only the model input (clean_text) and the label (target)
necessary_columns = data.loc[:, ['clean_text', 'target']]

In [None]:
# Creating a validation dataframe with 25% of the rows of the original
# dataframe; the draw is seeded so the same rows are selected on every run
validation = necessary_columns.sample(frac = 0.25, random_state = 42)

# Creating a dataframe with the remaining 75% of the rows
train = necessary_columns.drop(validation.index)

# info() prints its own report and returns None, so it is called directly
# instead of being wrapped in print() (which would add a stray "None" line)
print("Validation of the given DataFrame:")
validation.info()

print("Train set of the given DataFrame:")
train.info()

In [9]:
# Hold out 25% of the rows for validation; the rows are shuffled with a
# fixed seed so the split is the same on every run

X_train, X_validation, y_train, y_validation = train_test_split(
    data['clean_text'],
    data['target'],
    test_size = 0.25,
    shuffle = True,
    random_state = 42,
)

In [15]:
from sentence_transformers import SentenceTransformer

# Pre-trained sentence-embedding model
bert = SentenceTransformer('stsb-roberta-large') #1.3 gb


# Vectorize the data
# encode() accepts a list of sentences and returns a single 2-D array with one
# row per sentence, so the texts are embedded in batches rather than with one
# model call per row. Values may differ from per-row encoding by floating-point
# noise only.

X_train_vec = pd.DataFrame(bert.encode(X_train.tolist()))


X_validation_vec = pd.DataFrame(bert.encode(X_validation.tolist()))

# The embedding columns have no feature names; they are simply numbered

# NOTE(review): RandomForestClassifier is not imported in the visible import
# cell - add the import before re-enabling the lines below.
# model = RandomForestClassifier(n_estimators=500, n_jobs=8)
# model.fit(X_train_vec, y_train)
# model.score(X_validation_vec, y_validation)

In [21]:
# Row counts of features and labels must agree within each split
display(X_train_vec.shape, X_validation_vec.shape, y_train.shape)
y_validation.shape

(5595, 1024)

(1865, 1024)

(5595,)

(1865,)

In [26]:
# Check which container type holds the encoded training features
type(X_train_vec)

pandas.core.frame.DataFrame

In [29]:
# Persist the training embeddings; the row index is not written
X_train_vec.to_csv(path_or_buf = 'X_train_vec.csv', index = False)

In [None]:
# Persist the validation embeddings; the row index is not written
X_validation_vec.to_csv(path_or_buf = 'X_validation_vec.csv', index = False)

In [None]:
# Persist the training labels; the row index is not written
y_train.to_csv(path_or_buf = 'y_train.csv', index = False)

In [None]:
# Persist the validation labels; the row index is not written
y_validation.to_csv(path_or_buf = 'y_validation.csv', index = False)