# NLP Project
## Libraries

In [1]:
#algorithms
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

#model validation and hyper-parameter tuning
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import accuracy_score, plot_confusion_matrix, classification_report, precision_score, recall_score
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline

#tokenization and stop words
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from spacy.lang.en import STOP_WORDS

#data manipulation
import pandas as pd
import numpy as np
import gzip
import warnings
warnings.filterwarnings('ignore')

## Data Loading and Preprocessing

In [2]:
train = pd.read_csv('train.csv')

In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
key_words = list(train['keyword'].unique())[1:]
key_words

['ablaze',
 'accident',
 'aftershock',
 'airplane%20accident',
 'ambulance',
 'annihilated',
 'annihilation',
 'apocalypse',
 'armageddon',
 'army',
 'arson',
 'arsonist',
 'attack',
 'attacked',
 'avalanche',
 'battle',
 'bioterror',
 'bioterrorism',
 'blaze',
 'blazing',
 'bleeding',
 'blew%20up',
 'blight',
 'blizzard',
 'blood',
 'bloody',
 'blown%20up',
 'body%20bag',
 'body%20bagging',
 'body%20bags',
 'bomb',
 'bombed',
 'bombing',
 'bridge%20collapse',
 'buildings%20burning',
 'buildings%20on%20fire',
 'burned',
 'burning',
 'burning%20buildings',
 'bush%20fires',
 'casualties',
 'casualty',
 'catastrophe',
 'catastrophic',
 'chemical%20emergency',
 'cliff%20fall',
 'collapse',
 'collapsed',
 'collide',
 'collided',
 'collision',
 'crash',
 'crashed',
 'crush',
 'crushed',
 'curfew',
 'cyclone',
 'damage',
 'danger',
 'dead',
 'death',
 'deaths',
 'debris',
 'deluge',
 'deluged',
 'demolish',
 'demolished',
 'demolition',
 'derail',
 'derailed',
 'derailment',
 'desolate',
 'de

In [6]:
#copy the training set into a new dataframe for manipulation purposes

data = train.copy()

#### Keyword Column
If less than 1% of the keyword values are null, drop the null null rows.

In [10]:
sum_null_keywords = data.keyword.isnull().sum()
total_keyword_count = data.keyword.count()
percentage_of_null_keywords = (sum_null_keywords/total_keyword_count) *100

In [13]:
print('{:.2f} % of keywords are null'.format(percentage_of_null_keywords))

0.81 % of keywords are null


#### Location Column

In [14]:
sum_of_null_locations = data.location.isnull().sum()
total_location_count = data.location.count()
percentage_of_null_locations = (sum_of_null_locations/total_location_count) * 100

In [15]:
print('{:.2f} % of keywords are null'.format(percentage_of_null_locations))

49.86 % of keywords are null


#### Drop rows with null keyword and location

In [16]:
data = data.dropna(axis=0, subset=['keyword', 'location'], how = 'all')

In [17]:
data.isna().sum()

id             0
keyword        0
location    2472
text           0
target         0
dtype: int64

### Preprocessing

In [18]:
# column select transformer

class KeySelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key
            
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        return [row[self.key] for row in X]