In [1]:
from utils.preprocessor import Preprocessor
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

## Preprocess the Data

In [2]:
df_train = pd.read_csv(r'data\train.csv')
test = pd.read_csv(r'data\test.csv')

In [3]:
preprocessor = Preprocessor()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\PrinceEGY\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [4]:
df_train['text'] = preprocessor(df_train.text)
df_train['text']

0       [deed, reason, earthquake, may, allah, forgive...
1           [forest, fire, near, la, ronge, sask, canada]
2       [resident, asked, shelter, place, notified, of...
3       [13000, people, receive, wildfire, evacuation,...
4       [got, sent, photo, ruby, alaska, smoke, wildfi...
                              ...                        
7608    [two, giant, crane, holding, bridge, collapse,...
7609    [ariaahrary, thetawniest, control, wild, fire,...
7610                [m194, 0104, utc5km, volcano, hawaii]
7611    [police, investigating, ebike, collided, car, ...
7612    [latest, home, razed, northern, california, wi...
Name: text, Length: 7613, dtype: object

## Train-validate split

In [5]:
X = df_train[['text']]
y = df_train[['target']]

In [6]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2)

## Feature Extraction

### Binary Vectorizer

In [7]:
vec = CountVectorizer(binary= True)

In [8]:
X_train_joined = X_train.text.apply(lambda x: " ".join(x))
X_val_joined = X_val.text.apply(lambda x: " ".join(x))

In [9]:
vec.fit(X_train_joined)

In [10]:
X_train_vectorized = pd.DataFrame(vec.transform(X_train_joined).toarray(), columns=sorted(vec.vocabulary_.keys()))
X_val_vectorized = pd.DataFrame(vec.transform(X_val_joined).toarray(), columns=sorted(vec.vocabulary_.keys()))

In [11]:
X_train_vectorized.iloc[15:20, 1000:2000]

Unnamed: 0,amp039monsteramp039,amp163163millions,ampamp,ampask,ampmdash,ampor,ampstart,ampstory,ampwanted,amreading,...,bookofdaniel,booksbyroger,booktubeathon,boom,boone,booradleyvancullen,boost,boot,booth,booze
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# TODO:: modelling & evaluation

-----------
### Counting Vectorizer

In [13]:
vec = CountVectorizer()

In [14]:
X_train_joined = X_train.text.apply(lambda x: " ".join(x))
X_val_joined = X_val.text.apply(lambda x: " ".join(x))

In [15]:
vec.fit(X_train_joined)

In [16]:
X_train_vectorized = pd.DataFrame(vec.transform(X_train_joined).toarray(), columns=sorted(vec.vocabulary_.keys()))
X_val_vectorized = pd.DataFrame(vec.transform(X_val_joined).toarray(), columns=sorted(vec.vocabulary_.keys()))

In [17]:
X_train_vectorized.iloc[15:20, 1000:2000]

Unnamed: 0,amp039monsteramp039,amp163163millions,ampamp,ampask,ampmdash,ampor,ampstart,ampstory,ampwanted,amreading,...,bookofdaniel,booksbyroger,booktubeathon,boom,boone,booradleyvancullen,boost,boot,booth,booze
15,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
16,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
17,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
18,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [18]:
# TODO:: modelling & evaluation

-----------
### TF-IDF

In [19]:
vec = TfidfVectorizer()

In [20]:
X_train_joined = X_train.text.apply(lambda x: " ".join(x))
X_val_joined = X_val.text.apply(lambda x: " ".join(x))

In [21]:
vec.fit(X_train_joined)

In [22]:
X_train_vectorized = pd.DataFrame(vec.transform(X_train_joined).toarray(), columns=sorted(vec.vocabulary_.keys()))
X_val_vectorized = pd.DataFrame(vec.transform(X_val_joined).toarray(), columns=sorted(vec.vocabulary_.keys()))

In [23]:
X_train_vectorized.iloc[15:20, 1000:2000]

Unnamed: 0,amp039monsteramp039,amp163163millions,ampamp,ampask,ampmdash,ampor,ampstart,ampstory,ampwanted,amreading,...,bookofdaniel,booksbyroger,booktubeathon,boom,boone,booradleyvancullen,boost,boot,booth,booze
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
# TODO:: modelling & evaluation