In [1]:
# Pickle
import pickle

# utils
import numpy as np
import pandas as pd

from load_utils import text2df
import nlp

# TF-IDF
from dojo.nlp import TF_IDF

# Dojo Classifiers
from dojo.linear import LogisticRegression
from dojo.bayes import NaiveBayes
from dojo.tree import ClassificationTree

# Model Selection
from dojo.split import cross_validate, train_test_split

# Metrics
from dojo.metrics.classification import accuracy_score

TF-IDF && Logistic binary not loaded.


***

# Data Loading into Data Frame

In [2]:
# Read the Yelp labelled-sentences file into a DataFrame via the project's
# text2df helper.
yelp_path = "../../data/sentiment_labelled_sentences/yelp_labelled.txt"
data = text2df(yelp_path)

In [3]:
# Preview the first rows of the freshly loaded (still raw) frame.
data.head()

Unnamed: 0,text,label
0,Wow... Loved this place.,1
1,Crust is not good.,0
2,Not tasty and the texture was just nasty.,0
3,Stopped by during the late May bank holiday of...,1
4,The selection on the menu was great and so wer...,1


***

# Natural Language Processing

## Standardizing

In [4]:
# Run the project's standardization step over the text column and write the
# result back into the same column.
# NOTE: the column is overwritten in place, so re-running this cell applies
# the step to already-standardized text.
standardized_text = nlp.standardize(data["text"])
data["text"] = standardized_text

## Remove Noise

In [5]:
# Run the project's noise-removal step over the text column and write the
# result back into the same column.
# NOTE: the column is overwritten in place, so re-running this cell applies
# the step to already-cleaned text.
denoised_text = nlp.remove_noise(data["text"])
data["text"] = denoised_text

## Lemmatizing / Stemming

In [6]:
# data["text"] = nlp.lemmatize(data["text"])
# data["text"] = nlp.stem(data["text"])

## Save data

In [7]:
# data.to_csv(...+"final.csv")

***

# Exploratory Data Analysis

In [8]:
# Column dtypes, non-null counts and memory footprint of the processed frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 2 columns):
text     1000 non-null object
label    1000 non-null object
dtypes: object(2)
memory usage: 15.7+ KB


In [9]:
# Per-column summary (count / unique / top / freq) of the processed frame.
data.describe()

Unnamed: 0,text,label
count,1000,1000
unique,991,2
top,awesome,0
freq,2,500


In [10]:
# Preview the first rows again, now that the text column has been processed.
data.head()

Unnamed: 0,text,label
0,wow loved place,1
1,crust not good,0
2,not tasty texture nasty,0
3,stopped late may bank holiday rick steve recom...,1
4,selection menu great prices,1


***

# Text Vectorization

In [11]:
# Features are the processed sentences; targets are the sentiment labels.
X = data["text"]
y = data["label"]

In [12]:
# Reuse a previously pickled TF-IDF vectorizer when one can be loaded;
# otherwise fit a new one on the current text.
# NOTE(security): pickle.load can execute arbitrary code contained in the
# file -- only load a cache that this notebook wrote itself.
# NOTE(review): a cached vectorizer was fitted on whatever text existed when
# it was saved; delete ./tfidf.b after changing the preprocessing -- confirm
# this is the intended workflow.
try:
    # Context manager guarantees the handle is closed even if unpickling fails.
    with open("./tfidf.b", mode="rb") as f:
        tfidf = pickle.load(f)
except Exception as load_error:
    # Best-effort cache: keep the fallback, but report why the cache was not
    # used instead of swallowing the error silently.
    print("TF-IDF cache not loaded:", repr(load_error))
    tfidf = TF_IDF(idf_weighting_scheme="smooth").fit(X)

In [13]:
# Vectorize the text with the loaded-or-fitted TF-IDF model.
X_tf = tfidf.transform(X)

In [14]:
# Persist the vectorizer so a later run can load it instead of refitting.
with open("./tfidf.b", mode="wb") as cache_file:
    pickle.dump(tfidf, cache_file)

***

# Model evaluation

In [16]:
# (X_tf, y)
# NOTE(review): the Naive Bayes and Classification Tree cells below use
# X_train, X_test, y_train and y_test, but no visible cell defines them, so
# the notebook fails with NameError on a fresh kernel (Restart & Run All).
# train_test_split is imported above but never called; presumably this cell
# once held the split of (X_tf, y) -- restore it and confirm its signature.

### Logistic Regression

In [None]:
# Cross-validate a default-configured logistic regression on the TF-IDF
# features, scoring with accuracy.
logistic_model = LogisticRegression()
cross_validate(logistic_model, X_tf, y, metric=accuracy_score)

In [None]:
# Fit a logistic regression on the full vectorized data and persist it.
# The cell previously dumped `ln`, a name no cell defines: it raised NameError
# only after open(..., "wb") had already truncated ./logistic.b.
# Fitting happens before the file is opened, so a failure here leaves any
# existing file untouched.
# NOTE(review): assumes LogisticRegression.fit returns the fitted estimator,
# as the NaiveBayes / ClassificationTree cells in this notebook rely on for
# their models -- confirm.
ln = LogisticRegression().fit(X_tf, y)
with open("./logistic.b", mode="wb") as f:
    pickle.dump(ln, f)

### Naive Bayes

In [27]:
# Fit Naive Bayes on the training split and score accuracy on the test split.
# NOTE(review): the recorded accuracy (~0.513) is close to the 50/50 label
# balance reported by data.describe() -- worth checking the predictions.
bayes = NaiveBayes().fit(X_train, y_train)
bayes_predictions = bayes.predict(X_test)
accuracy_score(y_test, bayes_predictions)

0.5133333333333333

### Classification Tree

In [28]:
# Fit a classification tree on the training split and score accuracy on the
# test split.
# NOTE(review): the recorded accuracy (~0.513) is close to the 50/50 label
# balance reported by data.describe() -- worth checking the predictions.
tree = ClassificationTree().fit(X_train, y_train)
tree_predictions = tree.predict(X_test)
accuracy_score(y_test, tree_predictions)

0.5133333333333333

## Grid search (hyperparameter tuning)