In [1]:
# Pickle
import pickle

# utils
import numpy as np
import pandas as pd

import nlp

# TF-IDF
from dojo.nlp import TF_IDF

# Dojo Classifiers
from dojo.linear import LogisticRegression
from dojo.bayes import NaiveBayes
from dojo.tree import ClassificationTree

# Model Selection
from dojo.split import cross_validate

# Metrics
from dojo.metrics.classification import accuracy_score

***

# Loading the Data into a DataFrame

In [2]:
# Read the sentiment dataset; the first CSV column is used as the row index.
data = pd.read_csv(
    "../../data/sentiment.csv",
    index_col=0,
)

In [3]:
# Preview the first rows of the freshly loaded frame (`text` and `label`).
data.head()

Unnamed: 0,text,label
0,da vinci code book just awesom,1
1,wa first clive cussler have ever read even boo...,1
2,like da vinci code lot,1
3,like da vinci code lot,1
4,like da vinci code ultimatli not seem hold it is,1


***

# Natural Language Processing

## Standardizing

In [4]:
# Run the project's `nlp.standardize` step on the frame. The return value is
# discarded, so `data` is presumably modified in place — TODO confirm.
nlp.standardize(data)

## Removing Noise

In [5]:
# Run the project's `nlp.remove_noise` step on the frame. The return value is
# discarded, so `data` is presumably modified in place — TODO confirm.
nlp.remove_noise(data)

## Lemmatizing / Stemming

In [6]:
# Stemming is the active normalization step; the lemmatization call below is
# left disabled as the alternative. The return value of `nlp.stem` is
# discarded, so `data` is presumably modified in place — TODO confirm.
# lemmatize(data)
nlp.stem(data)

## Save data

In [7]:
# Disabled: would write the preprocessed frame to a CSV whose name ends in
# "final.csv". The directory prefix is elided ("...") and must be supplied
# before this line is re-enabled.
# data.to_csv(...+"final.csv")

***

# Exploratory Data Analysis

In [8]:
# Print the frame summary: index range, per-column non-null counts and dtypes,
# and memory usage. Useful here to confirm there are no missing values.
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 10086 entries, 0 to 10085
Data columns (total 2 columns):
text     10086 non-null object
label    10086 non-null int64
dtypes: int64(1), object(1)
memory usage: 236.4+ KB


In [9]:
# Summary statistics for the numeric columns (only `label` here). With a 0/1
# label, the `mean` row is the fraction of rows labelled 1.
data.describe()

Unnamed: 0,label
count,10086.0
mean,0.544815
std,0.498012
min,0.0
25%,0.0
50%,1.0
75%,1.0
max,1.0


In [10]:
# Re-inspect the first rows after the NLP steps to see their effect on `text`.
data.head()

Unnamed: 0,text,label
0,da vinci code book awesom,1
1,wa first clive cussler ever read even book lik...,1
2,like da vinci code lot,1
3,like da vinci code lot,1
4,like da vinci code ultimatli not seem hold,1


***

# Text Vectorization

In [11]:
# Documents to vectorize (features) and their class labels (target).
X = data["text"]
y = data["label"]

In [12]:
# Fit the project's TF-IDF vectorizer on every document in `X`. The result of
# `fit` is bound to `tfidf`, so `fit` is expected to return the fitted
# vectorizer — TODO confirm against `dojo.nlp.TF_IDF`.
# NOTE(review): fitting on all rows before cross-validation means held-out
# documents presumably contribute to the learned statistics; confirm this
# leakage is acceptable for the reported scores.
tfidf = TF_IDF().fit(X.values)

In [17]:
# Vectorize the same documents the vectorizer was fitted on.
X_tf = tfidf.transform(X.values)

In [26]:
# Display the transformed matrix; NumPy elides the interior of large arrays.
X_tf

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [16]:
# Disabled: would serialize the fitted `tfidf` vectorizer with pickle.
# The output path is elided ("...") and must be filled in before enabling.
# with open("...", mode="wb") as f:
#     pickle.dump(tfidf, f)

***

# Model Evaluation

### Logistic Regression

In [None]:
# Cross-validate a default-constructed LogisticRegression on the TF-IDF
# features, using accuracy_score as the metric.
# NOTE(review): this cell was saved without an execution count or output —
# re-run it so the notebook records a result for this model.
cross_validate(LogisticRegression(), X_tf, y, metric=accuracy_score)

### Naive Bayes

In [29]:
# Cross-validate a default-constructed NaiveBayes on the same TF-IDF features
# and labels, using accuracy_score as the metric.
cross_validate(NaiveBayes(), X_tf, y, metric=accuracy_score)

### Classification Tree

In [30]:
# Cross-validate a default-constructed ClassificationTree on the same TF-IDF
# features and labels, using accuracy_score as the metric.
cross_validate(ClassificationTree(), X_tf, y, metric=accuracy_score)

## Grid Search (Hyper-parameter Tuning)