In [1]:
from predictor.SmartPredictor import SmartPredictor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

from transformer.LowerCaseTransformer import LowerCaseTransformer
from transformer.MentionFlagger import MentionFlagger
from transformer.NumberFlagger import NumberFlagger
from transformer.SplitterPunctuation import SplitterPunctuation
from transformer.URLFlagger import URLFlagger
from sklearn.feature_extraction.text import CountVectorizer
from transformer.StopWordFilter import StopWordFilter
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [2]:
# Pin NumPy's global random state so repeated runs (including scikit-learn code
# that falls back on it) start from the same seed.
np.random.seed(seed=0)

In [3]:
# Column names of the training CSV: sentiment label, brand tag and tweet text.
sent_col, brand_col, text_col = "opinion", "brand", "body"

### Get the data

In [4]:
# Read the training set from a CSV located next to the notebook.
df = pd.read_csv(filepath_or_buffer="train_proper.csv")

In [5]:
# Preview the loaded frame; as the cell's last expression it is rendered below.
df

Unnamed: 0,opinion,brand,body
0,neu,apl,20 min line @apple store @short pump.
1,irr,msf,Nueva tecnología convierte cualquier superfici...
2,neu,ggl,Some people should not post replies in #Google...
3,neg,apl,I know a few others having same issue RT @Joel...
4,neg,msf,"#Microsoft - We put the """"backwards"""" into bac..."
...,...,...,...
4168,neg,apl,fuck this see you hoes @ work @WeakTwip @Munnn...
4169,neg,msf,"#Microsoft, #Adobe lose $13.5bn to piracy: Rep..."
4170,neu,twt,"I tried to explain why you would do """"The #Twi..."
4171,neg,apl,Installed io5 - fine on ipad but wiped wife's ...


In [6]:
# Features are the raw text column, targets are the sentiment labels.
# The column names come from the constants declared earlier in the notebook
# (text_col = "body", sent_col = "opinion") instead of being repeated as literals.
X = df[text_col]
# .values turns the label Series into a plain NumPy array.
y = df[sent_col].values

In [7]:
# Inspect the feature Series (the text column selected from df).
X

0                   20 min line @apple store @short pump.
1       Nueva tecnología convierte cualquier superfici...
2       Some people should not post replies in #Google...
3       I know a few others having same issue RT @Joel...
4       #Microsoft - We put the ""backwards"" into bac...
                              ...                        
4168    fuck this see you hoes @ work @WeakTwip @Munnn...
4169    #Microsoft, #Adobe lose $13.5bn to piracy: Rep...
4170    I tried to explain why you would do ""The #Twi...
4171    Installed io5 - fine on ipad but wiped wife's ...
4172    #microsoft #careers site is giving errors for ...
Name: body, Length: 4173, dtype: object

In [8]:
# Inspect the label array (the sentiment values extracted from df).
y

array(['neu', 'irr', 'neu', ..., 'neu', 'neg', 'neg'], dtype=object)

### Predictor

#### Naive Bayes (MultinomialNB)

In [68]:
# Classification pipeline: project text transformers, a bag-of-tokens count
# matrix, then a multinomial Naive Bayes classifier.
pipe = Pipeline([
    # Project-specific transformers, applied in this order (their behaviour is
    # defined in the `transformer` package).
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("Split", SplitterPunctuation()),
    # The identity analyzer hands each document to the vectorizer unchanged —
    # presumably already a token sequence from the "Split" step (TODO confirm).
    # ngram_range is deliberately not passed: scikit-learn ignores it whenever
    # `analyzer` is a callable, so the former ngram_range=(2,2) never produced
    # bigrams. To really use bigrams, generate them inside the analyzer.
    # min_df=10 keeps only terms that occur in at least 10 documents.
    ("count", CountVectorizer(analyzer=lambda x: x, min_df=10)),
    ("Bayesian", MultinomialNB())
])

In [69]:
# Wrap the pipeline currently bound to `pipe` in the project's SmartPredictor,
# which is what gets fitted, scored and cross-validated in the cells below.
predictor = SmartPredictor(pipe=pipe)

In [70]:
# Fit on the full data set ...
predictor.fit(X, y)
# ... and score on those very same samples: the number shown is a score on the
# training data itself, not a held-out estimate (see the cross-validation below).
predictor.score(X, y)

fit


0.7500599089384136

##### Cross-validation score

In [12]:
# 5-fold cross-validation of the predictor on X / y; yields one score per fold.
# NOTE(review): this cell's execution count (12) predates the pipeline cell (68),
# so the stored scores may come from an older pipeline — re-run top to bottom.
scores = cross_val_score(predictor, X, y, cv=5)

fit
fit
fit
fit
fit


In [13]:
# Per-fold scores returned by cross_val_score.
scores

array([0.7508982 , 0.75209581, 0.7497006 , 0.76738609, 0.74460432])

In [14]:
# Summarise the folds: mean score and variance across the 5 folds.
scores.mean(), scores.var()

(0.7529370036904608, 5.870968695378163e-05)

#### SVM

In [76]:
# Classification pipeline: the same project text transformers and count
# features as the Naive Bayes model, with a support-vector classifier on top.
pipe = Pipeline([
    # Project-specific transformers, applied in this order (their behaviour is
    # defined in the `transformer` package).
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("Split", SplitterPunctuation()),
    # The identity analyzer hands each document to the vectorizer unchanged —
    # presumably already a token sequence from the "Split" step (TODO confirm).
    # ngram_range is deliberately not passed: scikit-learn ignores it whenever
    # `analyzer` is a callable, so the former ngram_range=(2,2) never produced
    # bigrams. To really use bigrams, generate them inside the analyzer.
    # min_df=10 keeps only terms that occur in at least 10 documents.
    ("count", CountVectorizer(analyzer=lambda x: x, min_df=10)),
    # SVC with its default hyper-parameters.
    ("SVM", SVC())
])

In [72]:
# Rebind `predictor` to a SmartPredictor wrapping the pipeline currently bound to `pipe`.
# NOTE(review): the pipeline cell above carries execution count 76 while this cell
# is 72, i.e. the pipeline was re-run afterwards — re-run top to bottom to make
# sure the scores below belong to the pipeline shown.
predictor = SmartPredictor(pipe=pipe)

In [73]:
# Fit on the full data set ...
predictor.fit(X, y)
# ... and score on those very same samples: this is a score on the training data,
# not a held-out estimate (compare with the cross-validation scores below).
predictor.score(X, y)

fit


0.8109273903666427

In [74]:
# 5-fold cross-validation of the predictor on X / y; yields one score per fold.
scores = cross_val_score(predictor, X, y, cv=5)

fit
fit
fit
fit
fit


In [19]:
# Per-fold scores returned by cross_val_score.
# NOTE(review): execution count 19 is older than the cross-validation cell above (74),
# so this output may be stale — re-run to refresh.
scores

array([0.7245509 , 0.73053892, 0.73652695, 0.74580336, 0.73501199])

In [20]:
# Summarise the folds: mean score and variance across the 5 folds.
scores.mean(), scores.var()

(0.7344864228377778, 4.936207476400558e-05)