In [4]:
from predictor.SmartPredictor import SmartPredictor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

from transformer.LowerCaseTransformer import LowerCaseTransformer
from transformer.MentionFlagger import MentionFlagger
from transformer.NumberFlagger import NumberFlagger
from transformer.SplitterPunctuation import SplitterPunctuation, split_punctuation
from transformer.URLFlagger import URLFlagger
from sklearn.feature_extraction.text import CountVectorizer
from transformer.StopWordFilter import StopWordFilter
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [5]:
np.random.seed(0) # for reproducibility with sklearn

In [6]:
sent_col = "opinion"
brand_col = "brand"
text_col = "body"

### Get the data

In [7]:
df = pd.read_csv("train_proper.csv")

In [8]:
df

Unnamed: 0,opinion,brand,body
0,neu,apl,20 min line @apple store @short pump.
1,irr,msf,Nueva tecnología convierte cualquier superfici...
2,neu,ggl,Some people should not post replies in #Google...
3,neg,apl,I know a few others having same issue RT @Joel...
4,neg,msf,"#Microsoft - We put the """"backwards"""" into bac..."
...,...,...,...
4168,neg,apl,fuck this see you hoes @ work @WeakTwip @Munnn...
4169,neg,msf,"#Microsoft, #Adobe lose $13.5bn to piracy: Rep..."
4170,neu,twt,"I tried to explain why you would do """"The #Twi..."
4171,neg,apl,Installed io5 - fine on ipad but wiped wife's ...


In [9]:
X = df["body"]
y = df["opinion"].values

In [10]:
X

0                   20 min line @apple store @short pump.
1       Nueva tecnología convierte cualquier superfici...
2       Some people should not post replies in #Google...
3       I know a few others having same issue RT @Joel...
4       #Microsoft - We put the ""backwards"" into bac...
                              ...                        
4168    fuck this see you hoes @ work @WeakTwip @Munnn...
4169    #Microsoft, #Adobe lose $13.5bn to piracy: Rep...
4170    I tried to explain why you would do ""The #Twi...
4171    Installed io5 - fine on ipad but wiped wife's ...
4172    #microsoft #careers site is giving errors for ...
Name: body, Length: 4173, dtype: object

In [11]:
y

array(['neu', 'irr', 'neu', ..., 'neu', 'neg', 'neg'], dtype=object)

## Predictor

### Bayesian

In [12]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    #("count", CountVectorizer(analyzer=lambda x:x),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("Bayesian", MultinomialNB())
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.9329019889767554  accuracy on train set
fit
fit
fit
fit
fit
[0.74730539 0.74610778 0.75928144 0.76858513 0.75539568]
0.755335085225233 6.815204001228885e-05


### SVM

In [13]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    #("count", CountVectorizer(analyzer=lambda x:x),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("SVM", SVC())
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8873711957824107  accuracy on train set
fit
fit
fit
fit
fit
[0.71137725 0.71137725 0.72215569 0.73501199 0.72302158]
0.720588750556441 7.722149311836644e-05


### Logistic regression

#### Liblinear + One versus rest scheme

In [14]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("Maxent", LogisticRegression(solver="liblinear")) # good solver for small dataset
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.9374550682961897  accuracy on train set
fit
fit
fit
fit
fit
[0.73652695 0.74850299 0.77964072 0.79016787 0.75899281]
0.7627662660290929 0.00038835533085022304


#### newton cg + multinomial loss

In [15]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("Maxent", LogisticRegression(solver="newton-cg"))
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.9379343398034987  accuracy on train set
fit
fit
fit
fit
fit
[0.73892216 0.7497006  0.77724551 0.78896882 0.75779376]
0.7625261706802222 0.000331982673140113


#### Lbfgs + multinomial

In [16]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("Maxent", LogisticRegression(solver="lbfgs"))
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.9379343398034987  accuracy on train set
fit
fit
fit
fit
fit
[0.73892216 0.7497006  0.77724551 0.78896882 0.75779376]
0.7625261706802222 0.000331982673140113


### Random forest

In [18]:
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [19]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("RF", clf) # default
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.9376947040498442  accuracy on train set
fit
fit
fit
fit
fit
[0.71616766 0.72215569 0.74131737 0.74940048 0.73021583]
0.7318514051034621 0.00014804723295718563


### XGBoost

#### Multi softmax

In [20]:
clf = XGBClassifier(objective="multi:softmax")
clf.get_params()

{'objective': 'multi:softmax',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [21]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("RF", clf) # default
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8646057991852384  accuracy on train set
fit
fit
fit
fit
fit
[0.74610778 0.73413174 0.76287425 0.76858513 0.76378897]
0.7550975746349029 0.00016766278645828764


#### Multi softprob

In [22]:
clf = XGBClassifier(objective="multi:softprob")
clf.get_params()

{'objective': 'multi:softprob',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [23]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("RF", clf) # default
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8646057991852384  accuracy on train set
fit
fit
fit
fit
fit
[0.74610778 0.73413174 0.76287425 0.76858513 0.76378897]
0.7550975746349029 0.00016766278645828764
