In [30]:
from tweet_sent_predictor.predictor.SmartPredictor import SmartPredictor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

from tweet_sent_predictor.transformer.LowerCaseTransformer import LowerCaseTransformer
from tweet_sent_predictor.transformer.MentionFlagger import MentionFlagger
from tweet_sent_predictor.transformer.NumberFlagger import NumberFlagger
from tweet_sent_predictor.transformer.SplitterPunctuation import SplitterPunctuation, split_punctuation
from tweet_sent_predictor.transformer.URLFlagger import URLFlagger
from tweet_sent_predictor.transformer.HashtagToWords import HashtagToWords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from tweet_sent_predictor.transformer.StopWordFilter import StopWordFilter
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
np.random.seed(0) # for reproducibility with sklearn

In [3]:
sent_col = "opinion"
brand_col = "brand"
text_col = "body"

### Get the data

In [4]:
df = pd.read_csv("tweet_sent_predictor/data/train_proper.csv")

In [5]:
df

Unnamed: 0,opinion,brand,body
0,neu,apl,20 min line @apple store @short pump.
1,irr,msf,Nueva tecnología convierte cualquier superfici...
2,neu,ggl,Some people should not post replies in #Google...
3,neg,apl,I know a few others having same issue RT @Joel...
4,neg,msf,"#Microsoft - We put the """"backwards"""" into bac..."
...,...,...,...
4168,neg,apl,fuck this see you hoes @ work @WeakTwip @Munnn...
4169,neg,msf,"#Microsoft, #Adobe lose $13.5bn to piracy: Rep..."
4170,neu,twt,"I tried to explain why you would do """"The #Twi..."
4171,neg,apl,Installed io5 - fine on ipad but wiped wife's ...


In [6]:
X = df["body"]
y = df["opinion"].values

In [7]:
X

0                   20 min line @apple store @short pump.
1       Nueva tecnología convierte cualquier superfici...
2       Some people should not post replies in #Google...
3       I know a few others having same issue RT @Joel...
4       #Microsoft - We put the ""backwards"" into bac...
                              ...                        
4168    fuck this see you hoes @ work @WeakTwip @Munnn...
4169    #Microsoft, #Adobe lose $13.5bn to piracy: Rep...
4170    I tried to explain why you would do ""The #Twi...
4171    Installed io5 - fine on ipad but wiped wife's ...
4172    #microsoft #careers site is giving errors for ...
Name: body, Length: 4173, dtype: object

In [8]:
y

array(['neu', 'irr', 'neu', ..., 'neu', 'neg', 'neg'], dtype=object)

## Predictor

### Bayesian

In [9]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    #("count", CountVectorizer(analyzer=lambda x:x),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3), min_df=5)),
    ("tfidf", TfidfTransformer()),
    ("Bayesian", MultinomialNB())
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.7766594775940571  accuracy on train set
fit
fit
fit
fit
fit
[0.73413174 0.71497006 0.74610778 0.73860911 0.74460432]
0.7356846020189837 0.00012565208977783233


### SVM

In [32]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    #("count", CountVectorizer(analyzer=lambda x:x),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3), min_df=5)),
    ("SVM", SGDClassifier(loss="hinge", penalty="l2", alpha=1e-3))
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8658039779535107  accuracy on train set
fit
fit
fit
fit
fit
[0.73293413 0.72215569 0.75568862 0.7470024  0.77697842]
0.7469518516922989 0.00035785490788470297


### Logistic regression

#### Liblinear + One versus rest scheme

In [23]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3), min_df=5)),
    ("Maxent", LogisticRegression(solver="liblinear")) # good solver for small dataset
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8629283489096573  accuracy on train set
fit
fit
fit
fit
fit
[0.74251497 0.73293413 0.76526946 0.75539568 0.77458034]
0.7541389164117807 0.0002256158267353151


#### newton cg + multinomial loss

In [24]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3), min_df=5)),
    ("Maxent", LogisticRegression(solver="newton-cg"))
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8804217589264318  accuracy on train set
fit
fit
fit
fit
fit
[0.73892216 0.72934132 0.76646707 0.74940048 0.77218225]
0.7512626545470211 0.0002610208809017463


#### Lbfgs + multinomial

In [25]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3), min_df=5)),
    ("Maxent", LogisticRegression(solver="lbfgs"))
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


fit
0.8804217589264318  accuracy on train set


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


fit


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


fit


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


fit


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


fit


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


fit
[0.73892216 0.72934132 0.76646707 0.74940048 0.77218225]
0.7512626545470211 0.0002610208809017463


### Random forest

In [14]:
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [22]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3), min_df=5)),
    ("RF", clf) # default
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())



fit
0.8638868919242751  accuracy on train set




fit




fit




fit




fit




fit
[0.73892216 0.72694611 0.7760479  0.75539568 0.77098321]
0.753659012909433 0.0003470321324076229


### XGBoost

#### Multi softmax

In [16]:
clf = XGBClassifier(objective="multi:softmax")
clf.get_params()

{'objective': 'multi:softmax',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [20]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3), min_df=5)),
    ("RF", clf) # default
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())



fit
0.8638868919242751  accuracy on train set




fit




fit




fit




fit




fit
[0.73892216 0.72694611 0.7760479  0.75539568 0.77098321]
0.753659012909433 0.0003470321324076229


#### Multi softprob

In [18]:
clf = XGBClassifier(objective="multi:softprob")
clf.get_params()

{'objective': 'multi:softprob',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [21]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3), min_df=5)),
    ("RF", clf) # default
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())



fit
0.8638868919242751  accuracy on train set




fit




fit




fit




fit




fit
[0.73892216 0.72694611 0.7760479  0.75539568 0.77098321]
0.753659012909433 0.0003470321324076229
