In [13]:
from tweet_sent_predictor.predictor.ChainPredictor import ChainPredictor
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline

from tweet_sent_predictor.transformer.LowerCaseTransformer import LowerCaseTransformer
from tweet_sent_predictor.transformer.MentionFlagger import MentionFlagger
from tweet_sent_predictor.transformer.NumberFlagger import NumberFlagger
from tweet_sent_predictor.transformer.SplitterPunctuation import SplitterPunctuation, split_punctuation
from tweet_sent_predictor.transformer.URLFlagger import URLFlagger
from sklearn.feature_extraction.text import CountVectorizer
from tweet_sent_predictor.transformer.StopWordFilter import StopWordFilter
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

In [2]:
np.random.seed(0) # for reproducibility with sklearn

In [3]:
sent_col = "opinion"
brand_col = "brand"
text_col = "body"

### Get the data

In [4]:
df = pd.read_csv("tweet_sent_predictor/data/train_proper.csv")

In [5]:
df = df.iloc[:100]
df

Unnamed: 0,opinion,brand,body
0,neu,apl,20 min line @apple store @short pump.
1,irr,msf,Nueva tecnología convierte cualquier superfici...
2,neu,ggl,Some people should not post replies in #Google...
3,neg,apl,I know a few others having same issue RT @Joel...
4,neg,msf,"#Microsoft - We put the """"backwards"""" into bac..."
...,...,...,...
95,neu,twt,Twitter Buzz Builds for the Occupy Wall Street...
96,neg,msf,"#Google Apps vs. #Microsoft #Office 365: """"it ..."
97,pos,apl,This good here iPhone will do me VERY well tod...
98,irr,msf,#Microsoft ofrece un sistema de #codificación ...


In [6]:
X = df["body"]
y = df["opinion"].values

In [7]:
X

0                 20 min line @apple store @short pump.
1     Nueva tecnología convierte cualquier superfici...
2     Some people should not post replies in #Google...
3     I know a few others having same issue RT @Joel...
4     #Microsoft - We put the ""backwards"" into bac...
                            ...                        
95    Twitter Buzz Builds for the Occupy Wall Street...
96    #Google Apps vs. #Microsoft #Office 365: ""it ...
97    This good here iPhone will do me VERY well tod...
98    #Microsoft ofrece un sistema de #codificación ...
99    #Job #ICT Sachbearbeiter Immobilienbewirtschaf...
Name: body, Length: 100, dtype: object

In [8]:
y

array(['neu', 'irr', 'neu', 'neg', 'neg', 'neg', 'neu', 'irr', 'neu',
       'irr', 'neu', 'neg', 'neu', 'neu', 'pos', 'neu', 'neu', 'pos',
       'neu', 'irr', 'neu', 'irr', 'neu', 'neg', 'irr', 'neg', 'irr',
       'neg', 'neu', 'irr', 'neu', 'pos', 'neg', 'neu', 'irr', 'irr',
       'neu', 'irr', 'irr', 'neu', 'irr', 'neu', 'irr', 'neu', 'neg',
       'neu', 'irr', 'pos', 'neu', 'neu', 'irr', 'irr', 'neu', 'neu',
       'neu', 'neu', 'irr', 'irr', 'neg', 'neu', 'neu', 'pos', 'neu',
       'neu', 'pos', 'neu', 'irr', 'irr', 'irr', 'neu', 'neu', 'irr',
       'irr', 'irr', 'neu', 'neu', 'irr', 'neu', 'neu', 'pos', 'irr',
       'neu', 'irr', 'neg', 'irr', 'neg', 'neu', 'neu', 'neu', 'pos',
       'pos', 'neu', 'irr', 'pos', 'irr', 'neu', 'neg', 'pos', 'irr',
       'irr'], dtype=object)

## Predictor

### Bayesian

In [9]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    #("count", CountVectorizer(analyzer=lambda x:x),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)))
])

clf = MultinomialNB()

predictor = ChainPredictor(pipe=pipe, clf=clf)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.96  accuracy on train set
fit
fit
fit
fit
fit
[0.6  0.6  0.55 0.6  0.75]
0.62 0.004599999999999999


### SVM

In [16]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    #("count", CountVectorizer(analyzer=lambda x:x),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)))
])

clf = SGDClassifier(loss="hinge", penalty="l2", alpha=1e-3)

predictor = ChainPredictor(pipe=pipe, clf=clf)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
1.0  accuracy on train set
fit
fit
fit
fit
fit
[0.5  0.5  0.45 0.5  0.6 ]
0.51 0.002399999999999999


### Logistic regression

#### Liblinear + One versus rest scheme

In [20]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)))
])

clf = LogisticRegression(solver="liblinear")

predictor = ChainPredictor(pipe=pipe, clf=clf)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.99  accuracy on train set
fit
fit
fit
fit
fit
[0.6  0.6  0.5  0.55 0.55]
0.5599999999999999 0.001399999999999999


#### newton cg + multinomial loss

In [12]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("Maxent", LogisticRegression(solver="newton-cg"))
])

predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.9439252336448598  accuracy on train set
fit
fit
fit
fit
fit
[0.74850299 0.74491018 0.78682635 0.78297362 0.77098321]
0.7668392710980917 0.000298815366593932


#### Lbfgs + multinomial

In [21]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)))
])

clf = LogisticRegression(solver="lbfgs")

predictor = ChainPredictor(pipe=pipe, clf=clf)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
1.0  accuracy on train set
fit
fit
fit
fit
fit
[0.55 0.6  0.55 0.5  0.45]
0.53 0.0026


### Random forest

In [14]:
clf = RandomForestClassifier()
clf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': None,
 'verbose': 0,
 'warm_start': False}

In [22]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)))
])

predictor = ChainPredictor(pipe=pipe, clf=clf)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
1.0  accuracy on train set
fit
fit
fit
fit
fit
[0.55 0.6  0.55 0.5  0.45]
0.53 0.0026


### XGBoost

#### Multi softmax

In [16]:
clf = XGBClassifier(objective="multi:softmax")
clf.get_params()

{'objective': 'multi:softmax',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [24]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)))
])

predictor = ChainPredictor(pipe=pipe, clf=clf)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
1.0  accuracy on train set
fit
fit
fit
fit
fit
[0.55 0.6  0.55 0.5  0.45]
0.53 0.0026


#### Multi softprob

In [18]:
clf = XGBClassifier(objective="multi:softprob")
clf.get_params()

{'objective': 'multi:softprob',
 'use_label_encoder': True,
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'importance_type': 'gain',
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'missing': nan,
 'monotone_constraints': None,
 'n_estimators': 100,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None}

In [25]:
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)))
])

predictor = ChainPredictor(pipe=pipe, clf=clf)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
1.0  accuracy on train set
fit
fit
fit
fit
fit
[0.55 0.6  0.55 0.5  0.45]
0.53 0.0026
