In [18]:
from xgboost import XGBClassifier
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import warnings

from tweet_sent_predictor.transformer.LowerCaseTransformer import LowerCaseTransformer
from tweet_sent_predictor.transformer.MentionFlagger import MentionFlagger
from tweet_sent_predictor.transformer.NumberFlagger import NumberFlagger
from tweet_sent_predictor.transformer.SplitterPunctuation import SplitterPunctuation, split_punctuation
from tweet_sent_predictor.transformer.URLFlagger import URLFlagger
from tweet_sent_predictor.transformer.HashtagToWords import HashtagToWords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from tweet_sent_predictor.transformer.StopWordFilter import StopWordFilter
from tweet_sent_predictor.predictor.SmartPredictor import SmartPredictor

In [10]:
# Training data CSV; the path is relative to the current working directory.
TRAIN_CSV_PATH = "tweet_sent_predictor/data/train_proper.csv"
df = pd.read_csv(TRAIN_CSV_PATH)

In [11]:
# Show the loaded frame (columns seen in the output: opinion, brand, body).
df

Unnamed: 0,opinion,brand,body
0,neu,apl,20 min line @apple store @short pump.
1,irr,msf,Nueva tecnología convierte cualquier superfici...
2,neu,ggl,Some people should not post replies in #Google...
3,neg,apl,I know a few others having same issue RT @Joel...
4,neg,msf,"#Microsoft - We put the """"backwards"""" into bac..."
...,...,...,...
4168,neg,apl,fuck this see you hoes @ work @WeakTwip @Munnn...
4169,neg,msf,"#Microsoft, #Adobe lose $13.5bn to piracy: Rep..."
4170,neu,twt,"I tried to explain why you would do """"The #Twi..."
4171,neg,apl,Installed io5 - fine on ipad but wiped wife's ...


In [12]:
# X: the tweet text column, kept as a pandas Series.
# y: the string opinion labels, taken as the column's underlying array.
X, y = df["body"], df["opinion"].values

In [13]:
# Integer code -> label name: a label's position in this list is its code.
int_to_class = ["irr", "neg", "neu", "pos"]
# Label name -> integer code, built from the list so the two mappings always agree.
class_to_int = {label: code for code, label in enumerate(int_to_class)}

In [14]:
# Preview: translate every string label in y to its integer code via class_to_int.
np.array([class_to_int[label] for label in y])

array([2, 0, 2, ..., 2, 1, 1])

In [15]:
# Integer-encoded copy of the labels (one code per entry of y, looked up in class_to_int).
# NOTE(review): no later cell visible here reads y_int; the models below are fit on y.
y_int = np.array([class_to_int[label] for label in y])

In [8]:
# Baseline: XGBClassifier with its default constructor arguments on 1- to 3-gram counts.
# The filter below ignores every Python warning and stays active for the rest of the
# session, not just for this cell.
warnings.simplefilter("ignore")

xgb = XGBClassifier()

# Text clean-up and vectorisation steps, applied in this order before the classifier.
text_steps = [
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    (
        "count",
        CountVectorizer(
            analyzer="word",
            tokenizer=split_punctuation,
            ngram_range=(1, 3),
        ),
    ),
]
pipe = Pipeline(text_steps + [("clf", xgb)])

# Fit on the full data set and report the score on that same data.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
train_accuracy = predictor.score(X, y)
print(train_accuracy, " accuracy on train set")

# 5-fold cross-validation: per-fold scores first, then their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8660436137071651  accuracy on train set
fit
fit
fit
fit
fit
[0.74251497 0.73892216 0.76167665 0.74940048 0.76139089]
0.7507810278723129 8.843002027263456e-05


## Grid search

In [9]:
# Show the classifier's repr to see the hyperparameter values it currently holds.
xgb

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=12, num_parallel_tree=1,
              objective='multi:softprob', random_state=0, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [25]:
# Search space for the "clf" step: three column-subsampling ratios and the tree depth,
# three candidate values each (3 * 3 * 3 * 3 = 81 combinations).
param_grid = {
    "clf__colsample_bylevel": [0.25, 0.5, 1],
    "clf__colsample_bynode": [0.25, 0.5, 1],
    "clf__colsample_bytree": [0.25, 0.5, 1],
    "clf__max_depth": [2, 4, 6],
}

# Exhaustive search over the whole pipeline, using GridSearchCV's default CV and scoring.
grid = GridSearchCV(estimator=pipe, param_grid=param_grid)

grid.fit(X, y)



GridSearchCV(estimator=Pipeline(steps=[('Lowercase',
                                        <tweet_sent_predictor.transformer.LowerCaseTransformer.LowerCaseTransformer object at 0x7f58f1270d90>),
                                       ('MentionFlagger',
                                        <tweet_sent_predictor.transformer.MentionFlagger.MentionFlagger object at 0x7f58f1270eb0>),
                                       ('URLFlagger',
                                        <tweet_sent_predictor.transformer.URLFlagger.URLFlagger object at 0x7f58f1270e20>...
                                                      n_estimators=100,
                                                      n_jobs=12,
                                                      num_parallel_tree=1,
                                                      objective='multi:softprob',
                                                      random_state=0,
                                                      reg_alpha=0, reg

In [27]:
# One row per evaluated parameter combination, best-ranked first.
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__colsample_bylevel,param_clf__colsample_bynode,param_clf__colsample_bytree,param_clf__max_depth,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
80,6.456005,0.481552,0.367606,0.016228,1,1,1,6,"{'clf__colsample_bylevel': 1, 'clf__colsample_...",0.729341,0.702994,0.738922,0.736211,0.749400,0.731374,0.015588,1
14,10.817273,1.311023,0.379630,0.015227,0.25,0.5,0.5,6,"{'clf__colsample_bylevel': 0.25, 'clf__colsamp...",0.726946,0.705389,0.734132,0.732614,0.755396,0.730895,0.015999,2
2,7.119960,0.218045,0.378679,0.007118,0.25,0.25,0.25,6,"{'clf__colsample_bylevel': 0.25, 'clf__colsamp...",0.736527,0.704192,0.731737,0.731415,0.748201,0.730414,0.014451,3
56,9.378527,0.702233,0.393940,0.012150,1,0.25,0.25,6,"{'clf__colsample_bylevel': 1, 'clf__colsample_...",0.725749,0.701796,0.729341,0.739808,0.752998,0.729938,0.016965,4
20,14.436944,0.323014,0.392527,0.007892,0.25,1,0.25,6,"{'clf__colsample_bylevel': 0.25, 'clf__colsamp...",0.725749,0.701796,0.729341,0.739808,0.752998,0.729938,0.016965,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
27,8.899510,0.585036,0.371853,0.006007,0.5,0.25,0.25,2,"{'clf__colsample_bylevel': 0.5, 'clf__colsampl...",0.700599,0.668263,0.695808,0.702638,0.707434,0.694949,0.013855,77
69,7.798388,0.306615,0.383284,0.007536,1,0.5,1,2,"{'clf__colsample_bylevel': 1, 'clf__colsample_...",0.688623,0.669461,0.699401,0.696643,0.715827,0.693991,0.015124,78
51,6.913999,0.195322,0.372971,0.006647,0.5,1,1,2,"{'clf__colsample_bylevel': 0.5, 'clf__colsampl...",0.688623,0.669461,0.699401,0.696643,0.715827,0.693991,0.015124,78
72,5.638862,0.458315,0.383803,0.004718,1,1,0.25,2,"{'clf__colsample_bylevel': 1, 'clf__colsample_...",0.695808,0.665868,0.702994,0.691847,0.713429,0.693989,0.015860,80


In [16]:
# Count-based pipeline with the column-sampling ratios and max_depth passed explicitly
# (all ratios 1, depth 6).
# The filter below ignores every Python warning and stays active for the rest of the session.
warnings.simplefilter("ignore")

xgb_params = dict(
    colsample_bytree=1,
    colsample_bynode=1,
    colsample_bylevel=1,
    max_depth=6,
)
xgb = XGBClassifier(**xgb_params)

# Text clean-up and vectorisation steps, applied in this order before the classifier.
text_steps = [
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    (
        "count",
        CountVectorizer(
            analyzer="word",
            tokenizer=split_punctuation,
            ngram_range=(1, 3),
        ),
    ),
]
pipe = Pipeline(text_steps + [("clf", xgb)])

# Fit on the full data set and report the score on that same data.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
train_accuracy = predictor.score(X, y)
print(train_accuracy, " accuracy on train set")

# 5-fold cross-validation: per-fold scores first, then their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8660436137071651  accuracy on train set
fit
fit
fit
fit
fit
[0.74251497 0.73892216 0.76167665 0.74940048 0.76139089]
0.7507810278723129 8.843002027263456e-05


In [20]:
# Variant: the n-gram counts are re-weighted by a TfidfTransformer before the classifier.
# XGBoost arguments: all column-sampling ratios 1, max_depth 6.
# The filter below ignores every Python warning and stays active for the rest of the session.
warnings.simplefilter("ignore")

xgb_params = dict(
    colsample_bytree=1,
    colsample_bynode=1,
    colsample_bylevel=1,
    max_depth=6,
)
xgb = XGBClassifier(**xgb_params)

# Text clean-up, vectorisation and tf-idf weighting, applied in this order.
text_steps = [
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    (
        "count",
        CountVectorizer(
            analyzer="word",
            tokenizer=split_punctuation,
            ngram_range=(1, 3),
        ),
    ),
    ("tfidf", TfidfTransformer()),
]
pipe = Pipeline(text_steps + [("clf", xgb)])

# Fit on the full data set and report the score on that same data.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
train_accuracy = predictor.score(X, y)
print(train_accuracy, " accuracy on train set")

# 5-fold cross-validation: per-fold scores first, then their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)

print(scores.mean(), scores.var())

fit
0.8986340762041697  accuracy on train set
fit
fit
fit
fit
fit
[0.70898204 0.71137725 0.74610778 0.72182254 0.74100719]
0.725859360415859 0.00023006543640781836


In [21]:
# Variant: tf-idf weighted n-grams with a larger ensemble (n_estimators=500).
# Other XGBoost arguments: all column-sampling ratios 1, max_depth 6.
# The filter below ignores every Python warning and stays active for the rest of the session.
warnings.simplefilter("ignore")

xgb_params = dict(
    colsample_bytree=1,
    colsample_bynode=1,
    colsample_bylevel=1,
    max_depth=6,
    n_estimators=500,
)
xgb = XGBClassifier(**xgb_params)

# Text clean-up, vectorisation and tf-idf weighting, applied in this order.
text_steps = [
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    (
        "count",
        CountVectorizer(
            analyzer="word",
            tokenizer=split_punctuation,
            ngram_range=(1, 3),
        ),
    ),
    ("tfidf", TfidfTransformer()),
]
pipe = Pipeline(text_steps + [("clf", xgb)])

# Fit on the full data set and report the score on that same data.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
train_accuracy = predictor.score(X, y)
print(train_accuracy, " accuracy on train set")

# 5-fold cross-validation: per-fold scores first, then their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)

print(scores.mean(), scores.var())

fit
0.9422477833692787  accuracy on train set
fit
fit
fit
fit
fit
[0.70179641 0.69820359 0.73772455 0.71103118 0.73021583]
0.715794310659257 0.00024339553938993382
