In [41]:
from lightgbm import LGBMClassifier
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_score
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import warnings
import scipy
from sklearn.preprocessing import FunctionTransformer

from tweet_sent_predictor.transformer.LowerCaseTransformer import LowerCaseTransformer
from tweet_sent_predictor.transformer.MentionFlagger import MentionFlagger
from tweet_sent_predictor.transformer.NumberFlagger import NumberFlagger
from tweet_sent_predictor.transformer.SplitterPunctuation import SplitterPunctuation, split_punctuation
from tweet_sent_predictor.transformer.URLFlagger import URLFlagger
from tweet_sent_predictor.transformer.HashtagToWords import HashtagToWords
from sklearn.feature_extraction.text import CountVectorizer
from tweet_sent_predictor.transformer.StopWordFilter import StopWordFilter
from tweet_sent_predictor.predictor.SmartPredictor import SmartPredictor
from sklearn.feature_extraction.text import TfidfTransformer

In [2]:
# Load the labelled training tweets; the path is relative to the working directory.
df = pd.read_csv("tweet_sent_predictor/data/train_proper.csv")

In [3]:
# Display the loaded frame (columns shown in the output: opinion, brand, body).
df

Unnamed: 0,opinion,brand,body
0,neu,apl,20 min line @apple store @short pump.
1,irr,msf,Nueva tecnología convierte cualquier superfici...
2,neu,ggl,Some people should not post replies in #Google...
3,neg,apl,I know a few others having same issue RT @Joel...
4,neg,msf,"#Microsoft - We put the """"backwards"""" into bac..."
...,...,...,...
4168,neg,apl,fuck this see you hoes @ work @WeakTwip @Munnn...
4169,neg,msf,"#Microsoft, #Adobe lose $13.5bn to piracy: Rep..."
4170,neu,twt,"I tried to explain why you would do """"The #Twi..."
4171,neg,apl,Installed io5 - fine on ipad but wiped wife's ...


In [4]:
# Model input: the tweet text column, kept as a pandas Series.
X = df["body"]
# Target: the opinion labels as an array of strings ("irr", "neg", "neu", "pos" per the mapping below).
y = df["opinion"].values

In [5]:
# Label <-> integer-id lookups; a label's position in the list is its id,
# and the dict is derived from the list so the two cannot drift apart.
int_to_class = ["irr", "neg", "neu", "pos"]
class_to_int = {label: idx for idx, label in enumerate(int_to_class)}

In [6]:
# Preview of the integer-encoded labels (raises KeyError on a label missing from class_to_int).
np.array([class_to_int[label] for label in y])

array([2, 0, 2, ..., 2, 1, 1])

In [7]:
# Integer-encoded copy of the labels, one id per row of y.
y_int = np.array([class_to_int[label] for label in y])

In [10]:
# Fit and apply every pipeline step except the last one to the raw tweets.
# NOTE(review): within the visible notebook `pipe` is first assigned in a later cell
# (this cell's execution count is In [10]); unless an earlier cell outside this view
# defines it, this line raises NameError under Restart Kernel -> Run All.
# `X_tsf` is not read by any later visible cell.
X_tsf = pipe[:-1].fit_transform(X)

In [29]:
# Inspect the hyper-parameters of the current `lgb` estimator.
# NOTE(review): within the visible notebook `lgb` is first assigned in a later cell
# (this cell's execution count is In [29]); on a fresh top-to-bottom run this fails
# with NameError unless `lgb` is defined before this point.
lgb.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 31,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [43]:
# Baseline: n-gram counts fed to a LightGBM classifier with default hyper-parameters.
warnings.simplefilter("ignore")

lgb = LGBMClassifier()

# Project-specific text transformers, then 1- to 3-gram counts cast to float, then the classifier.
pipe = Pipeline(steps=[
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    (
        "count",
        CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)),
    ),
    (
        "as float",
        FunctionTransformer(scipy.sparse.csr_matrix.astype, kw_args={"dtype" : "float"}),
    ),
    ("clf", lgb),
])

# Fit on the full training set and report the (optimistic) training accuracy.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
train_accuracy = predictor.score(X, y)
print(train_accuracy, " accuracy on train set")

# 5-fold cross-validated scores, followed by their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8461538461538461  accuracy on train set
fit
fit
fit
fit
fit
[0.71137725 0.69700599 0.71377246 0.72302158 0.73501199]
0.7160378523528482 0.00015957110914070527


Using TF-IDF

In [44]:
# Same default LightGBM classifier as the baseline, with a TF-IDF re-weighting step after the counts.
warnings.simplefilter("ignore")

lgb = LGBMClassifier()

# Project-specific text transformers, 1- to 3-gram counts cast to float, TF-IDF, then the classifier.
pipe = Pipeline(steps=[
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    (
        "count",
        CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)),
    ),
    (
        "as float",
        FunctionTransformer(scipy.sparse.csr_matrix.astype, kw_args={"dtype" : "float"}),
    ),
    ("tfidf", TfidfTransformer()),
    ("clf", lgb),
])

# Fit on the full training set and report the (optimistic) training accuracy.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
train_accuracy = predictor.score(X, y)
print(train_accuracy, " accuracy on train set")

# 5-fold cross-validated scores, followed by their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.9103762281332375  accuracy on train set
fit
fit
fit
fit
fit
[0.68742515 0.66706587 0.69221557 0.70743405 0.69904077]
0.6906362813940465 0.00018423407967701658


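TF-IDF increases fitting power (train accuracy rises from 0.85 to 0.91) but also increases variance: mean cross-validated accuracy drops from 0.72 to 0.69.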

## Parameter optimisation 
### num_leaves

In [47]:
# Sweep the classifier's num_leaves over 5, 8, ..., 47 with 5-fold CV on the current `pipe`.
num_leaves_candidates = np.arange(5, 50, 3)
grid = GridSearchCV(
    estimator=pipe,
    param_grid={"clf__num_leaves": num_leaves_candidates},
    cv=5,
    n_jobs=4,
    return_train_score=True,
)

grid.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Lowercase',
                                        <tweet_sent_predictor.transformer.LowerCaseTransformer.LowerCaseTransformer object at 0x7ff732e9e160>),
                                       ('MentionFlagger',
                                        <tweet_sent_predictor.transformer.MentionFlagger.MentionFlagger object at 0x7ff732e9e400>),
                                       ('URLFlagger',
                                        <tweet_sent_predictor.transformer.URLFlagger.URLFlagger object at 0x7ff732e9...
                                        CountVectorizer(ngram_range=(1, 3),
                                                        tokenizer=<function split_punctuation at 0x7ff738dfa550>)),
                                       ('as float',
                                        FunctionTransformer(func=<function _data_matrix.astype at 0x7ff7cc1adca0>,
                                                            k

In [48]:
# Tabulate the grid-search results, best-ranked test score first.
cv_results = pd.DataFrame(grid.cv_results_)
cv_results.sort_values(by="rank_test_score")

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_clf__num_leaves,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score,split0_train_score,split1_train_score,split2_train_score,split3_train_score,split4_train_score,mean_train_score,std_train_score
2,1.952435,0.080353,0.379218,0.043797,11,{'clf__num_leaves': 11},0.643114,0.650299,0.632335,0.653477,0.664269,0.648699,0.010647,1,0.805872,0.809766,0.804973,0.802935,0.807727,0.806255,0.002337
1,1.78481,0.035409,0.368066,0.051498,8,{'clf__num_leaves': 8},0.640719,0.645509,0.632335,0.651079,0.666667,0.647262,0.011487,2,0.772918,0.77861,0.773517,0.778676,0.77059,0.774862,0.003238
0,1.788743,0.06799,0.341598,0.025448,5,{'clf__num_leaves': 5},0.638323,0.644311,0.622754,0.657074,0.665468,0.645586,0.014852,3,0.737867,0.730377,0.731276,0.740341,0.733154,0.734603,0.003863
3,2.016796,0.029882,0.382854,0.050959,14,{'clf__num_leaves': 14},0.641916,0.650299,0.62515,0.653477,0.653477,0.644864,0.010726,4,0.834032,0.834931,0.832834,0.830788,0.838574,0.834232,0.002577
6,2.460396,0.164457,0.370567,0.043145,23,{'clf__num_leaves': 23},0.637126,0.635928,0.627545,0.640288,0.666667,0.641511,0.013265,5,0.890953,0.89305,0.896046,0.891884,0.896077,0.893602,0.002115
4,2.094863,0.088741,0.354592,0.018818,17,{'clf__num_leaves': 17},0.634731,0.638323,0.627545,0.63789,0.66307,0.640312,0.012016,6,0.860695,0.855003,0.858298,0.855346,0.859239,0.857716,0.002214
11,2.516994,0.093259,0.354272,0.016437,38,{'clf__num_leaves': 38},0.62515,0.62994,0.628743,0.639089,0.67506,0.639596,0.018318,7,0.948772,0.949371,0.95027,0.953878,0.95298,0.951054,0.002017
14,2.90493,0.086012,0.382872,0.043671,47,{'clf__num_leaves': 47},0.627545,0.62994,0.635928,0.645084,0.657074,0.639114,0.010829,8,0.970342,0.968544,0.965848,0.973046,0.973645,0.970285,0.002886
5,2.132583,0.080779,0.357273,0.009311,20,{'clf__num_leaves': 20},0.62515,0.639521,0.623952,0.636691,0.669065,0.638876,0.016295,9,0.876273,0.870581,0.875075,0.875412,0.880803,0.875629,0.003256
8,2.414247,0.144314,0.367917,0.025168,29,{'clf__num_leaves': 29},0.62515,0.634731,0.628743,0.63789,0.667866,0.638876,0.015164,9,0.919113,0.918514,0.923907,0.920635,0.921533,0.92074,0.001911


In [49]:
# Best-ranked value in the num_leaves grid search above (mean CV accuracy 0.6487);
# copied by hand, so update it if that search is re-run with different results.
num_leaves=11

### min_data_in_leaf

In [50]:
# Rebuild the TF-IDF pipeline with the selected num_leaves, then sweep min_data_in_leaf.
lgb = LGBMClassifier(num_leaves=num_leaves)
pipe = Pipeline(steps=[
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    (
        "count",
        CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3)),
    ),
    (
        "as float",
        FunctionTransformer(scipy.sparse.csr_matrix.astype, kw_args={"dtype" : "float"}),
    ),
    ("tfidf", TfidfTransformer()),
    ("clf", lgb),
])

# `f` names the classifier parameter being tuned; the results cell below reads it as well.
f = "min_data_in_leaf"
leaf_size_candidates = np.arange(1, 100, 5)  # 1, 6, 11, ..., 96
grid = GridSearchCV(
    estimator=pipe,
    param_grid={f"clf__{f}": leaf_size_candidates},
    cv=5,
    n_jobs=4,
    return_train_score=True,
)

grid.fit(X, y)

GridSearchCV(cv=5,
             estimator=Pipeline(steps=[('Lowercase',
                                        <tweet_sent_predictor.transformer.LowerCaseTransformer.LowerCaseTransformer object at 0x7ff7c7844910>),
                                       ('MentionFlagger',
                                        <tweet_sent_predictor.transformer.MentionFlagger.MentionFlagger object at 0x7ff7c7844bb0>),
                                       ('URLFlagger',
                                        <tweet_sent_predictor.transformer.URLFlagger.URLFlagger object at 0x7ff7c784...
                                                        tokenizer=<function split_punctuation at 0x7ff738dfa550>)),
                                       ('as float',
                                        FunctionTransformer(func=<function _data_matrix.astype at 0x7ff7cc1adca0>,
                                                            kw_args={'dtype': 'float'})),
                                       ('tfidf'

In [52]:
# Show only the mean test/train scores and the tuned parameter, best-ranked first.
summary_columns = ["mean_test_score", "mean_train_score", f"param_clf__{f}"]
pd.DataFrame(grid.cv_results_).sort_values(by="rank_test_score")[summary_columns]

Unnamed: 0,mean_test_score,mean_train_score,param_clf__min_data_in_leaf
0,0.704057,0.898155,1
1,0.703335,0.873353,6
2,0.686317,0.846394,11
3,0.660683,0.823089,16
4,0.641269,0.800863,21
5,0.630966,0.787443,26
6,0.624257,0.77642,31
7,0.616346,0.765636,36
11,0.606763,0.733226,56
8,0.605326,0.755152,41


In [53]:
# Hand-picked from the min_data_in_leaf grid-search table above.
# NOTE(review): 6 is the runner-up there (mean CV accuracy 0.7033 vs 0.7041 for 1);
# presumably preferred for its smaller train/test gap (0.873 vs 0.898 train) - confirm.
min_data_in_leaf = 6

In [54]:
# Inspect the hyper-parameters of the current `lgb` estimator
# (the output below lists num_leaves 11 and min_child_samples 20).
lgb.get_params()

{'boosting_type': 'gbdt',
 'class_weight': None,
 'colsample_bytree': 1.0,
 'importance_type': 'split',
 'learning_rate': 0.1,
 'max_depth': -1,
 'min_child_samples': 20,
 'min_child_weight': 0.001,
 'min_split_gain': 0.0,
 'n_estimators': 100,
 'n_jobs': -1,
 'num_leaves': 11,
 'objective': None,
 'random_state': None,
 'reg_alpha': 0.0,
 'reg_lambda': 0.0,
 'silent': True,
 'subsample': 1.0,
 'subsample_for_bin': 200000,
 'subsample_freq': 0}

In [55]:
# Tuned model (not the defaults): selected num_leaves / leaf size,
# learning_rate=0.05 with 200 boosting rounds.
warnings.simplefilter("ignore")

# `min_child_samples` is the scikit-learn-API name of LightGBM's `min_data_in_leaf`.
# Passing the canonical name avoids carrying both the alias and the constructor's
# default `min_child_samples` on the same estimator; the effective value is unchanged.
lgb = LGBMClassifier(
    num_leaves=num_leaves,
    min_child_samples=min_data_in_leaf,
    learning_rate=0.05,
    n_estimators=200,
)

# Project-specific text transformers, 1- to 3-gram counts cast to float, TF-IDF, then the classifier.
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("as float", FunctionTransformer(scipy.sparse.csr_matrix.astype, kw_args={"dtype" : "float"})),
    ("tfidf", TfidfTransformer()),
    ("clf", lgb)
])

# Fit on the full training set and report the (optimistic) training accuracy.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")

# 5-fold cross-validated scores, followed by their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8648454349388929  accuracy on train set
fit
fit
fit
fit
fit
[0.72694611 0.70658683 0.75329341 0.74100719 0.75179856]
0.7359264205402145 0.0003041618693464889


In [56]:
# Tuned model (not the defaults): selected num_leaves / leaf size,
# learning_rate=0.01 with 1000 boosting rounds.
warnings.simplefilter("ignore")

# `min_child_samples` is the scikit-learn-API name of LightGBM's `min_data_in_leaf`.
# Passing the canonical name avoids carrying both the alias and the constructor's
# default `min_child_samples` on the same estimator; the effective value is unchanged.
lgb = LGBMClassifier(
    num_leaves=num_leaves,
    min_child_samples=min_data_in_leaf,
    learning_rate=0.01,
    n_estimators=1000,
)

# Project-specific text transformers, 1- to 3-gram counts cast to float, TF-IDF, then the classifier.
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("as float", FunctionTransformer(scipy.sparse.csr_matrix.astype, kw_args={"dtype" : "float"})),
    ("tfidf", TfidfTransformer()),
    ("clf", lgb)
])

# Fit on the full training set and report the (optimistic) training accuracy.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")

# 5-fold cross-validated scores, followed by their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8638868919242751  accuracy on train set
fit
fit
fit
fit
fit
[0.72694611 0.71137725 0.75329341 0.73860911 0.74940048]
0.735925271758641 0.0002347335087400261


In [57]:
# Tuned model (not the defaults): selected num_leaves / leaf size,
# learning_rate=0.01 with 500 boosting rounds.
warnings.simplefilter("ignore")

# `min_child_samples` is the scikit-learn-API name of LightGBM's `min_data_in_leaf`.
# Passing the canonical name avoids carrying both the alias and the constructor's
# default `min_child_samples` on the same estimator; the effective value is unchanged.
lgb = LGBMClassifier(
    num_leaves=num_leaves,
    min_child_samples=min_data_in_leaf,
    learning_rate=0.01,
    n_estimators=500,
)

# Project-specific text transformers, 1- to 3-gram counts cast to float, TF-IDF, then the classifier.
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("as float", FunctionTransformer(scipy.sparse.csr_matrix.astype, kw_args={"dtype" : "float"})),
    ("tfidf", TfidfTransformer()),
    ("clf", lgb)
])

# Fit on the full training set and report the (optimistic) training accuracy.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")

# 5-fold cross-validated scores, followed by their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.813803019410496  accuracy on train set
fit
fit
fit
fit
fit
[0.72934132 0.71856287 0.75688623 0.74100719 0.75659472]
0.7404784675253808 0.0002267114425808423


In [58]:
# Tuned model (not the defaults): twice the selected num_leaves and leaf size,
# learning_rate=0.01 with 500 boosting rounds.
warnings.simplefilter("ignore")

# `min_child_samples` is the scikit-learn-API name of LightGBM's `min_data_in_leaf`.
# Passing the canonical name avoids carrying both the alias and the constructor's
# default `min_child_samples` on the same estimator; the effective value is unchanged.
lgb = LGBMClassifier(
    num_leaves=num_leaves*2,
    min_child_samples=min_data_in_leaf*2,
    learning_rate=0.01,
    n_estimators=500,
)

# Project-specific text transformers, 1- to 3-gram counts cast to float, TF-IDF, then the classifier.
pipe = Pipeline([
    ("Lowercase", LowerCaseTransformer()),
    ("MentionFlagger", MentionFlagger()),
    ("URLFlagger", URLFlagger()),
    ("NumberFlagger", NumberFlagger()),
    ("StopwordFilter", StopWordFilter()),
    ("HashtagToWords", HashtagToWords()),
    ("count", CountVectorizer(analyzer="word", tokenizer=split_punctuation, ngram_range=(1,3))),
    ("as float", FunctionTransformer(scipy.sparse.csr_matrix.astype, kw_args={"dtype" : "float"})),
    ("tfidf", TfidfTransformer()),
    ("clf", lgb)
])

# Fit on the full training set and report the (optimistic) training accuracy.
predictor = SmartPredictor(pipe=pipe)
predictor.fit(X, y)
print(predictor.score(X, y), " accuracy on train set")

# 5-fold cross-validated scores, followed by their mean and variance.
scores = cross_val_score(predictor, X, y, cv=5)
print(scores)
print(scores.mean(), scores.var())

fit
0.8490294751976994  accuracy on train set
fit
fit
fit
fit
fit
[0.72814371 0.7005988  0.73413174 0.73860911 0.74220624]
0.7287379198437657 0.0002200184467087074
