In [1]:
import pandas as pd
import numpy as np
import re
import time

import bs4 as bs4
import json

import glob
import tqdm

pd.set_option("max.columns", 131)

#https://strftime.org/
%matplotlib inline
%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
df = pd.read_excel('novos dados.xlsx')

In [3]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [4]:
df_limpo = pd.DataFrame(index = df.index)
df_limpo['title'] = df['title']

In [5]:
mask = ['classification','reviews']
treinar = df[mask]
treinar

Unnamed: 0,classification,reviews
0,4.1,32
1,4.4,3
2,4.2,18
3,4.7,17
4,4.3,108
...,...,...
669,3.1,12
670,3.0,1
671,3.0,2
672,2.9,5


In [6]:
y = df['target']

## 3. Features

In [7]:
# mask_train = df_limpo['date'] < "2019-04-01"
# mask_val = (df_limpo['date'] >= "2019-04-01")

title_train = df_limpo[:300]['title']
title_val = df_limpo[300:]['title']

# Xtrain, Xval = features[mask_train], features[mask_val]
# ytrain, yval = y[mask_train], y[mask_val]
Xtrain, Xval = treinar[:300], treinar[300:]
ytrain, yval = y[:300], y[300:]
Xtrain.shape, Xval.shape, ytrain.shape, yval.shape

((300, 2), (374, 2), (300,), (374,))

In [27]:
from sklearn.feature_extraction.text import TfidfVectorizer

# title_train = df_limpo[mask_train]['title']
# title_val = df_limpo[mask_val]['title']
title_train = df_limpo[:300]['title']
title_val = df_limpo[300:]['title']

title_vec = TfidfVectorizer(min_df=4, ngram_range=(1,5))
title_bow_train = title_vec.fit_transform(title_train)
title_bow_val = title_vec.transform(title_val)


In [28]:
title_bow_train.shape

(300, 435)

In [29]:
from scipy.sparse import hstack, vstack

Xtrain_wtitle = hstack([Xtrain, title_bow_train])
Xval_wtitle = hstack([Xval, title_bow_val])

In [30]:
Xtrain_wtitle.shape, Xval_wtitle.shape

((300, 437), (374, 437))

# 4 Random Forest

In [31]:
mdl = RandomForestClassifier(n_estimators=1000, random_state=0, min_samples_leaf=1, class_weight="balanced", n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)

RandomForestClassifier(class_weight='balanced', n_estimators=1000, n_jobs=6,
                       random_state=0)

In [32]:
p = mdl.predict_proba(Xval_wtitle)[:, 1]

In [33]:
from sklearn.metrics import roc_auc_score, average_precision_score

In [34]:
average_precision_score(yval, p)

0.30155719798566594

In [35]:
roc_auc_score(yval, p)

0.4908476485497043

# 5 LightGBM

In [17]:
from lightgbm import LGBMClassifier

In [36]:
mdl = LGBMClassifier(learning_rate= 0.012750003188347398, num_leaves= 16, max_depth=6, 
                         min_child_samples=7, subsample=0.6283707044270589,
                         colsample_bytree=0.19713289098659126, bagging_freq=1,n_estimators=685, random_state=0, 
                         class_weight="balanced", n_jobs=6)
mdl.fit(Xtrain_wtitle, ytrain)

LGBMClassifier(bagging_freq=1, class_weight='balanced',
               colsample_bytree=0.19713289098659126,
               learning_rate=0.012750003188347398, max_depth=6,
               min_child_samples=7, n_estimators=685, n_jobs=6, num_leaves=16,
               random_state=0, subsample=0.6283707044270589)

In [37]:
p = mdl.predict_proba(Xval_wtitle)[:, 1]



In [26]:
space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 15), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          (1,5), # min_df
          (1,5)] # ngram_range
res.x

[0.012750003188347398,
 6,
 7,
 0.6283707044270589,
 0.19713289098659126,
 685,
 4,
 5]

In [None]:
[0.012250750764764855, 8, 6, 0.5976582413192033, 0.2474882432951916, 516, 4, 4]

In [38]:
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.3474028562435173, 0.48285694170656157)

In [None]:
(0.27455058995659876, 0.4762390875809631)

# 6 Bayesian Optimization

In [21]:
from skopt import forest_minimize

In [25]:
def tune_lgbm(params):
    print(params)
    lr = params[0]
    max_depth = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]
    
    min_df = params[6]
    ngram_range = (1, params[7])
    
    title_vec = TfidfVectorizer(min_df=min_df, ngram_range=ngram_range)
    title_bow_train = title_vec.fit_transform(title_train)
    title_bow_val = title_vec.transform(title_val)
    
    Xtrain_wtitle = hstack([Xtrain, title_bow_train])
    Xval_wtitle = hstack([Xval, title_bow_val])
    
    mdl = LGBMClassifier(learning_rate=lr, num_leaves=2 ** max_depth, max_depth=max_depth, 
                         min_child_samples=min_child_samples, subsample=subsample,
                         colsample_bytree=colsample_bytree, bagging_freq=1,n_estimators=n_estimators, random_state=0, 
                         class_weight="balanced", n_jobs=6)
    mdl.fit(Xtrain_wtitle, ytrain)
    
    p = mdl.predict_proba(Xval_wtitle)[:, 1]
    
    print(roc_auc_score(yval, p))
    
    return -average_precision_score(yval, p)


space = [(1e-3, 1e-1, 'log-uniform'), # lr
          (1, 15), # max_depth
          (1, 20), # min_child_samples
          (0.05, 1.), # subsample
          (0.05, 1.), # colsample_bytree
          (100,1000), # n_estimators
          (1,5), # min_df
          (1,5)] # ngram_range

res = forest_minimize(tune_lgbm, space, random_state=160745, n_random_starts=20, n_calls=50, verbose=1)

Iteration No: 1 started. Evaluating function at random point.
[0.009944912110647982, 5, 1, 0.4677107511929402, 0.49263223036174764, 272, 3, 1]




0.4761686848774993
Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.2922
Function value obtained: -0.3102
Current minimum: -0.3102
Iteration No: 2 started. Evaluating function at random point.
[0.053887464791860025, 1, 15, 0.7437489153990157, 0.8675167974293533, 549, 3, 4]
0.4841241903689102
Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1895
Function value obtained: -0.3010
Current minimum: -0.3102
Iteration No: 3 started. Evaluating function at random point.
[0.004151454520895999, 6, 20, 0.8682075103820793, 0.9491436163200662, 411, 4, 3]




0.48496902281047605
Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.2264
Function value obtained: -0.2806
Current minimum: -0.3102
Iteration No: 4 started. Evaluating function at random point.
[0.0014099928811969545, 9, 9, 0.6502182010234373, 0.6866210554187129, 828, 5, 2]




0.45198535623767955
Iteration No: 4 ended. Evaluation done at random point.
Time taken: 0.5366
Function value obtained: -0.2719
Current minimum: -0.3102
Iteration No: 5 started. Evaluating function at random point.
[0.08530558241838007, 8, 19, 0.2137736299768322, 0.1313765544201984, 961, 4, 1]
0.501126443255421
Iteration No: 5 ended. Evaluation done at random point.
Time taken: 0.1366
Function value obtained: -0.3071
Current minimum: -0.3102
Iteration No: 6 started. Evaluating function at random point.
[0.003567949451535685, 13, 19, 0.7232951768944309, 0.7298538828427115, 939, 4, 3]




0.4897212052942833
Iteration No: 6 ended. Evaluation done at random point.
Time taken: 2.4634
Function value obtained: -0.2863
Current minimum: -0.3102
Iteration No: 7 started. Evaluating function at random point.
[0.014828577273549474, 7, 1, 0.18428087097824575, 0.3261556557915816, 274, 1, 2]




0.4678435651929035
Iteration No: 7 ended. Evaluation done at random point.
Time taken: 0.5146
Function value obtained: -0.2815
Current minimum: -0.3102
Iteration No: 8 started. Evaluating function at random point.
[0.0015212976972079912, 14, 3, 0.8183084505971293, 0.7859673038076707, 189, 5, 3]




0.4657842861165869
Iteration No: 8 ended. Evaluation done at random point.
Time taken: 5.9719
Function value obtained: -0.2842
Current minimum: -0.3102
Iteration No: 9 started. Evaluating function at random point.
[0.009565866803971352, 6, 18, 0.5235636153223084, 0.6728679300083596, 747, 4, 5]




0.5046113770768798
Iteration No: 9 ended. Evaluation done at random point.
Time taken: 0.2872
Function value obtained: -0.2970
Current minimum: -0.3102
Iteration No: 10 started. Evaluating function at random point.
[0.0012116790683302117, 3, 2, 0.06616307483844217, 0.23025600705315752, 677, 2, 5]




0.49633905941988166
Iteration No: 10 ended. Evaluation done at random point.
Time taken: 0.4646
Function value obtained: -0.2944
Current minimum: -0.3102
Iteration No: 11 started. Evaluating function at random point.
[0.0053139776214487944, 6, 9, 0.14251441334450304, 0.8175761405215897, 297, 1, 5]




0.4928013235708251
Iteration No: 11 ended. Evaluation done at random point.
Time taken: 0.3112
Function value obtained: -0.2859
Current minimum: -0.3102
Iteration No: 12 started. Evaluating function at random point.
[0.0068572961982704935, 10, 5, 0.2390386584472456, 0.49053406102209746, 176, 2, 4]




0.4564559279076317
Iteration No: 12 ended. Evaluation done at random point.
Time taken: 0.2230
Function value obtained: -0.2771
Current minimum: -0.3102
Iteration No: 13 started. Evaluating function at random point.
[0.00781968225875022, 3, 4, 0.7078936710077383, 0.31818755505678337, 275, 4, 4]




0.46071529146719237
Iteration No: 13 ended. Evaluation done at random point.
Time taken: 0.1811
Function value obtained: -0.2997
Current minimum: -0.3102
Iteration No: 14 started. Evaluating function at random point.
[0.017293945600511968, 2, 15, 0.9007557574888567, 0.41026441194439994, 316, 5, 1]
0.505632216277105
Iteration No: 14 ended. Evaluation done at random point.
Time taken: 0.1097
Function value obtained: -0.3076
Current minimum: -0.3102
Iteration No: 15 started. Evaluating function at random point.
[0.012250750764764855, 8, 6, 0.5976582413192033, 0.2474882432951916, 516, 4, 4]




0.4989087580963109
Iteration No: 15 ended. Evaluation done at random point.
Time taken: 0.4142
Function value obtained: -0.3394
Current minimum: -0.3394
Iteration No: 16 started. Evaluating function at random point.
[0.018353598126553926, 4, 3, 0.47305622526323254, 0.1404164811277527, 133, 4, 1]
0.48570825119684596
Iteration No: 16 ended. Evaluation done at random point.
Time taken: 0.0698
Function value obtained: -0.2883
Current minimum: -0.3394
Iteration No: 17 started. Evaluating function at random point.
[0.0010383234748454694, 15, 19, 0.9256771571832196, 0.9321438677645206, 312, 4, 3]




0.4816072937200788
Iteration No: 17 ended. Evaluation done at random point.
Time taken: 6.5983
Function value obtained: -0.2692
Current minimum: -0.3394
Iteration No: 18 started. Evaluating function at random point.
[0.004955229758078229, 5, 5, 0.06939551310802591, 0.4193273080472823, 725, 4, 1]
0.46226415094339623
Iteration No: 18 ended. Evaluation done at random point.
Time taken: 0.1137
Function value obtained: -0.2702
Current minimum: -0.3394
Iteration No: 19 started. Evaluating function at random point.
[0.0699516121742407, 11, 9, 0.22351444794819092, 0.9946871410890346, 947, 5, 1]




0.49760630808223033
Iteration No: 19 ended. Evaluation done at random point.
Time taken: 0.5176
Function value obtained: -0.3026
Current minimum: -0.3394
Iteration No: 20 started. Evaluating function at random point.
[0.004955402904180171, 6, 1, 0.13021457554920057, 0.6158804906347372, 615, 3, 4]




0.4724021402421853
Iteration No: 20 ended. Evaluation done at random point.
Time taken: 1.0655
Function value obtained: -0.2866
Current minimum: -0.3394
Iteration No: 21 started. Searching for the next optimal point.
[0.012750003188347398, 6, 7, 0.6283707044270589, 0.19713289098659126, 685, 4, 5]




0.482680934947902
Iteration No: 21 ended. Search finished for the next optimal point.
Time taken: 0.5217
Function value obtained: -0.3453
Current minimum: -0.3453
Iteration No: 22 started. Searching for the next optimal point.
[0.013612240408179753, 5, 9, 0.664231009869041, 0.05974943295881339, 588, 3, 5]
0.4848282174035483




Iteration No: 22 ended. Search finished for the next optimal point.
Time taken: 0.4259
Function value obtained: -0.3150
Current minimum: -0.3453
Iteration No: 23 started. Searching for the next optimal point.
[0.011741343334192094, 10, 20, 0.7088267828380246, 0.23351519448516372, 739, 5, 5]




0.48686989580399886
Iteration No: 23 ended. Search finished for the next optimal point.
Time taken: 0.5535
Function value obtained: -0.3013
Current minimum: -0.3453
Iteration No: 24 started. Searching for the next optimal point.
[0.01515306653009862, 12, 6, 0.6089444963634667, 0.22716420884492683, 508, 1, 5]




0.46342579555054914
Iteration No: 24 ended. Search finished for the next optimal point.
Time taken: 1.3474
Function value obtained: -0.3057
Current minimum: -0.3453
Iteration No: 25 started. Searching for the next optimal point.
[0.017499638779439646, 8, 2, 0.6051120374126752, 0.22550984815909336, 885, 4, 4]




0.48542664038299066
Iteration No: 25 ended. Search finished for the next optimal point.
Time taken: 2.5056
Function value obtained: -0.3153
Current minimum: -0.3453
Iteration No: 26 started. Searching for the next optimal point.
[0.012938378829290699, 8, 6, 0.6534749237555699, 0.25433080123827206, 661, 4, 4]




0.4940157702055759
Iteration No: 26 ended. Search finished for the next optimal point.
Time taken: 0.6224
Function value obtained: -0.3341
Current minimum: -0.3453
Iteration No: 27 started. Searching for the next optimal point.
[0.010633272286736882, 9, 5, 0.6054564364679259, 0.24274063742635, 812, 4, 4]




0.4955646296817798
Iteration No: 27 ended. Search finished for the next optimal point.
Time taken: 0.8677
Function value obtained: -0.3210
Current minimum: -0.3453
Iteration No: 28 started. Searching for the next optimal point.
[0.012023986140826547, 15, 7, 0.5875564639420233, 0.2645366973861971, 286, 5, 4]




0.47423261053224447
Iteration No: 28 ended. Search finished for the next optimal point.
Time taken: 10.6194
Function value obtained: -0.3234
Current minimum: -0.3453
Iteration No: 29 started. Searching for the next optimal point.
[0.002639119770534989, 3, 7, 0.6571865685903414, 0.1523233552548061, 559, 5, 4]
0.4650802590819487




Iteration No: 29 ended. Search finished for the next optimal point.
Time taken: 0.5127
Function value obtained: -0.2849
Current minimum: -0.3453
Iteration No: 30 started. Searching for the next optimal point.
[0.08039047602339458, 6, 7, 0.3840130098103599, 0.2566651826928838, 472, 4, 4]




0.513200506899465
Iteration No: 30 ended. Search finished for the next optimal point.
Time taken: 0.5725
Function value obtained: -0.3315
Current minimum: -0.3453
Iteration No: 31 started. Searching for the next optimal point.
[0.012664209071636902, 7, 7, 0.6230784058315155, 0.14257257386478256, 618, 4, 4]




0.5057026189805689
Iteration No: 31 ended. Search finished for the next optimal point.
Time taken: 0.7291
Function value obtained: -0.3449
Current minimum: -0.3453
Iteration No: 32 started. Searching for the next optimal point.
[0.012408979766496776, 9, 6, 0.6156061343459918, 0.09124092035717463, 616, 4, 3]




0.47761194029850745
Iteration No: 32 ended. Search finished for the next optimal point.
Time taken: 0.9859
Function value obtained: -0.3148
Current minimum: -0.3453
Iteration No: 33 started. Searching for the next optimal point.
[0.007709874083894978, 6, 7, 0.4411889587294485, 0.19800962972700487, 826, 4, 4]




0.5084483244156576
Iteration No: 33 ended. Search finished for the next optimal point.
Time taken: 0.9022
Function value obtained: -0.3299
Current minimum: -0.3453
Iteration No: 34 started. Searching for the next optimal point.
[0.005214781933722956, 7, 6, 0.7538763542454772, 0.2532910571141877, 638, 4, 4]




0.46676992396508027
Iteration No: 34 ended. Search finished for the next optimal point.
Time taken: 0.9390
Function value obtained: -0.3295
Current minimum: -0.3453
Iteration No: 35 started. Searching for the next optimal point.
[0.007177661502445641, 8, 8, 0.3649637067706078, 0.22810607332174954, 692, 4, 4]




0.4974303013235708
Iteration No: 35 ended. Search finished for the next optimal point.
Time taken: 0.8759
Function value obtained: -0.3136
Current minimum: -0.3453
Iteration No: 36 started. Searching for the next optimal point.
[0.023791168458334386, 5, 6, 0.8033084757027618, 0.1503248871437941, 658, 4, 4]




0.46339059419881723
Iteration No: 36 ended. Search finished for the next optimal point.
Time taken: 0.7155
Function value obtained: -0.3108
Current minimum: -0.3453
Iteration No: 37 started. Searching for the next optimal point.
[0.012869285479458327, 6, 9, 0.9839860401672411, 0.3108326957186079, 516, 4, 4]




0.45712475359053784
Iteration No: 37 ended. Search finished for the next optimal point.
Time taken: 0.6293
Function value obtained: -0.3036
Current minimum: -0.3453
Iteration No: 38 started. Searching for the next optimal point.
[0.012201329277191314, 7, 7, 0.6487216908641696, 0.1784244745007053, 467, 1, 5]




0.4577583779217122
Iteration No: 38 ended. Search finished for the next optimal point.
Time taken: 0.7303
Function value obtained: -0.2838
Current minimum: -0.3453
Iteration No: 39 started. Searching for the next optimal point.
[0.025780277655696956, 8, 7, 0.06575113078969208, 0.08945927107001195, 654, 4, 4]
0.48109687411996616




Iteration No: 39 ended. Search finished for the next optimal point.
Time taken: 0.6009
Function value obtained: -0.2807
Current minimum: -0.3453
Iteration No: 40 started. Searching for the next optimal point.
[0.013675683648852405, 7, 1, 0.4123035781751848, 0.19482031406441402, 648, 4, 5]




0.5066178541255983
Iteration No: 40 ended. Search finished for the next optimal point.
Time taken: 1.4289
Function value obtained: -0.3250
Current minimum: -0.3453
Iteration No: 41 started. Searching for the next optimal point.
[0.0010355115431307895, 7, 14, 0.6004108713881031, 0.17103524042617924, 698, 4, 5]




0.48528583497606304
Iteration No: 41 ended. Search finished for the next optimal point.
Time taken: 0.7112
Function value obtained: -0.2985
Current minimum: -0.3453
Iteration No: 42 started. Searching for the next optimal point.
[0.09499420678873671, 15, 7, 0.6016414642239297, 0.2536508358433761, 136, 4, 5]




0.49915516755843425
Iteration No: 42 ended. Search finished for the next optimal point.
Time taken: 18.2115
Function value obtained: -0.3114
Current minimum: -0.3453
Iteration No: 43 started. Searching for the next optimal point.
[0.013300550258160548, 7, 5, 0.44301498242423987, 0.1475138901560187, 515, 5, 4]




0.4800056322162771
Iteration No: 43 ended. Search finished for the next optimal point.
Time taken: 0.5276
Function value obtained: -0.3018
Current minimum: -0.3453
Iteration No: 44 started. Searching for the next optimal point.
[0.010473894158335982, 1, 7, 0.4558484056183824, 0.06891402787544804, 994, 4, 5]
0.4749366375668826




Iteration No: 44 ended. Search finished for the next optimal point.
Time taken: 0.4747
Function value obtained: -0.3024
Current minimum: -0.3453
Iteration No: 45 started. Searching for the next optimal point.
[0.007753240912978019, 7, 6, 0.6272100524599133, 0.1976076666114046, 712, 4, 4]




0.49017882286679804
Iteration No: 45 ended. Search finished for the next optimal point.
Time taken: 0.6333
Function value obtained: -0.3422
Current minimum: -0.3453
Iteration No: 46 started. Searching for the next optimal point.
[0.004238487200650871, 12, 6, 0.6304248575666457, 0.19892435214099619, 669, 4, 2]




0.45557589411433397
Iteration No: 46 ended. Search finished for the next optimal point.
Time taken: 2.1752
Function value obtained: -0.2904
Current minimum: -0.3453
Iteration No: 47 started. Searching for the next optimal point.
[0.003993517127674334, 7, 8, 0.551951071317732, 0.14612313770699514, 454, 4, 4]
0.49672627428893273




Iteration No: 47 ended. Search finished for the next optimal point.
Time taken: 0.4767
Function value obtained: -0.3202
Current minimum: -0.3453
Iteration No: 48 started. Searching for the next optimal point.
[0.004532209462740312, 6, 6, 0.6418817342823382, 0.15680633626269902, 631, 4, 4]




0.47990002816108135
Iteration No: 48 ended. Search finished for the next optimal point.
Time taken: 0.5755
Function value obtained: -0.3345
Current minimum: -0.3453
Iteration No: 49 started. Searching for the next optimal point.
[0.005919393762796261, 11, 7, 0.6404927945042223, 0.11618422994721038, 719, 4, 4]




0.4961278513094903
Iteration No: 49 ended. Search finished for the next optimal point.
Time taken: 1.0781
Function value obtained: -0.3213
Current minimum: -0.3453
Iteration No: 50 started. Searching for the next optimal point.
[0.0034175211209914918, 6, 4, 0.6357006149183271, 0.1453736165293555, 984, 4, 5]




0.4712756969867643
Iteration No: 50 ended. Search finished for the next optimal point.
Time taken: 1.0290
Function value obtained: -0.3308
Current minimum: -0.3453


# 7 Logistic Reg

In [39]:
from sklearn.preprocessing import MaxAbsScaler, StandardScaler
from scipy.sparse import csr_matrix

In [129]:
Xtrain_wtitle2 = csr_matrix(Xtrain_wtitle.copy())
Xval_wtitle2 = csr_matrix(Xval_wtitle.copy())

scaler = StandardScaler()
# scaler = MaxAbsScaler()


Xtrain_wtitle2[:, :2] = scaler.fit_transform(Xtrain_wtitle2[:, :2].todense())
Xval_wtitle2[:, :2] = scaler.transform(Xval_wtitle2[:, :2].todense())

# Xtrain_wtitle2 = scaler.fit_transform(Xtrain_wtitle2)
# Xval_wtitle2 = scaler.transform(Xval_wtitle2)

  self._set_arrayXarray(i, j, x)


In [130]:
Xval_wtitle2.shape

(374, 437)

In [131]:

mdl = LogisticRegression(C=0.0001,n_jobs=6, random_state=0)
mdl.fit(Xtrain_wtitle2, ytrain)

LogisticRegression(C=0.0001, n_jobs=6, random_state=0)

In [132]:
p = mdl.predict_proba(Xval_wtitle2)[:, 1]

In [133]:
average_precision_score(yval, p), roc_auc_score(yval, p)

(0.33792352917966934, 0.5337580963108984)

In [None]:
(0.3238428572594492, 0.4953886229231203) - sem tuning, standardscaler

(0.2933647722524177, 0.46627710504083353) - sem tuning, maxabsscaler
(0.30539232757962287, 0.4772599267811884) - C=10, standardscaler

(0.27113551861312046, 0.4392424669107293) - C=10, maxabsscaler
(0.33792352917966934, 0.5337580963108984) - C=0.0001, standardscaler