# Mismo modelo 10, pero con TF-IDF

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostClassifier
%matplotlib inline

# Se lee el .csv

In [2]:
# Load the labelled training set and print its schema (dtypes and non-null counts).
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
# Preview the first rows of the raw training frame.
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Load the unlabelled test set (it gets the same score columns as `train` further down).
test = pd.read_csv('csv/test.csv')

In [5]:
# Word2Vec: read only the `word2vec_score` column for the train and test splits.
word2vec_score = pd.read_csv(
    'csv/solo_embedding_word2vec_train.csv',
    usecols=['word2vec_score'],
)
word2vec_score_test = pd.read_csv(
    'csv/solo_embedding_word2vec_test.csv',
    usecols=['word2vec_score'],
)

In [6]:
# ELMo: read only the `elmo_score` column for the train and test splits.
elmo_score = pd.read_csv(
    'csv/solo_embedding_elmo_train.csv',
    usecols=['elmo_score'],
)
elmo_score_test = pd.read_csv(
    'csv/solo_embedding_elmo_test.csv',
    usecols=['elmo_score'],
)

In [7]:
# GloVe: read only the `glove_score` column for the train and test splits.
glove_score = pd.read_csv(
    'csv/solo_embedding_glove_train.csv',
    usecols=['glove_score'],
)
glove_score_test = pd.read_csv(
    'csv/solo_embedding_glove_test.csv',
    usecols=['glove_score'],
)

In [8]:
# BERT: read only the `bert_score` column for the train and test splits.
bert_score = pd.read_csv(
    'csv/solo_embedding_bert_train.csv',
    usecols=['bert_score'],
)
bert_score_test = pd.read_csv(
    'csv/solo_embedding_bert_test.csv',
    usecols=['bert_score'],
)

In [9]:
# Word2Vec network output: read only the `w2v_net` column for train and test.
word2vec_net = pd.read_csv(
    'csv/train_red_w2v.csv',
    usecols=['w2v_net'],
)
word2vec_net_test = pd.read_csv(
    'csv/test_red_w2v.csv',
    usecols=['w2v_net'],
)

In [10]:
# GloVe network output: read only the `glove_net` column for train and test.
glove_net = pd.read_csv(
    'csv/train_red_glove.csv',
    usecols=['glove_net'],
)
glove_net_test = pd.read_csv(
    'csv/test_red_glove.csv',
    usecols=['glove_net'],
)

In [11]:
# BERT network output: read only the `bert_net` column for train and test.
bert_net = pd.read_csv(
    'csv/train_red_bert.csv',
    usecols=['bert_net'],
)
bert_net_test = pd.read_csv(
    'csv/test_red_bert.csv',
    usecols=['bert_net'],
)

In [12]:
# ELMo network output: read only the `elmo_net` column for train and test.
elmo_net = pd.read_csv(
    'csv/train_red_elmo.csv',
    usecols=['elmo_net'],
)
elmo_net_test = pd.read_csv(
    'csv/test_red_elmo.csv',
    usecols=['elmo_net'],
)

In [13]:
# TF-IDF network output: read only the `tfidf_net` column for train and test.
tf_train = pd.read_csv(
    'csv/train_TFIDF.csv',
    usecols=['tfidf_net'],
)
tf_test = pd.read_csv(
    'csv/test_TFIDF.csv',
    usecols=['tfidf_net'],
)

In [14]:
# TF-IDF (unigrams) network output: read only `tfidf_net_uni` for train and test.
tf_uni_train = pd.read_csv(
    'csv/train_TFIDF_unigramas.csv',
    usecols=['tfidf_net_uni'],
)
tf_uni_test = pd.read_csv(
    'csv/test_TFIDF_unigramas.csv',
    usecols=['tfidf_net_uni'],
)

In [15]:
# TF-IDF (bigrams) network output: read only `tfidf_net_bi` for train and test.
tf_bi_train = pd.read_csv(
    'csv/train_TFIDF_bigramas.csv',
    usecols=['tfidf_net_bi'],
)
tf_bi_test = pd.read_csv(
    'csv/test_TFIDF_bigramas.csv',
    usecols=['tfidf_net_bi'],
)

In [16]:
# TF-IDF (trigrams) network output: read only `tfidf_net_tri` for train and test.
tf_tri_train = pd.read_csv(
    'csv/train_TFIDF_trigramas.csv',
    usecols=['tfidf_net_tri'],
)
# NOTE(review): the test file is spelled 'trigamas' while the train file uses
# 'trigramas'. The path is left untouched because it must match the file on
# disk -- confirm the actual file name before "fixing" it.
tf_tri_test = pd.read_csv(
    'csv/test_TFIDF_trigamas.csv',
    usecols=['tfidf_net_tri'],
)

In [17]:
# Scores stored in the `target` column of the two submission-style CSVs
# (attached to the frames later under the name `trans_bert`).
trans_bert_train = pd.read_csv(
    'eze_pulido/submission3000_train_float.csv',
    usecols=['target'],
)
trans_bert_test = pd.read_csv(
    'eze_pulido/submission3000float.csv',
    usecols=['target'],
)

In [18]:
# Attach every model's train-set score to `train`, one column per model.
# The mapping keeps the original insertion order, so the resulting column
# positions are the same as with one assignment per line.
train_score_frames = {
    'word2vec_tree': word2vec_score,
    'elmo_tree': elmo_score,
    'glove_tree': glove_score,
    'bert_tree': bert_score,
    'bert_net': bert_net,
    'elmo_net': elmo_net,
    'w2v_net': word2vec_net,
    'glove_net': glove_net,
    'tf_train': tf_train,
    'tf_uni': tf_uni_train,
    'tf_bi': tf_bi_train,
    'tf_tri': tf_tri_train,
    'trans_bert': trans_bert_train,
}
for score_column, score_frame in train_score_frames.items():
    train[score_column] = score_frame

In [19]:
# Remove the raw text/metadata columns in place; `id`, `target` and the score columns stay.
train.drop(['keyword', 'location', 'text'], axis='columns', inplace=True)

In [20]:
# Preview the training frame after the score columns were attached and the text columns dropped.
train.head()

Unnamed: 0,id,target,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_train,tf_uni,tf_bi,tf_tri,trans_bert
0,1,1,0.667608,0.715482,0.635028,0.640565,0.242175,0.45009,0.992572,0.65178,0.587438,0.580922,0.564173,0.607627,0.996106
1,4,1,0.855877,0.783273,0.877394,0.751214,0.237999,0.297658,0.999871,0.966338,0.967136,0.986876,0.982303,0.934687,0.993173
2,5,1,0.892648,0.814854,0.870452,0.796551,0.653434,0.809174,0.967203,0.971955,0.71239,0.788277,0.81296,0.70597,0.994093
3,6,1,0.946222,0.838757,0.860857,0.900976,0.901097,0.930959,0.999996,0.934217,0.976515,0.98241,0.983006,0.953801,0.998561
4,7,1,0.829571,0.830163,0.758754,0.694989,0.713383,0.484619,0.998377,0.60717,0.525771,0.531229,0.498404,0.558009,0.994882


In [21]:
# Attach every model's test-set score to `test`, one column per model.
# The column names must be identical to the ones used on `train`: the ensemble
# is selected by train column name and those same names are then looked up on
# `test` when predicting.
test['word2vec_tree'] = word2vec_score_test
test['elmo_tree'] = elmo_score_test
test['glove_tree'] = glove_score_test
test['bert_tree'] = bert_score_test
test['bert_net'] = bert_net_test
test['elmo_net'] = elmo_net_test
test['w2v_net'] = word2vec_net_test
test['glove_net'] = glove_net_test
# Previously named 'tf_test'. The train frame calls this feature 'tf_train', so a
# selected subset containing it would not have been found on `test` by name.
# Same name on both frames now; the column position is unchanged.
test['tf_train'] = tf_test
test['tf_uni'] = tf_uni_test
test['tf_bi'] = tf_bi_test
test['tf_tri'] = tf_tri_test
test['trans_bert'] = trans_bert_test

In [22]:
# Remove the raw text/metadata columns in place; `id` and the score columns stay.
test.drop(['keyword', 'location', 'text'], axis='columns', inplace=True)

In [23]:
# Preview the test frame after the score columns were attached and the text columns dropped.
test.head()

Unnamed: 0,id,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_test,tf_uni,tf_bi,tf_tri,trans_bert
0,0,0.699924,0.323115,0.729142,0.634452,0.925312,0.707569,0.921463,0.743638,0.591726,0.587112,0.731684,0.610922,0.970841
1,2,0.835608,0.397249,0.628241,0.562603,0.956846,0.889106,0.970476,0.864992,0.417002,0.552427,0.675684,0.454667,0.998173
2,3,0.658962,0.933304,0.4636,0.477252,0.631169,0.75123,0.97526,0.609823,0.793878,0.618961,0.806482,0.786749,0.985128
3,9,0.828614,0.695115,0.854859,0.82896,0.995054,0.952858,0.999825,0.909378,0.531445,0.48803,0.757723,0.559148,0.996822
4,11,0.853753,0.74111,1.039749,0.806881,0.971665,0.9584,0.999997,0.994435,0.974716,0.983036,0.997448,0.948272,0.994723


# Me quedo con resultados de red

In [24]:
# Select the score columns from `bert_net` through `trans_bert` by label
# (`.loc` slices include both endpoints). These are the same nine columns the
# positional slice 6:15 picked, but the selection no longer depends on how many
# columns happen to precede them.
aux3 = train.loc[:, 'bert_net':'trans_bert']
div = aux3.shape[1]  # number of selected score columns
aux3.head()

Unnamed: 0,bert_net,elmo_net,w2v_net,glove_net,tf_train,tf_uni,tf_bi,tf_tri,trans_bert
0,0.242175,0.45009,0.992572,0.65178,0.587438,0.580922,0.564173,0.607627,0.996106
1,0.237999,0.297658,0.999871,0.966338,0.967136,0.986876,0.982303,0.934687,0.993173
2,0.653434,0.809174,0.967203,0.971955,0.71239,0.788277,0.81296,0.70597,0.994093
3,0.901097,0.930959,0.999996,0.934217,0.976515,0.98241,0.983006,0.953801,0.998561
4,0.713383,0.484619,0.998377,0.60717,0.525771,0.531229,0.498404,0.558009,0.994882


In [25]:
# Candidate score columns for the subset search: `bert_net` through `trans_bert`,
# selected by label (inclusive on both ends) instead of the positional slice 6:15,
# so the selection does not silently shift if columns are added or reordered.
aux_train = train.loc[:, 'bert_net':'trans_bert']

### Realizo una búsqueda exhaustiva (Grid Search) sobre las combinaciones de modelos

In [26]:
import itertools

# Exhaustive search over every subset (size >= 2) of the score columns in
# `aux_train`. A subset is ensembled by averaging its columns row-wise and
# rounding to a 0/1 label; the subset with the highest accuracy against
# train['target'] is kept. The comparison is strict, so on ties the first
# subset found wins.
#
# Results:
#   best_solo  -- best accuracy found
#   best_items -- list of column names that produced it
#   train['prediction'] -- labels of the winning subset (previously this column
#       was left holding the labels of the last subset tried)
#
# NOTE(review): accuracy is measured on the training rows themselves; whether
# the loaded scores are out-of-fold is not visible here -- confirm before
# reading `best_solo` as a generalisation estimate.
model_columns = list(aux_train.columns)
best_solo = 0
best_items = []
best_prediction = None

for subset_size in range(2, len(model_columns) + 1):
    for combination in itertools.combinations(model_columns, subset_size):
        lista = list(combination)

        # Same arithmetic as before (row sum / number of models), computed
        # without adding a helper column to a filtered frame.
        div = len(lista)
        prediction = (aux_train[lista].sum(axis=1) / div).round().astype(int)

        # Score once and reuse the value for both the comparison and the record.
        score = accuracy_score(train['target'], prediction)
        if score > best_solo:
            best_solo = score
            best_items = lista
            best_prediction = prediction

if best_prediction is not None:
    train['prediction'] = best_prediction

In [27]:
# Report the best accuracy found and the column subset that produced it.
print('Mejor: ', best_solo, ' obtenido con: ', best_items, sep='')

Mejor: 0.9259161959805595 obtenido con: ['w2v_net', 'trans_bert']


### Predigo con los mejores modelos

In [28]:
# Score columns of the test frame, `bert_net` through `trans_bert`, selected by
# label (inclusive on both ends) instead of the positional slice 5:14, so the
# selection does not silently shift if columns are added or reordered.
aux = test.loc[:, 'bert_net':'trans_bert']

In [29]:
# Apply the subset chosen on train to the test scores.
# Plain column indexing replaces `filter(items=...)`: `filter` silently skips
# labels that are not present, which would average over fewer models than were
# selected (for example when a column is named differently on `test`).
# Indexing raises KeyError in that situation instead.
aux = aux[best_items]
div = aux.shape[1]  # number of models in the ensemble
# Mean score of the selected models, rounded to the 0/1 label.
test['prediction'] = (aux.sum(axis=1) / div).round().astype(int)

In [30]:
# Build the submission frame: keep `id` and the predicted label, and expose the
# label under the column name `target`.
aux = (
    test
    .filter(items=['id', 'prediction'])
    .rename(columns={'prediction': 'target'})
)

In [32]:
# Use `id` as the index so it is not duplicated next to a default integer index.
aux = aux.set_index('id')

In [33]:
# Write the submission file; the index is written by default, so `id` (set as the
# index in the previous cell) ends up as the first column.
aux.to_csv("submit_11_grid.csv")