# Same modelo 10 pero con TF-IDF

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostClassifier
%matplotlib inline

# Se lee el .csv

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test = pd.read_csv('csv/test.csv')

In [5]:
word2vec_score = pd.read_csv('csv/solo_embedding_word2vec_train.csv', usecols=['word2vec_score'])
word2vec_score_test = pd.read_csv('csv/solo_embedding_word2vec_test.csv', usecols=['word2vec_score'])

In [6]:
elmo_score = pd.read_csv('csv/solo_embedding_elmo_train.csv', usecols=['elmo_score'])
elmo_score_test = pd.read_csv('csv/solo_embedding_elmo_test.csv', usecols=['elmo_score'])

In [7]:
glove_score = pd.read_csv('csv/solo_embedding_glove_train.csv', usecols=['glove_score'])
glove_score_test = pd.read_csv('csv/solo_embedding_glove_test.csv', usecols=['glove_score'])

In [8]:
bert_score = pd.read_csv('csv/solo_embedding_bert_train.csv', usecols=['bert_score'])
bert_score_test = pd.read_csv('csv/solo_embedding_bert_test.csv', usecols=['bert_score'])

In [9]:
word2vec_net = pd.read_csv('csv/train_red_w2v.csv', usecols=['w2v_net'])
word2vec_net_test = pd.read_csv('csv/test_red_w2v.csv', usecols=['w2v_net'])

In [10]:
glove_net = pd.read_csv('csv/train_red_glove.csv', usecols=['glove_net'])
glove_net_test = pd.read_csv('csv/test_red_glove.csv', usecols=['glove_net'])

In [11]:
bert_net = pd.read_csv('csv/train_red_bert.csv', usecols=['bert_net'])
bert_net_test = pd.read_csv('csv/test_red_bert.csv', usecols=['bert_net'])

In [12]:
elmo_net = pd.read_csv('csv/train_red_elmo.csv', usecols=['elmo_net'])
elmo_net_test = pd.read_csv('csv/test_red_elmo.csv', usecols=['elmo_net'])

In [13]:
tf_train = pd.read_csv('csv/train_TFIDF.csv', usecols=['tfidf_net'])
tf_test = pd.read_csv('csv/test_TFIDF.csv', usecols=['tfidf_net'])

In [14]:
tf_uni_train = pd.read_csv('csv/train_TFIDF_unigramas.csv', usecols=['tfidf_net_uni'])
tf_uni_test = pd.read_csv('csv/test_TFIDF_unigramas.csv', usecols=['tfidf_net_uni'])

In [15]:
tf_bi_train = pd.read_csv('csv/train_TFIDF_bigramas.csv', usecols=['tfidf_net_bi'])
tf_bi_test = pd.read_csv('csv/test_TFIDF_bigramas.csv', usecols=['tfidf_net_bi'])

In [16]:
tf_tri_train = pd.read_csv('csv/train_TFIDF_trigramas.csv', usecols=['tfidf_net_tri'])
tf_tri_test = pd.read_csv('csv/test_TFIDF_trigamas.csv', usecols=['tfidf_net_tri'])

In [17]:
best_train = pd.read_csv('eze_train.csv', usecols=['net'])
best_test = pd.read_csv('eze_test.csv', usecols=['net'])

In [18]:
train['word2vec_tree'] = word2vec_score
train['elmo_tree'] = elmo_score
train['glove_tree'] = glove_score
train['bert_tree'] = bert_score
train['bert_net'] = bert_net
train['elmo_net'] = elmo_net
train['w2v_net'] = word2vec_net
train['glove_net'] = glove_net
train['tf_train'] = tf_train
train['tf_uni'] = tf_uni_train
train['tf_bi'] = tf_bi_train
train['tf_tri'] = tf_tri_train
train['eze'] = best_train

In [19]:
train.drop(columns=['keyword','location','text'],inplace=True)

In [20]:
train.head()

Unnamed: 0,id,target,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_train,tf_uni,tf_bi,tf_tri,eze
0,1,1,0.556296,0.548083,0.591382,0.514254,0.474456,0.633377,0.321005,0.467304,0.587438,0.580922,0.564173,0.607627,0.996106
1,4,1,0.674412,0.601624,0.652751,0.540123,0.463941,0.341595,0.792568,0.831315,0.967136,0.986876,0.982303,0.934687,0.993173
2,5,1,0.725413,0.643861,0.656788,0.62772,0.798098,0.870856,0.818725,0.987908,0.71239,0.788277,0.81296,0.70597,0.994093
3,6,1,0.758587,0.529898,0.599172,0.576681,0.889468,0.915799,0.977664,0.950814,0.976515,0.98241,0.983006,0.953801,0.998561
4,7,1,0.59832,0.559565,0.565799,0.483547,0.739818,0.82599,0.880264,0.652306,0.525771,0.531229,0.498404,0.558009,0.994882


In [21]:
test['word2vec_tree'] = word2vec_score_test
test['elmo_tree'] = elmo_score_test
test['glove_tree'] = glove_score_test
test['bert_tree'] = bert_score_test
test['bert_net'] = bert_net_test
test['elmo_net'] = elmo_net_test
test['w2v_net'] = word2vec_net_test
test['glove_net'] = glove_net_test
test['tf_test'] = tf_test
test['tf_uni'] = tf_uni_test
test['tf_bi'] = tf_bi_test
test['tf_tri'] = tf_tri_test
test['eze'] = best_test

In [22]:
test.drop(columns=['keyword','location','text'],inplace=True)

In [23]:
test.head()

Unnamed: 0,id,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_test,tf_uni,tf_bi,tf_tri,eze
0,0,0.534202,0.269342,0.433447,0.520021,0.741691,0.772848,0.885978,0.808325,0.591726,0.587112,0.731684,0.610922,0.970841
1,2,0.583796,0.294223,0.5717,0.405625,0.969635,0.859516,0.896222,0.662682,0.417002,0.552427,0.675684,0.454667,0.998173
2,3,0.527839,0.489406,0.370747,0.46912,0.591778,0.821385,0.782109,0.629621,0.793878,0.618961,0.806482,0.786749,0.985128
3,9,0.548836,0.326129,0.612576,0.489437,0.99296,0.926375,0.939039,0.852077,0.531445,0.48803,0.757723,0.559148,0.996822
4,11,0.726522,0.317996,0.716023,0.55314,0.939691,0.94136,0.992535,0.963519,0.974716,0.983036,0.997448,0.948272,0.994723


# Me quedo con resultados de red

In [24]:
aux3 = train.iloc[:,6:15]
div = aux3.shape[1]
aux3.head()

Unnamed: 0,bert_net,elmo_net,w2v_net,glove_net,tf_train,tf_uni,tf_bi,tf_tri,eze
0,0.474456,0.633377,0.321005,0.467304,0.587438,0.580922,0.564173,0.607627,0.996106
1,0.463941,0.341595,0.792568,0.831315,0.967136,0.986876,0.982303,0.934687,0.993173
2,0.798098,0.870856,0.818725,0.987908,0.71239,0.788277,0.81296,0.70597,0.994093
3,0.889468,0.915799,0.977664,0.950814,0.976515,0.98241,0.983006,0.953801,0.998561
4,0.739818,0.82599,0.880264,0.652306,0.525771,0.531229,0.498404,0.558009,0.994882


In [25]:
aux_train = train.iloc[:,6:15]

In [26]:
aux3['suma'] = aux3.sum(axis=1)
train['prediction'] = (aux3['suma']/div)
for i in range(len(train.index)):
    if train.iloc[i,10]==0.5:
        train.iloc[i,10] = (word2vec_score.iloc[i,0]+elmo_score.iloc[i,0]+
                                glove_score.iloc[i,0]+bert_score).iloc[i,0]/4
            
train['prediction'] = train['prediction'].round().astype(int)            
print('Accuracy Score : ' + str(accuracy_score(train['target'],train['prediction'])))
print('Precision Score : ' + str(precision_score(train['target'],train['prediction'])))
print('Recall Score : ' + str(recall_score(train['target'],train['prediction'])))
print('F1 Score : ' + str(f1_score(train['target'],train['prediction'])))

Accuracy Score : 0.8807303296991987
Precision Score : 0.9233249731279112
Recall Score : 0.7878324671354325
F1 Score : 0.8502144506763444


In [27]:
a = aux_train.columns
import itertools 
best_solo = 0
best_items = []
for j in range(2, len(a)+1):
    m = list(itertools.combinations(a, j))
    len(m)
    contador = 0 
    for i in range (0, len(m)):
        lista = list(m[i])
        
        grid = aux_train.filter(items= lista)
        div = grid.shape[1]
        grid['suma'] = grid.sum(axis=1)
        train['prediction'] = (grid['suma']/div)
        train['prediction'] = train['prediction'].round().astype(int)  
        
        if (accuracy_score(train['target'],train['prediction'])> best_solo): 
            best_solo = accuracy_score(train['prediction'],train['target'])
            best_items = lista 

In [28]:
print('Mejor: ' + str(best_solo) +  ' obtenido con: ' + str(lista))

Mejor: 0.9066071194010246 obtenido con: ['bert_net', 'elmo_net', 'w2v_net', 'glove_net', 'tf_train', 'tf_uni', 'tf_bi', 'tf_tri', 'eze']


In [29]:
aux = test.iloc[:,5:14]
aux.head()

Unnamed: 0,bert_net,elmo_net,w2v_net,glove_net,tf_test,tf_uni,tf_bi,tf_tri,eze
0,0.741691,0.772848,0.885978,0.808325,0.591726,0.587112,0.731684,0.610922,0.970841
1,0.969635,0.859516,0.896222,0.662682,0.417002,0.552427,0.675684,0.454667,0.998173
2,0.591778,0.821385,0.782109,0.629621,0.793878,0.618961,0.806482,0.786749,0.985128
3,0.99296,0.926375,0.939039,0.852077,0.531445,0.48803,0.757723,0.559148,0.996822
4,0.939691,0.94136,0.992535,0.963519,0.974716,0.983036,0.997448,0.948272,0.994723


In [30]:
aux = aux.filter(items= lista)
div = aux.shape[1]
aux['suma'] = aux.sum(axis=1)
test['prediction'] = (aux['suma']/div)
test['prediction'] = test['prediction'].round().astype(int) 

In [31]:
aux = test.filter(items= ['id', 'prediction'])
aux.rename(columns = {'prediction': 'target'}, inplace=True)

In [34]:
aux.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1


In [35]:
aux.reset_index(inplace = True)

In [37]:
aux.set_index("id",inplace = True)

In [39]:
aux.to_csv("submitionHugoCorrioNico.csv", index=False)