# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostClassifier
%matplotlib inline

# Se lee el .csv

In [2]:
# Load the training set and print its schema summary (dtypes and non-null counts).
train = pd.read_csv(filepath_or_buffer='csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
# Preview the first five rows of the freshly loaded training data.
train.head(n=5)

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
# Load the test set from the same directory as the training set.
test = pd.read_csv(filepath_or_buffer='csv/test.csv')

# Lectura de predicciones previas

In [5]:
# Previously computed `word2vec_score` predictions for the train and test
# splits; only that single column is read from each file.
word2vec_score = pd.read_csv(
    'csv/solo_embedding_word2vec_train.csv',
    usecols=['word2vec_score'],
)
word2vec_score_test = pd.read_csv(
    'csv/solo_embedding_word2vec_test.csv',
    usecols=['word2vec_score'],
)

In [7]:
# Previously computed `elmo_score` predictions for the train and test splits;
# only that single column is read from each file.
elmo_score = pd.read_csv(
    'csv/solo_embedding_elmo_train.csv',
    usecols=['elmo_score'],
)
elmo_score_test = pd.read_csv(
    'csv/solo_embedding_elmo_test.csv',
    usecols=['elmo_score'],
)

In [8]:
# Previously computed `glove_score` predictions for the train and test splits;
# only that single column is read from each file.
glove_score = pd.read_csv(
    'csv/solo_embedding_glove_train.csv',
    usecols=['glove_score'],
)
glove_score_test = pd.read_csv(
    'csv/solo_embedding_glove_test.csv',
    usecols=['glove_score'],
)

In [9]:
# Previously computed `bert_score` predictions for the train and test splits;
# only that single column is read from each file.
bert_score = pd.read_csv(
    'csv/solo_embedding_bert_train.csv',
    usecols=['bert_score'],
)
bert_score_test = pd.read_csv(
    'csv/solo_embedding_bert_test.csv',
    usecols=['bert_score'],
)

In [10]:
# `w2v_net` prediction column for the train and test splits, read from the
# corresponding `*_red_w2v.csv` files.
word2vec_net = pd.read_csv(
    'csv/train_red_w2v.csv',
    usecols=['w2v_net'],
)
word2vec_net_test = pd.read_csv(
    'csv/test_red_w2v.csv',
    usecols=['w2v_net'],
)

In [11]:
# `glove_net` prediction column for the train and test splits, read from the
# corresponding `*_red_glove.csv` files.
glove_net = pd.read_csv(
    'csv/train_red_glove.csv',
    usecols=['glove_net'],
)
glove_net_test = pd.read_csv(
    'csv/test_red_glove.csv',
    usecols=['glove_net'],
)

In [12]:
# `bert_net` prediction column for the train and test splits, read from the
# corresponding `*_red_bert.csv` files.
bert_net = pd.read_csv(
    'csv/train_red_bert.csv',
    usecols=['bert_net'],
)
bert_net_test = pd.read_csv(
    'csv/test_red_bert.csv',
    usecols=['bert_net'],
)

In [13]:
# `elmo_net` prediction column for the train and test splits, read from the
# corresponding `*_red_elmo.csv` files.
elmo_net = pd.read_csv(
    'csv/train_red_elmo.csv',
    usecols=['elmo_net'],
)
elmo_net_test = pd.read_csv(
    'csv/test_red_elmo.csv',
    usecols=['elmo_net'],
)

# Preparación del set de datos

In [14]:
# Attach every previously loaded score frame to `train` under its feature
# name. The dict keeps insertion order, so the columns are appended in the
# order listed here.
train_score_frames = {
    'word2vec_tree': word2vec_score,
    'elmo_tree': elmo_score,
    'glove_tree': glove_score,
    'bert_tree': bert_score,
    'bert_net': bert_net,
    'elmo_net': elmo_net,
    'w2v_net': word2vec_net,
    'glove_net': glove_net,
}
for column_name, score_frame in train_score_frames.items():
    train[column_name] = score_frame

In [15]:
# Remove the raw text/metadata columns so only `id`, `target` and the score
# columns remain. Re-binding the result (instead of `inplace=True`) together
# with errors='ignore' keeps this cell safe to re-run once the columns are
# already gone.
train = train.drop(columns=['keyword', 'location', 'text'], errors='ignore')

In [16]:
# Preview the first five rows of `train` after attaching the score columns.
train.head(n=5)

Unnamed: 0,id,target,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net
0,1,1,0.667608,0.715482,0.635028,0.640565,0.242175,0.45009,0.992572,0.65178
1,4,1,0.855877,0.783273,0.877394,0.751214,0.237999,0.297658,0.999871,0.966338
2,5,1,0.892648,0.814854,0.870452,0.796551,0.653434,0.809174,0.967203,0.971955
3,6,1,0.946222,0.838757,0.860857,0.900976,0.901097,0.930959,0.999996,0.934217
4,7,1,0.829571,0.830163,0.758754,0.694989,0.713383,0.484619,0.998377,0.60717


In [17]:
# Attach every previously loaded test-split score frame to `test` under its
# feature name. The dict keeps insertion order, so the columns are appended
# in the order listed here.
test_score_frames = {
    'word2vec_tree': word2vec_score_test,
    'elmo_tree': elmo_score_test,
    'glove_tree': glove_score_test,
    'bert_tree': bert_score_test,
    'bert_net': bert_net_test,
    'elmo_net': elmo_net_test,
    'w2v_net': word2vec_net_test,
    'glove_net': glove_net_test,
}
for column_name, score_frame in test_score_frames.items():
    test[column_name] = score_frame

In [18]:
# Remove the raw text/metadata columns so only `id` and the score columns
# remain. Re-binding the result (instead of `inplace=True`) together with
# errors='ignore' keeps this cell safe to re-run once the columns are
# already gone.
test = test.drop(columns=['keyword', 'location', 'text'], errors='ignore')

In [19]:
# Preview the first five rows of `test` after attaching the score columns.
test.head(n=5)

Unnamed: 0,id,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net
0,0,0.699924,0.323115,0.729142,0.634452,0.925312,0.707569,0.921463,0.743638
1,2,0.835608,0.397249,0.628241,0.562603,0.956846,0.889106,0.970476,0.864992
2,3,0.658962,0.933304,0.4636,0.477252,0.631169,0.75123,0.97526,0.609823
3,9,0.828614,0.695115,0.854859,0.82896,0.995054,0.952858,0.999825,0.909378
4,11,0.853753,0.74111,1.039749,0.806881,0.971665,0.9584,0.999997,0.994435


# Promedio de las predicciones de redes y árboles

In [20]:
# Average the eight `*_tree` / `*_net` score columns of `train` and round the
# mean to a 0/1 prediction. Columns are selected by name (not by position) so
# the cell does not depend on the column order of `train`.
tree_cols = ['word2vec_tree', 'elmo_tree', 'glove_tree', 'bert_tree']
net_cols = ['bert_net', 'elmo_net', 'w2v_net', 'glove_net']

aux = train[tree_cols + net_cols]
div = aux.shape[1]
suma = aux.sum(axis=1)
predicciones = suma / div

# Tie-break: rows whose overall mean is exactly 0.5 fall back to the mean of
# the four tree scores. The `*_tree` columns of `train` hold the same values
# as the standalone score frames they were assigned from.
# NOTE(review): presumably this avoids an arbitrary rounding of an exact 0.5.
is_tie = predicciones == 0.5
tree_mean = (train['word2vec_tree'] + train['elmo_tree']
             + train['glove_tree'] + train['bert_tree']) / len(tree_cols)
predicciones = predicciones.mask(is_tie, tree_mean)

train['prediction'] = predicciones.round().astype(int)
train.head()

Unnamed: 0,id,target,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,prediction
0,1,1,0.667608,0.715482,0.635028,0.640565,0.242175,0.45009,0.992572,0.65178,1
1,4,1,0.855877,0.783273,0.877394,0.751214,0.237999,0.297658,0.999871,0.966338,1
2,5,1,0.892648,0.814854,0.870452,0.796551,0.653434,0.809174,0.967203,0.971955,1
3,6,1,0.946222,0.838757,0.860857,0.900976,0.901097,0.930959,0.999996,0.934217,1
4,7,1,0.829571,0.830163,0.758754,0.694989,0.713383,0.484619,0.998377,0.60717,1


In [21]:
# Report the classification metrics of `prediction` against `target` on the
# training set, one line per metric.
metric_table = (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('Recall Score : ', recall_score),
    ('F1 Score : ', f1_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label + str(metric_fn(train['target'], train['prediction'])))

Accuracy Score : 0.9361618284513332
Precision Score : 0.9777015437392796
Recall Score : 0.8712931825129929
F1 Score : 0.9214354995150339


In [22]:
# Average the eight `*_tree` / `*_net` score columns of `test` and round the
# mean to a 0/1 prediction. Columns are selected by name (not by position) so
# the cell does not depend on the column order of `test`.
tree_cols = ['word2vec_tree', 'elmo_tree', 'glove_tree', 'bert_tree']
net_cols = ['bert_net', 'elmo_net', 'w2v_net', 'glove_net']

aux2 = test[tree_cols + net_cols]
div = aux2.shape[1]
suma = aux2.sum(axis=1)
predicciones_test = suma / div

# Tie-break: rows whose overall mean is exactly 0.5 fall back to the mean of
# the four tree scores. The `*_tree` columns of `test` hold the same values
# as the standalone test score frames they were assigned from.
# NOTE(review): presumably this avoids an arbitrary rounding of an exact 0.5.
is_tie = predicciones_test == 0.5
tree_mean = (test['word2vec_tree'] + test['elmo_tree']
             + test['glove_tree'] + test['bert_tree']) / len(tree_cols)
predicciones_test = predicciones_test.mask(is_tie, tree_mean)

test['prediction'] = predicciones_test.round().astype(int)
test.head()

Unnamed: 0,id,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,prediction
0,0,0.699924,0.323115,0.729142,0.634452,0.925312,0.707569,0.921463,0.743638,1
1,2,0.835608,0.397249,0.628241,0.562603,0.956846,0.889106,0.970476,0.864992,1
2,3,0.658962,0.933304,0.4636,0.477252,0.631169,0.75123,0.97526,0.609823,1
3,9,0.828614,0.695115,0.854859,0.82896,0.995054,0.952858,0.999825,0.909378,1
4,11,0.853753,0.74111,1.039749,0.806881,0.971665,0.9584,0.999997,0.994435,1


In [23]:
# Persist the test-set predictions as a single-column CSV; the frame's index
# is written alongside it (pandas default).
test[['prediction']].to_csv('csv/avg_embeddings_trees_nets.csv')

# Promedio de las predicciones de redes únicamente

In [24]:
# Average only the four `*_net` score columns of `train` and round the mean
# to a 0/1 prediction. Columns are addressed by name so the cell does not
# depend on where `prediction` (or any score column) sits in `train`.
tree_cols = ['word2vec_tree', 'elmo_tree', 'glove_tree', 'bert_tree']
net_cols = ['bert_net', 'elmo_net', 'w2v_net', 'glove_net']

# Work on an explicit copy: adding `suma` to a slice of `train` would trigger
# pandas' SettingWithCopyWarning.
aux3 = train[net_cols].copy()
aux3['suma'] = aux3.sum(axis=1)
net_mean = aux3['suma'] / len(net_cols)

# Tie-break: rows whose net mean is exactly 0.5 fall back to the mean of the
# four tree scores, computed once for all rows instead of per row in a loop.
is_tie = net_mean == 0.5
tree_mean = (train['word2vec_tree'] + train['elmo_tree']
             + train['glove_tree'] + train['bert_tree']) / len(tree_cols)

train['prediction'] = net_mean.mask(is_tie, tree_mean).round().astype(int)
train.head()

Unnamed: 0,id,target,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,prediction
0,1,1,0.667608,0.715482,0.635028,0.640565,0.242175,0.45009,0.992572,0.65178,1
1,4,1,0.855877,0.783273,0.877394,0.751214,0.237999,0.297658,0.999871,0.966338,1
2,5,1,0.892648,0.814854,0.870452,0.796551,0.653434,0.809174,0.967203,0.971955,1
3,6,1,0.946222,0.838757,0.860857,0.900976,0.901097,0.930959,0.999996,0.934217,1
4,7,1,0.829571,0.830163,0.758754,0.694989,0.713383,0.484619,0.998377,0.60717,1


In [25]:
# Report the classification metrics of the current `prediction` column
# against `target` on the training set, one line per metric.
metric_table = (
    ('Accuracy Score : ', accuracy_score),
    ('Precision Score : ', precision_score),
    ('Recall Score : ', recall_score),
    ('F1 Score : ', f1_score),
)
for metric_label, metric_fn in metric_table:
    print(metric_label + str(metric_fn(train['target'], train['prediction'])))

Accuracy Score : 0.8947852357809011
Precision Score : 0.9417024320457796
Recall Score : 0.8049526138795475
F1 Score : 0.8679742871270809


In [26]:
# Average only the four `*_net` score columns of `test` and round the mean to
# a 0/1 prediction. Columns are addressed by name so the cell does not depend
# on where `prediction` (or any score column) sits in `test`.
tree_cols = ['word2vec_tree', 'elmo_tree', 'glove_tree', 'bert_tree']
net_cols = ['bert_net', 'elmo_net', 'w2v_net', 'glove_net']

# Work on an explicit copy: adding `suma` to a slice of `test` would trigger
# pandas' SettingWithCopyWarning.
aux4 = test[net_cols].copy()
aux4['suma'] = aux4.sum(axis=1)
net_mean = aux4['suma'] / len(net_cols)

# Tie-break: rows whose net mean is exactly 0.5 fall back to the mean of the
# four tree scores, computed once for all rows instead of per row in a loop.
is_tie = net_mean == 0.5
tree_mean = (test['word2vec_tree'] + test['elmo_tree']
             + test['glove_tree'] + test['bert_tree']) / len(tree_cols)

test['prediction'] = net_mean.mask(is_tie, tree_mean).round().astype(int)
test.head()

Unnamed: 0,id,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,prediction
0,0,0.699924,0.323115,0.729142,0.634452,0.925312,0.707569,0.921463,0.743638,1
1,2,0.835608,0.397249,0.628241,0.562603,0.956846,0.889106,0.970476,0.864992,1
2,3,0.658962,0.933304,0.4636,0.477252,0.631169,0.75123,0.97526,0.609823,1
3,9,0.828614,0.695115,0.854859,0.82896,0.995054,0.952858,0.999825,0.909378,1
4,11,0.853753,0.74111,1.039749,0.806881,0.971665,0.9584,0.999997,0.994435,1


In [27]:
# Persist the net-only test predictions as a single-column CSV; the frame's
# index is written alongside it (pandas default).
# NOTE(review): unlike the other exports this path has no 'csv/' prefix, so
# the file lands in the working directory - confirm that is intended.
test[['prediction']].to_csv('avg_embeddings_nets.csv')

In [28]:
# Start the output frame from the test predictions alone.
final = test.loc[:, 'prediction'].to_frame()

In [29]:
# Copy `id` over from `test` and promote it to the index of `final`.
final = final.assign(id=test['id']).set_index('id')

In [30]:
# Expose the prediction column under the name `target`.
final = final.rename(columns={'prediction': 'target'})

In [31]:
# Preview the first five rows of the id-indexed output frame.
final.head(n=5)

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1


In [None]:
# Export of `final` is currently disabled; uncomment the line below to write it to disk.
# final.to_csv('csv/mayor_voting_4_embedds_red.csv')