# Same modelo 10 pero con TF-IDF

In [60]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostClassifier
%matplotlib inline

# Se lee el .csv

In [61]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [62]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [63]:
test = pd.read_csv('csv/test.csv')

In [64]:
word2vec_score = pd.read_csv('csv/solo_embedding_word2vec_train.csv', usecols=['word2vec_score'])
word2vec_score_test = pd.read_csv('csv/solo_embedding_word2vec_test.csv', usecols=['word2vec_score'])

In [65]:
elmo_score = pd.read_csv('csv/solo_embedding_elmo_train.csv', usecols=['elmo_score'])
elmo_score_test = pd.read_csv('csv/solo_embedding_elmo_test.csv', usecols=['elmo_score'])

In [66]:
glove_score = pd.read_csv('csv/solo_embedding_glove_train.csv', usecols=['glove_score'])
glove_score_test = pd.read_csv('csv/solo_embedding_glove_test.csv', usecols=['glove_score'])

In [67]:
bert_score = pd.read_csv('csv/solo_embedding_bert_train.csv', usecols=['bert_score'])
bert_score_test = pd.read_csv('csv/solo_embedding_bert_test.csv', usecols=['bert_score'])

In [68]:
word2vec_net = pd.read_csv('csv/train_red_w2v.csv', usecols=['w2v_net'])
word2vec_net_test = pd.read_csv('csv/test_red_w2v.csv', usecols=['w2v_net'])

In [69]:
glove_net = pd.read_csv('csv/train_red_glove.csv', usecols=['glove_net'])
glove_net_test = pd.read_csv('csv/test_red_glove.csv', usecols=['glove_net'])

In [70]:
bert_net = pd.read_csv('csv/train_red_bert.csv', usecols=['bert_net'])
bert_net_test = pd.read_csv('csv/test_red_bert.csv', usecols=['bert_net'])

In [71]:
elmo_net = pd.read_csv('csv/train_red_elmo.csv', usecols=['elmo_net'])
elmo_net_test = pd.read_csv('csv/test_red_elmo.csv', usecols=['elmo_net'])

In [72]:
tf_train = pd.read_csv('csv/train_TFIDF.csv', usecols=['tfidf_net'])
tf_test = pd.read_csv('csv/test_TFIDF.csv', usecols=['tfidf_net'])

In [73]:
train['word2vec_tree'] = word2vec_score
train['elmo_tree'] = elmo_score
train['glove_tree'] = glove_score
train['bert_tree'] = bert_score
train['bert_net'] = bert_net
train['elmo_net'] = elmo_net
train['w2v_net'] = word2vec_net
train['glove_net'] = glove_net
train['tf_train'] = tf_train

In [74]:
train.drop(columns=['keyword','location','text'],inplace=True)

In [75]:
train.head()

Unnamed: 0,id,target,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_train
0,1,1,0.556296,0.522046,0.587237,0.56796,0.474456,0.633377,0.321005,0.467304,0.587438
1,4,1,0.674412,0.53897,0.676435,0.548946,0.463941,0.341595,0.792568,0.831315,0.967136
2,5,1,0.725413,0.583167,0.668918,0.589782,0.798098,0.870856,0.818725,0.987908,0.71239
3,6,1,0.758587,0.65054,0.656915,0.602543,0.889468,0.915799,0.977664,0.950814,0.976515
4,7,1,0.59832,0.570466,0.573899,0.531318,0.739818,0.82599,0.880264,0.652306,0.525771


In [76]:
test['word2vec_tree'] = word2vec_score_test
test['elmo_tree'] = elmo_score_test
test['glove_tree'] = glove_score_test
test['bert_tree'] = bert_score_test
test['bert_net'] = bert_net_test
test['elmo_net'] = elmo_net_test
test['w2v_net'] = word2vec_net_test
test['glove_net'] = glove_net_test
test['tf_test'] = tf_test

In [77]:
test.drop(columns=['keyword','location','text'],inplace=True)

In [78]:
test.head()

Unnamed: 0,id,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_test
0,0,0.534202,0.379233,0.574705,0.482696,0.741691,0.772848,0.885978,0.808325,0.591726
1,2,0.583796,0.367322,0.506844,0.520214,0.969635,0.859516,0.896222,0.662682,0.417002
2,3,0.527839,0.6787,0.411923,0.455055,0.591778,0.821385,0.782109,0.629621,0.793878
3,9,0.548836,0.430616,0.630913,0.6421,0.99296,0.926375,0.939039,0.852077,0.531445
4,11,0.726522,0.584599,0.705246,0.644486,0.939691,0.94136,0.992535,0.963519,0.974716


# Me quedo con los resultados de red y tree

In [79]:
aux = train.iloc[:,2:11]
aux['suma'] = aux.sum(axis=1)
train['prediction'] = (aux['suma']/8)
for i in range(len(train.index)):
    if train.iloc[i,10]==0.5:
        train.iloc[i,10] = (word2vec_score.iloc[i,0]+elmo_score.iloc[i,0]+
                                glove_score.iloc[i,0]+bert_score).iloc[i,0]/4
            
train['prediction'] = train['prediction'].round().astype(int)            
train.head()

Unnamed: 0,id,target,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_train,prediction
0,1,1,0.556296,0.522046,0.587237,0.56796,0.474456,0.633377,0.321005,0.467304,0.587438,1
1,4,1,0.674412,0.53897,0.676435,0.548946,0.463941,0.341595,0.792568,0.831315,0.967136,1
2,5,1,0.725413,0.583167,0.668918,0.589782,0.798098,0.870856,0.818725,0.987908,0.71239,1
3,6,1,0.758587,0.65054,0.656915,0.602543,0.889468,0.915799,0.977664,0.950814,0.976515,1
4,7,1,0.59832,0.570466,0.573899,0.531318,0.739818,0.82599,0.880264,0.652306,0.525771,1


In [80]:
print('Accuracy Score : ' + str(accuracy_score(train['target'],train['prediction'])))
print('Precision Score : ' + str(precision_score(train['target'],train['prediction'])))
print('Recall Score : ' + str(recall_score(train['target'],train['prediction'])))
print('F1 Score : ' + str(f1_score(train['target'],train['prediction'])))

Accuracy Score : 0.8861158544594772
Precision Score : 0.8905133203378818
Recall Score : 0.8379700397431978
F1 Score : 0.8634430618995117


In [81]:
aux2 = test.iloc[:,1:9]
aux2['suma'] = aux2.sum(axis=1)
test['prediction'] = (aux2['suma']/8)
for i in range(len(test.index)):
    if test.iloc[i,9]==0.5:
        test.iloc[i,9] = (word2vec_score_test.iloc[i,0]+elmo_score_test.iloc[i,0]+
                                glove_score_test.iloc[i,0]+bert_score_test).iloc[i,0]/4

test['prediction'] = test['prediction'].round().astype(int)            
test.head()

Unnamed: 0,id,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_test,prediction
0,0,0.534202,0.379233,0.574705,0.482696,0.741691,0.772848,0.885978,0.808325,0.591726,1
1,2,0.583796,0.367322,0.506844,0.520214,0.969635,0.859516,0.896222,0.662682,0.417002,1
2,3,0.527839,0.6787,0.411923,0.455055,0.591778,0.821385,0.782109,0.629621,0.793878,1
3,9,0.548836,0.430616,0.630913,0.6421,0.99296,0.926375,0.939039,0.852077,0.531445,1
4,11,0.726522,0.584599,0.705246,0.644486,0.939691,0.94136,0.992535,0.963519,0.974716,1


In [49]:
test['prediction'].to_frame().to_csv('boliviano3.csv')

# Me quedo con resultados de red

In [51]:
aux3 = train.iloc[:,6:11]
aux3['suma'] = aux3.sum(axis=1)
train['prediction'] = (aux3['suma']/4)
for i in range(len(train.index)):
    if train.iloc[i,10]==0.5:
        train.iloc[i,10] = (word2vec_score.iloc[i,0]+elmo_score.iloc[i,0]+
                                glove_score.iloc[i,0]+bert_score).iloc[i,0]/4
            
train['prediction'] = train['prediction'].round().astype(int)            
train.head()

Unnamed: 0,id,target,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_train,prediction
0,1,1,0.556296,0.522046,0.587237,0.56796,0.474456,0.633377,0.321005,0.467304,0.587438,1
1,4,1,0.674412,0.53897,0.676435,0.548946,0.463941,0.341595,0.792568,0.831315,0.967136,1
2,5,1,0.725413,0.583167,0.668918,0.589782,0.798098,0.870856,0.818725,0.987908,0.71239,1
3,6,1,0.758587,0.65054,0.656915,0.602543,0.889468,0.915799,0.977664,0.950814,0.976515,1
4,7,1,0.59832,0.570466,0.573899,0.531318,0.739818,0.82599,0.880264,0.652306,0.525771,1


In [52]:
print('Accuracy Score : ' + str(accuracy_score(train['target'],train['prediction'])))
print('Precision Score : ' + str(precision_score(train['target'],train['prediction'])))
print('Recall Score : ' + str(recall_score(train['target'],train['prediction'])))
print('F1 Score : ' + str(f1_score(train['target'],train['prediction'])))

Accuracy Score : 0.8611585445947721
Precision Score : 0.8340374170187085
Recall Score : 0.8450015285845307
F1 Score : 0.8394836750189826


In [53]:
aux4 = test.iloc[:,5:10]
aux4['suma'] = aux4.sum(axis=1)
test['prediction'] = (aux4['suma']/4)
for i in range(len(test.index)):
    if test.iloc[i,9]==0.5:
        test.iloc[i,9] = (word2vec_score_test.iloc[i,0]+elmo_score_test.iloc[i,0]+
                                glove_score_test.iloc[i,0]+bert_score_test).iloc[i,0]/4
test['prediction'] = test['prediction'].round().astype(int)            
test.head()

Unnamed: 0,id,word2vec_tree,elmo_tree,glove_tree,bert_tree,bert_net,elmo_net,w2v_net,glove_net,tf_test,prediction
0,0,0.534202,0.379233,0.574705,0.482696,0.741691,0.772848,0.885978,0.808325,0.591726,1
1,2,0.583796,0.367322,0.506844,0.520214,0.969635,0.859516,0.896222,0.662682,0.417002,1
2,3,0.527839,0.6787,0.411923,0.455055,0.591778,0.821385,0.782109,0.629621,0.793878,1
3,9,0.548836,0.430616,0.630913,0.6421,0.99296,0.926375,0.939039,0.852077,0.531445,1
4,11,0.726522,0.584599,0.705246,0.644486,0.939691,0.94136,0.992535,0.963519,0.974716,1


In [54]:
test['prediction'].to_frame().to_csv('boliviano4.csv')

In [55]:
final = test['prediction'].to_frame()

In [56]:
final['id'] = test['id']
final.set_index('id',inplace=True)

In [57]:
final.rename(columns={'prediction':'target'},inplace=True)

In [58]:
final.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,1
3,1
9,1
11,1


In [59]:
final.to_csv('csv/mayor_voting_con_tfidf_modelo11.csv')