# Se importan las bibliotecas necesarias

In [None]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from sklearn.linear_model import LogisticRegression
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

# Se lee el .csv

In [None]:
test = pd.read_csv('csv/test.csv')
test.info()

In [None]:
test.head()

# Se cargan los embeddings pre-entrenados

In [4]:
#descargar de aca https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz, pesa 1.5gb
EMBEDDING_FILE = '../GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [5]:
#Palabras con mayor y menor porcentaje de veracidad
tweets = train['text'].str.split()
clean_tweets = []

for tweet in tweets:
    clean_tweet = []
    for word in tweet:
        clean_word = re.sub('[^a-zA-Z]','', word)
        if(clean_word != '')&(clean_word not in STOPWORDS):
            clean_tweet.append(clean_word.lower())
    clean_tweets.append(clean_tweet)

In [6]:
train['clean_text'] = clean_tweets
train.set_index('id', inplace=True)
train.drop(columns=['keyword','location','text'], inplace=True)
train.head()

Unnamed: 0_level_0,target,clean_text
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,"[our, deeds, reason, earthquake, may, allah, f..."
4,1,"[forest, near, la, ronge, sask, canada]"
5,1,"[all, residents, asked, shelter, place, notifi..."
6,1,"[people, receive, wildfires, evacuation, order..."
7,1,"[just, got, sent, photo, ruby, alaska, smoke, ..."


In [30]:
embeddings = np.zeros((len(train.index),300),dtype='float32') # 300 es la longitud del embedding
normalized_embeddings = np.zeros((len(train.index),300),dtype='float32')

contador = 0
for tweet in clean_tweets:
    cant_palabras = 0
    embedded_tweet = np.zeros((300,), dtype='float32')
    
    for word in tweet:
        
        if word in word2vec.vocab:
            embedded_tweet = np.add(word2vec[word],embedded_tweet)
            cant_palabras += 1     
        
    embeddings[contador] = embedded_tweet
    
    if cant_palabras!=0:
        normalized_embeddings[contador] = np.divide(embedded_tweet,cant_palabras)
    else:
        normalized_embeddings[contador] = embedded_tweet
        
    contador += 1    

In [34]:
train['embeddings'] = embeddings.tolist()
train['normalized_embeddings'] = normalized_embeddings.tolist()

In [35]:
train.head()

Unnamed: 0_level_0,target,clean_text,embeddings,normalized_embeddings
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,"[our, deeds, reason, earthquake, may, allah, f...","[0.748291015625, 0.62420654296875, 1.207519531...","[0.1068987175822258, 0.08917236328125, 0.17250..."
4,1,"[forest, near, la, ronge, sask, canada]","[0.07080078125, -0.23956298828125, -0.13565063...","[0.0177001953125, -0.0598907470703125, -0.0339..."
5,1,"[all, residents, asked, shelter, place, notifi...","[0.127685546875, 0.06494140625, 1.075439453125...","[0.009821965359151363, 0.0049954927526414394, ..."
6,1,"[people, receive, wildfires, evacuation, order...","[1.033935546875, -0.5733642578125, 0.087036132...","[0.1723225861787796, -0.0955607071518898, 0.01..."
7,1,"[just, got, sent, photo, ruby, alaska, smoke, ...","[0.7109375, 0.258026123046875, -0.57470703125,...","[0.07109375298023224, 0.0258026123046875, -0.0..."


### Entrenamiento del set de datos considerando sólo 'embeddings'

In [57]:
X, y = normalized_embeddings, train.iloc[:,0].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=482)

In [62]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 20, alpha = 10, n_estimators = 80)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.8222416812609457
Precision Score : 0.837708830548926
Recall Score : 0.7222222222222222
F1 Score : 0.7756906077348067
