In [2]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

### ACA VOY A USAR WORD2VEC PARA EL TEXTO Y PARA LAS LOCATIONS. CON AMBOS PARAMETROS VOY A HACER UN ARBOL CON XGBOOST

In [4]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [6]:
test = pd.read_csv('csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [7]:

test['location'] = test['location'].fillna("0")
train['location'] = train['location'].fillna("0")

## Cargo los embedings pre_entrenados en memoria

In [6]:
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

In [7]:
#Devuelve una lista de tweets dejando solo caracteres alfanumericos
def clean_text(df, columna):
    words = df[columna].str.split()
    clean_words = []

    for sentence in words:
        clean_sentence = []
        for word in sentence:
            clean_word = re.sub('[^a-zA-Z]','', word)
            if(clean_word != '')&(clean_word not in STOPWORDS):
                clean_sentence.append(clean_word.lower())
        clean_words.append(clean_sentence)
    
    return clean_words

In [8]:
def tweets_embeddings(df,columna):

    long_embedding = 300
    embeddings = np.zeros((len(df.index),long_embedding), dtype='float32') 
    normalized_embeddings = np.zeros((len(df.index),long_embedding),dtype='float32')

    contador = 0
    for tweet in df[columna]:
        cant_palabras = 0
        embedded_tweet = np.zeros((long_embedding,), dtype='float32')

        for word in tweet:
            if word in word2vec.vocab:
                embedded_tweet = np.add(word2vec[word],embedded_tweet)
                cant_palabras += 1     

        embeddings[contador] = embedded_tweet

        if cant_palabras!=0:
            normalized_embeddings[contador] = np.divide(embedded_tweet,cant_palabras)
        else:
            normalized_embeddings[contador] = embedded_tweet

        contador += 1    
        
    return embeddings, normalized_embeddings

In [9]:
# Devuelve un dataframe con todos los features considerados para el modelo
def generate_embeddings(df,columna,nueva):
    
    df[nueva] = clean_text(df,columna)
    df.drop(columns=['keyword','location','text'], inplace=True)
    df.set_index('id', inplace=True)
    
    return tweets_embeddings(df,nueva)

In [10]:
embeddings_train_text, normalized_embeddings_train_text = generate_embeddings(train.copy(), 'text', 'clean_text')

In [11]:
embeddings_train_loc, normalized_embeddings_train_loc = generate_embeddings(train.copy(),'location', 'clean_loc')

In [17]:
def generar_txt(path,serie):
    file = open(path, "w")
    for i in serie:
        np.savetxt(file, i)

    file.close()

In [18]:
generar_txt("mios/w2v_train_text.txt",embeddings_train_text)
generar_txt("mios/w2v_train_text(norm).txt",normalized_embeddings_train_text)

In [19]:
generar_txt("mios/w2v_train_loc.txt",embeddings_train_loc)
generar_txt("mios/w2v_train_loc(norm).txt",normalized_embeddings_train_loc)

In [21]:
embeddings_test_text, normalized_embeddings_test_text = generate_embeddings(test.copy(), 'text', 'clean_text')
embeddings_test_loc, normalized_embeddings_test_loc = generate_embeddings(test.copy(),'location', 'clean_loc')

In [22]:
generar_txt("mios/w2v_test_text.txt",embeddings_test_text)
generar_txt("mios/w2v_test_text(norm).txt",normalized_embeddings_test_text)

In [23]:
generar_txt("mios/w2v_test_text.txt",embeddings_test_text)
generar_txt("mios/w2v_test_text(norm).txt",normalized_embeddings_test_text)

In [11]:
w2v_train_text = np.loadtxt("mios/w2v_train_text.txt").reshape(len(train.index),300)
w2v_train_text_norm = np.loadtxt("mios/w2v_train_text(norm).txt").reshape(len(train.index),300)
w2v_train_loc       = np.loadtxt("mios/w2v_train_loc.txt").reshape(len(train.index),300)
w2v_train_loc_norm  = np.loadtxt("mios/w2v_train_loc(norm).txt").reshape(len(train.index),300)

w2v_test_text       = np.loadtxt("mios/w2v_test_text.txt").reshape(len(test.index),300)
w2v_test_text_norm  = np.loadtxt("mios/w2v_test_text(norm).txt").reshape(len(test.index),300)
w2v_test_loc        = np.loadtxt("mios/w2v_test_text.txt").reshape(len(test.index),300)
w2v_test_loc_norm   = np.loadtxt("mios/w2v_test_text(norm).txt").reshape(len(test.index),300)

In [16]:
w2v_test_loc_norm.shape

(3263, 300)

#### entreno mi set de datos usando los sin normalizar

In [36]:
X, y = np.concatenate( (w2v_train_text,w2v_train_loc) , axis=1) , train['target'].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [38]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 15, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7782152230971129
Precision Score : 0.8045112781954887
Recall Score : 0.6465256797583081
F1 Score : 0.7169179229480735


### ahora lo hago con los datos normalizados



In [41]:
X, y = np.concatenate( (w2v_train_loc_norm,w2v_train_text_norm) , axis=1) , train['target'].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [42]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 15, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7716535433070866
Precision Score : 0.8054474708171206
Recall Score : 0.6253776435045317
F1 Score : 0.7040816326530612
