# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

# Se lee el .csv

In [27]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [20]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [21]:
test = pd.read_csv('csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


# Se cargan los embeddings pre-entrenados

In [5]:
# descargar glove.twitter.27B.zip de aca https://nlp.stanford.edu/projects/glove/ pesa 1.75gb
glove_input = 'glove.twitter.27B.200d.txt'
word2vec_output = 'glove.twitter.27B.200d.txt.word2vec'
glove2word2vec(glove_input, word2vec_output)

(1193514, 200)

In [12]:
glove = KeyedVectors.load_word2vec_format(word2vec_output, binary=False)

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [23]:
long_embedding = 200

In [14]:
#Devuelve una lista de tweets dejando solo caracteres alfanumericos
def clean_text(df):
    words = df['text'].str.split()
    clean_words = []

    for sentence in words:
        clean_sentence = []
        for word in sentence:
            clean_word = re.sub('[^a-zA-Z]','', word)
            if(clean_word != '')&(clean_word not in STOPWORDS):
                clean_sentence.append(clean_word.lower())
        clean_words.append(clean_sentence)
    
    return clean_words

In [24]:
def tweets_embeddings(df):

    embeddings = np.zeros((len(df.index),long_embedding), dtype='float32') 
    normalized_embeddings = np.zeros((len(df.index),long_embedding),dtype='float32')

    contador = 0
    for tweet in df['clean_text']:
        cant_palabras = 0
        embedded_tweet = np.zeros((long_embedding,), dtype='float32')

        for word in tweet:
            if word in glove.vocab:
                embedded_tweet = np.add(glove[word],embedded_tweet)
                cant_palabras += 1     

        embeddings[contador] = embedded_tweet

        if cant_palabras!=0:
            normalized_embeddings[contador] = np.divide(embedded_tweet,cant_palabras)
        else:
            normalized_embeddings[contador] = embedded_tweet

        contador += 1    
        
    return embeddings, normalized_embeddings


In [16]:
# Devuelve un dataframe con todos los features considerados para el modelo
def generate_embeddings(df):
    
    df['clean_text'] = clean_text(df)
    df.drop(columns=['keyword','location','text'], inplace=True)
    df.set_index('id', inplace=True)
    
    return tweets_embeddings(df)
    

In [28]:
embeddings_train, normalized_embeddings_train = generate_embeddings(train)

In [29]:
train_embedding_file = open("train_embedding_file_glove.txt", "w")

for i in embeddings_train:
    np.savetxt(train_embedding_file, i)

train_embedding_file.close()

In [30]:
train_embedding_file = open("train_embedding_file_glove(norm).txt", "w")

for i in normalized_embeddings_train:
    np.savetxt(train_embedding_file, i)

train_embedding_file.close()

In [31]:
embeddings_test, normalized_embeddings_test = generate_embeddings(test)

In [32]:
test_embedding_file = open("test_embedding_file_glove.txt", "w")

for i in embeddings_test:
    np.savetxt(test_embedding_file, i)

test_embedding_file.close()

In [33]:
test_embedding_file = open("test_embedding_file_glove(norm).txt", "w")

for i in normalized_embeddings_test:
    np.savetxt(test_embedding_file, i)

test_embedding_file.close()

In [None]:
# embeddings_train = np.loadtxt("train_embedding_file_glove.txt").reshape(len(train.index),300)
# embeddings_train.shape

In [None]:
# normalized_embeddings_train = np.loadtxt("train_embedding_file_glove(norm).txt").reshape(len(train.index),300)
# normalized_embeddings_train.shape

In [None]:
# embeddings_test = np.loadtxt("test_embedding_file_glove.txt").reshape(len(test.index),300)
# embeddings_test.shape

In [None]:
# normalized_embeddings_test = np.loadtxt("test_embedding_file_glove(norm).txt").reshape(len(test.index),300)
# normalized_embeddings_test.shape

### Entrenamiento del set de datos considerando sólo 'embeddings'

In [34]:
X, y = normalized_embeddings_train, train.iloc[:,0].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [35]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 15, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7821522309711286
Precision Score : 0.800711743772242
Recall Score : 0.6716417910447762
F1 Score : 0.7305194805194805


In [36]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=5, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  


Accuracy Score : 0.7532808398950132
Precision Score : 0.7348242811501597
Recall Score : 0.6865671641791045
F1 Score : 0.7098765432098765


In [37]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7808398950131233
Precision Score : 0.9
Recall Score : 0.564179104477612
F1 Score : 0.6935779816513762


In [38]:
catb = CatBoostRegressor(iterations=5, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4278340	total: 591ms	remaining: 2.36s
1:	learn: 0.3873108	total: 1.1s	remaining: 1.65s
2:	learn: 0.3582024	total: 1.6s	remaining: 1.07s
3:	learn: 0.3391276	total: 2.11s	remaining: 527ms
4:	learn: 0.3244853	total: 2.61s	remaining: 0us
Accuracy Score : 0.7755905511811023
Precision Score : 0.7949640287769785
Recall Score : 0.6597014925373135
F1 Score : 0.7210440456769983


In [39]:
gb = GradientBoostingRegressor(n_estimators=5, learning_rate=0.1, 
                                max_features=2, max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7414698162729659
Precision Score : 0.8194444444444444
Recall Score : 0.5283582089552239
F1 Score : 0.6424682395644283


  y = column_or_1d(y, warn=True)


In [40]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Learning rate set to 0.5
0:	learn: 0.4303131	total: 494ms	remaining: 1.98s
1:	learn: 0.3886305	total: 985ms	remaining: 1.48s
2:	learn: 0.3634779	total: 1.48s	remaining: 986ms
3:	learn: 0.3458650	total: 1.97s	remaining: 492ms
4:	learn: 0.3319806	total: 2.46s	remaining: 0us
Accuracy Score : 0.9133858267716536
Precision Score : 0.9686411149825784
Recall Score : 0.8298507462686567
F1 Score : 0.8938906752411576


In [41]:
df = pd.DataFrame(eclf2.predict(X),columns=["glove_score"])
df.head()

Unnamed: 0,glove_score
0,0.556011
1,0.671471
2,0.655318
3,0.639987
4,0.562215


In [42]:
final = df.to_csv('csv/solo_embedding_glove_train.csv')

### Predicciones

In [43]:
test['target'] = eclf2.predict(normalized_embeddings_test)
test.drop(columns=['clean_text'], inplace=True)
test.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0.461348
2,0.51499
3,0.41244
9,0.620226
11,0.764246


In [44]:
final = test.to_csv('csv/submission_glove.csv')

In [45]:
df = pd.DataFrame(eclf2.predict(normalized_embeddings_test),columns=["glove_score"])
df.head()
final = df.to_csv('csv/solo_embedding_glove_test.csv')