# Se importan las bibliotecas necesarias

In [23]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

# Se lee el .csv

In [42]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [25]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [43]:
test = pd.read_csv('csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


# Se cargan los embeddings pre-entrenados

In [5]:
#descargar de aca https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz, pesa 1.5gb
EMBEDDING_FILE = 'GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [27]:
long_embedding = 300

In [28]:
#Devuelve una lista de tweets dejando solo caracteres alfanumericos
def clean_text(df):
    words = df['text'].str.split()
    clean_words = []

    for sentence in words:
        clean_sentence = []
        for word in sentence:
            clean_word = re.sub('[^a-zA-Z]','', word)
            if(clean_word != '')&(clean_word not in STOPWORDS):
                clean_sentence.append(clean_word.lower())
        clean_words.append(clean_sentence)
    
    return clean_words

In [41]:
def tweets_embeddings(df):

    embeddings = np.zeros((len(df.index),long_embedding), dtype='float32') 
    normalized_embeddings = np.zeros((len(df.index),long_embedding),dtype='float32')

    contador = 0
    for tweet in df['clean_text']:
        cant_palabras = 0
        embedded_tweet = np.zeros((long_embedding,), dtype='float32')

        for word in tweet:
            if word in word2vec.vocab:
                embedded_tweet = np.add(word2vec[word],embedded_tweet)
                cant_palabras += 1     

        embeddings[contador] = embedded_tweet

        if cant_palabras!=0:
            normalized_embeddings[contador] = np.divide(embedded_tweet,cant_palabras)
        else:
            normalized_embeddings[contador] = embedded_tweet

        contador += 1    
        
    return embeddings, normalized_embeddings


In [30]:
# Devuelve un dataframe con todos los features considerados para el modelo
def generate_embeddings(df):
    
    df['clean_text'] = clean_text(df)
    df.drop(columns=['keyword','location','text'], inplace=True)
    df.set_index('id', inplace=True)
    
    return tweets_embeddings(df)
    

In [44]:
embeddings_train, normalized_embeddings_train = generate_embeddings(train)

In [45]:
embeddings_test, normalized_embeddings_test = generate_embeddings(test)

### Entrenamiento del set de datos considerando sólo 'embeddings'

In [46]:
X, y = embeddings_train, train.iloc[:,0].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=41)

In [47]:
xg_reg = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 20, alpha = 10, n_estimators = 30)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds)))
print('Precision Score : ' + str(precision_score(y_test,preds)))
print('Recall Score : ' + str(recall_score(y_test,preds)))
print('F1 Score : ' + str(f1_score(y_test,preds)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7900262467191601
Precision Score : 0.8066914498141264
Recall Score : 0.6676923076923077
F1 Score : 0.7306397306397308


In [48]:
X, y = normalized_embeddings_train, train.iloc[:,0].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=41)

In [49]:
xg_reg = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 20, alpha = 10, n_estimators = 30)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds)))
print('Precision Score : ' + str(precision_score(y_test,preds)))
print('Recall Score : ' + str(recall_score(y_test,preds)))
print('F1 Score : ' + str(f1_score(y_test,preds)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy Score : 0.8044619422572179
Precision Score : 0.8358778625954199
Recall Score : 0.6738461538461539
F1 Score : 0.7461669505962522


In [50]:
df = pd.DataFrame(xg_reg.predict(X),columns=["word2vec_score"])
df.head()

Unnamed: 0,word2vec_score
0,1
1,1
2,1
3,1
4,1


In [51]:
final = df.to_csv('csv/solo_embedding_word2vec_train.csv')

### Predicciones

In [52]:
normalized_embeddings_test.shape

(3263, 300)

In [53]:
test['target'] = xg_reg.predict(normalized_embeddings_test)
test.drop(columns=['clean_text'], inplace=True)
test.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,1
2,0
3,1
9,1
11,1


In [54]:
final = test.to_csv('csv/submission_word2vec.csv')

In [55]:
df = pd.DataFrame(xg_reg.predict(normalized_embeddings_test),columns=["word2vec_score"])
df.head()
final = df.to_csv('csv/solo_embedding_word2vec_test.csv')