# Se importan las bibliotecas necesarias

In [77]:
import pandas as pd
import re
import numpy as np
import pickle
import nltk
import spacy
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

# Se lee el .csv

In [2]:
train = pd.read_csv('../csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [7]:
test = pd.read_csv('../csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [14]:
# Se leen las abreviaturas 
file = open('../abreviaturas.pkl','rb')
abbreviations = pickle.load(file)
file.close()

# Se cargan los embeddings pre-entrenados

In [8]:
#descargar de aca https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz, pesa 1.5gb
EMBEDDING_FILE = '../GoogleNews-vectors-negative300.bin.gz'
word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [9]:
long_embedding = 300

In [54]:
# Devuelve lista de tweets con abreviaturas expandidas
def expand_abbreviations(sentences):
    expanded_sentences = []
    
    for sentence in sentences:
        expanded_sentence = []
        
        for word in sentence:
            if word in abbreviations:
                word = word.replace(word,abbreviations[word])
            expanded_sentence.append(word)
            
        tokenized = nltk.word_tokenize(" ".join(expanded_sentence))
        expanded_sentences.append(tokenized)
        
    return expanded_sentences

In [76]:
#Devuelve los tweets en minuscula eliminando simbolos y numeros
def remove_symbols_and_numbers(sentences):
    clean_sentences = []

    for sentence in sentences:
        clean_sentence = []
    
        for word in sentence:
            clean_word = re.sub('[^a-zA-Z]',' ', word)
            if(clean_word != ' '):
                clean_sentence.append(clean_word.lower())
                
        tokenized = nltk.word_tokenize(" ".join(clean_sentence))
        clean_sentences.append(tokenized)

    return clean_sentences

In [104]:
def remove_stopwords(sentences):
    clean_sentences = []
    for sentence in words:
        clean_sentence = []
        for word in sentence:
            if word not in STOPWORDS:
                clean_sentence.append(word)
        clean_sentences.append(clean_sentence)
    return clean_sentences

In [84]:
# PARA QUE CORRA ESTA FUNCION HAY QUE PONER EN CONSOLA:
# pip install spacy
# spacy download en
# Devuelve los tweets lematizados
def lemmatize_tweets(sentences):
    nlp = spacy.load('en')
    lemmatized = []
    
    for sentence in sentences:
        sentence = ' '.join(sentence)
        doc = nlp(sentence)
        lemmatized.append([token.lemma_ for token in doc])
    
    return lemmatized    

In [86]:
#Devuelve una lista de tweets dejando solo letras y lematizando las palabras
def clean_text(df):
    
    words = df['text'].str.split()
    words = remove_symbols_and_numbers(words)
    words = expand_abbreviations(words)
    words = remove_stopwords(words)
    
    return lemmatize_tweets(words)

In [11]:
def tweets_embeddings(df):

    embeddings = np.zeros((len(df.index),long_embedding), dtype='float32') 
    normalized_embeddings = np.zeros((len(df.index),long_embedding),dtype='float32')

    contador = 0
    for tweet in df['clean_text']:
        cant_palabras = 0
        embedded_tweet = np.zeros((long_embedding,), dtype='float32')

        for word in tweet:
            if word in word2vec.vocab:
                embedded_tweet = np.add(word2vec[word],embedded_tweet)
                cant_palabras += 1     

        embeddings[contador] = embedded_tweet

        if cant_palabras!=0:
            normalized_embeddings[contador] = np.divide(embedded_tweet,cant_palabras)
        else:
            normalized_embeddings[contador] = embedded_tweet

        contador += 1    
        
    return embeddings, normalized_embeddings


In [12]:
# Devuelve un dataframe con todos los features considerados para el modelo
def generate_embeddings(df):
    
    df['clean_text'] = clean_text(df)
    df.drop(columns=['keyword','location','text'], inplace=True)
    df.set_index('id', inplace=True)
    
    return tweets_embeddings(df)
    

In [13]:
embeddings_train, normalized_embeddings_train = generate_embeddings(train)

In [14]:
train_embedding_file = open("../train_embedding_file_w2v.txt", "w")

for i in embeddings_train:
    np.savetxt(train_embedding_file, i)

train_embedding_file.close()

In [15]:
train_embedding_file = open("../train_embedding_file_w2v(norm).txt", "w")

for i in normalized_embeddings_train:
    np.savetxt(train_embedding_file, i)

train_embedding_file.close()

In [16]:
embeddings_test, normalized_embeddings_test = generate_embeddings(test)

In [17]:
test_embedding_file = open("../test_embedding_file_w2v.txt", "w")

for i in embeddings_test:
    np.savetxt(test_embedding_file, i)

test_embedding_file.close()

In [18]:
test_embedding_file = open("../test_embedding_file_w2v(norm).txt", "w")

for i in normalized_embeddings_test:
    np.savetxt(test_embedding_file, i)

test_embedding_file.close()

In [19]:
embeddings_train = np.loadtxt("../train_embedding_file_w2v.txt").reshape(len(train.index),long_embedding)
embeddings_train.shape

(7613, 300)

In [20]:
normalized_embeddings_train = np.loadtxt("../train_embedding_file_w2v(norm).txt").reshape(len(train.index),long_embedding)
normalized_embeddings_train.shape

(7613, 300)

In [21]:
embeddings_test = np.loadtxt("../test_embedding_file_w2v.txt").reshape(len(test.index),long_embedding)
embeddings_test.shape

(3263, 300)

In [22]:
normalized_embeddings_test = np.loadtxt("../test_embedding_file_w2v(norm).txt").reshape(len(test.index),long_embedding)
normalized_embeddings_test.shape

(3263, 300)

### Entrenamiento del set de datos considerando sólo 'embeddings'

In [23]:
X, y = normalized_embeddings_train, train.iloc[:,4].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [24]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 15, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7834645669291339
Precision Score : 0.8074074074074075
Recall Score : 0.6586102719033232
F1 Score : 0.7254575707154741


In [25]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=5, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  


Accuracy Score : 0.7650918635170604
Precision Score : 0.7567567567567568
Recall Score : 0.676737160120846
F1 Score : 0.7145135566188199


In [26]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7664041994750657
Precision Score : 0.8493150684931506
Recall Score : 0.5619335347432024
F1 Score : 0.6763636363636364


In [27]:
catb = CatBoostRegressor(iterations=5, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4336822	total: 751ms	remaining: 3s
1:	learn: 0.3964386	total: 1.44s	remaining: 2.16s
2:	learn: 0.3712556	total: 2.12s	remaining: 1.41s
3:	learn: 0.3503086	total: 2.81s	remaining: 704ms
4:	learn: 0.3343915	total: 3.49s	remaining: 0us
Accuracy Score : 0.7677165354330708
Precision Score : 0.7673611111111112
Recall Score : 0.6676737160120846
F1 Score : 0.7140549273021002


In [28]:
gb = GradientBoostingRegressor(n_estimators=5, learning_rate=0.1, 
                                max_features=2, max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7270341207349081
Precision Score : 0.8219895287958116
Recall Score : 0.4743202416918429
F1 Score : 0.6015325670498084


  y = column_or_1d(y, warn=True)


In [29]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Learning rate set to 0.5
0:	learn: 0.4347102	total: 692ms	remaining: 2.77s
1:	learn: 0.3981527	total: 1.4s	remaining: 2.1s
2:	learn: 0.3726670	total: 2.11s	remaining: 1.41s
3:	learn: 0.3534779	total: 2.79s	remaining: 698ms
4:	learn: 0.3399541	total: 3.48s	remaining: 0us
Accuracy Score : 0.89501312335958
Precision Score : 0.9372822299651568
Recall Score : 0.8126888217522659
F1 Score : 0.8705501618122976


In [30]:
df = pd.DataFrame(eclf2.predict(X),columns=["word2vec_score"])
df.head()

Unnamed: 0,word2vec_score
0,0.556296
1,0.674412
2,0.725413
3,0.758587
4,0.59832


In [31]:
final = df.to_csv('../csv/solo_embedding_word2vec_train_lemm.csv')

### Predicciones

In [32]:
normalized_embeddings_test.shape

(3263, 300)

In [33]:
test['target'] = eclf2.predict(normalized_embeddings_test)
test.drop(columns=['clean_text'], inplace=True)
test.head(10)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,0.534202
1,2,,,"Heard about #earthquake is different cities, s...",0.583796
2,3,,,"there is a forest fire at spot pond, geese are...",0.527839
3,9,,,Apocalypse lighting. #Spokane #wildfires,0.548836
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,0.726522
5,12,,,We're shaking...It's an earthquake,0.624894
6,21,,,They'd probably still show more life than Arse...,0.237786
7,22,,,Hey! How are you?,0.295767
8,27,,,What a nice hat?,0.281688
9,29,,,Fuck off!,0.337627


In [34]:
final = test.to_csv('../csv/submission_word2vec_lemm.csv')

In [35]:
df = pd.DataFrame(eclf2.predict(normalized_embeddings_test),columns=["word2vec_score"])
df.head()
final = df.to_csv('../csv/solo_embedding_word2vec_test_lemm.csv')