# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import pickle
import nltk
import spacy
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

# Se lee el .csv

In [3]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [6]:
test = pd.read_csv('csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [None]:
# Se leen las abreviaturas 
file = open('abreviaturas.pkl','rb')
abbreviations = pickle.load(file)
file.close()

# Se cargan los embeddings pre-entrenados

In [7]:
# # descargar glove.twitter.27B.zip de aca https://nlp.stanford.edu/projects/glove/ pesa 1.75gb
# glove_input = 'glove.twitter.27B.200d.txt'
# word2vec_output = 'glove.twitter.27B.200d.txt.word2vec'
# glove2word2vec(glove_input, word2vec_output)

In [8]:
# glove = KeyedVectors.load_word2vec_format(word2vec_output, binary=False)

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [9]:
long_embedding = 200

In [54]:
# Devuelve lista de tweets con abreviaturas expandidas
def expand_abbreviations(sentences):
    expanded_sentences = []
    
    for sentence in sentences:
        expanded_sentence = []
        
        for word in sentence:
            if word in abbreviations:
                word = word.replace(word,abbreviations[word])
            expanded_sentence.append(word)
            
        tokenized = nltk.word_tokenize(" ".join(expanded_sentence))
        expanded_sentences.append(tokenized)
        
    return expanded_sentences

In [76]:
#Devuelve los tweets en minuscula eliminando simbolos y numeros
def remove_symbols_and_numbers(sentences):
    clean_sentences = []

    for sentence in sentences:
        clean_sentence = []
    
        for word in sentence:
            clean_word = re.sub('[^a-zA-Z]',' ', word)
            if(clean_word != ' '):
                clean_sentence.append(clean_word.lower())
                
        tokenized = nltk.word_tokenize(" ".join(clean_sentence))
        clean_sentences.append(tokenized)

    return clean_sentences

In [104]:
def remove_stopwords(sentences):
    clean_sentences = []
    for sentence in words:
        clean_sentence = []
        for word in sentence:
            if word not in STOPWORDS:
                clean_sentence.append(word)
        clean_sentences.append(clean_sentence)
    return clean_sentences

In [84]:
# PARA QUE CORRA ESTA FUNCION HAY QUE PONER EN CONSOLA:
# pip install spacy
# spacy download en
# Devuelve los tweets lematizados
def lemmatize_tweets(sentences):
    nlp = spacy.load('en')
    lemmatized = []
    
    for sentence in sentences:
        sentence = ' '.join(sentence)
        doc = nlp(sentence)
        lemmatized.append([token.lemma_ for token in doc])
    
    return lemmatized    

In [86]:
#Devuelve una lista de tweets dejando solo letras y lematizando las palabras
def clean_text(df):
    
    words = df['text'].str.split()
    words = remove_symbols_and_numbers(words)
    words = expand_abbreviations(words)
    words = remove_stopwords(words)
    
    return lemmatize_tweets(words)

In [11]:
# def tweets_embeddings(df):

#     embeddings = np.zeros((len(df.index),long_embedding), dtype='float32') 
#     normalized_embeddings = np.zeros((len(df.index),long_embedding),dtype='float32')

#     contador = 0
#     for tweet in df['clean_text']:
#         cant_palabras = 0
#         embedded_tweet = np.zeros((long_embedding,), dtype='float32')

#         for word in tweet:
#             if word in glove.vocab:
#                 embedded_tweet = np.add(glove[word],embedded_tweet)
#                 cant_palabras += 1     

#         embeddings[contador] = embedded_tweet

#         if cant_palabras!=0:
#             normalized_embeddings[contador] = np.divide(embedded_tweet,cant_palabras)
#         else:
#             normalized_embeddings[contador] = embedded_tweet

#         contador += 1    
        
#     return embeddings, normalized_embeddings


In [12]:
# # Devuelve un dataframe con todos los features considerados para el modelo
# def generate_embeddings(df):
    
#     df['clean_text'] = clean_text(df)
#     df.drop(columns=['keyword','location','text'], inplace=True)
#     df.set_index('id', inplace=True)
    
#     return tweets_embeddings(df)
    

In [13]:
# embeddings_train, normalized_embeddings_train = generate_embeddings(train)

In [14]:
# train_embedding_file = open("train_embedding_file_glove.txt", "w")

# for i in embeddings_train:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [15]:
# train_embedding_file = open("train_embedding_file_glove(norm).txt", "w")

# for i in normalized_embeddings_train:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [16]:
# embeddings_test, normalized_embeddings_test = generate_embeddings(test)

In [17]:
# test_embedding_file = open("test_embedding_file_glove.txt", "w")

# for i in embeddings_test:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

In [18]:
# test_embedding_file = open("test_embedding_file_glove(norm).txt", "w")

# for i in normalized_embeddings_test:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

In [20]:
embeddings_train = np.loadtxt("train_embedding_file_glove.txt").reshape(len(train.index),long_embedding)
embeddings_train.shape

(7613, 200)

In [21]:
normalized_embeddings_train = np.loadtxt("train_embedding_file_glove(norm).txt").reshape(len(train.index),long_embedding)
normalized_embeddings_train.shape

(7613, 200)

In [22]:
embeddings_test = np.loadtxt("test_embedding_file_glove.txt").reshape(len(test.index),long_embedding)
embeddings_test.shape

(3263, 200)

In [23]:
normalized_embeddings_test = np.loadtxt("test_embedding_file_glove(norm).txt").reshape(len(test.index),long_embedding)
normalized_embeddings_test.shape

(3263, 200)

### Entrenamiento del set de datos considerando sólo 'embeddings'

In [27]:
X, y = normalized_embeddings_train, train.iloc[:,4].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [29]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 15, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7847769028871391
Precision Score : 0.8104089219330854
Recall Score : 0.6586102719033232
F1 Score : 0.7266666666666667


In [30]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=5, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  


Accuracy Score : 0.7755905511811023
Precision Score : 0.761437908496732
Recall Score : 0.7039274924471299
F1 Score : 0.7315541601255887


In [31]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7939632545931758
Precision Score : 0.8990825688073395
Recall Score : 0.5921450151057401
F1 Score : 0.7140255009107469


In [32]:
catb = CatBoostRegressor(iterations=5, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4270643	total: 534ms	remaining: 2.14s
1:	learn: 0.3905892	total: 1.01s	remaining: 1.52s
2:	learn: 0.3615000	total: 1.48s	remaining: 986ms
3:	learn: 0.3415294	total: 1.95s	remaining: 488ms
4:	learn: 0.3245570	total: 2.42s	remaining: 0us
Accuracy Score : 0.7782152230971129
Precision Score : 0.7934782608695652
Recall Score : 0.6616314199395771
F1 Score : 0.7215815485996705


In [33]:
gb = GradientBoostingRegressor(n_estimators=5, learning_rate=0.1, 
                                max_features=2, max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7493438320209974
Precision Score : 0.8431372549019608
Recall Score : 0.5196374622356495
F1 Score : 0.6429906542056074


  y = column_or_1d(y, warn=True)


In [34]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Learning rate set to 0.5
0:	learn: 0.4293841	total: 479ms	remaining: 1.92s
1:	learn: 0.3873705	total: 968ms	remaining: 1.45s
2:	learn: 0.3594870	total: 1.44s	remaining: 960ms
3:	learn: 0.3442441	total: 1.91s	remaining: 478ms
4:	learn: 0.3286493	total: 2.39s	remaining: 0us
Accuracy Score : 0.905511811023622
Precision Score : 0.96415770609319
Recall Score : 0.8126888217522659
F1 Score : 0.8819672131147541


In [35]:
df = pd.DataFrame(eclf2.predict(X),columns=["glove_score"])
df.head()

Unnamed: 0,glove_score
0,0.587237
1,0.676435
2,0.668918
3,0.656915
4,0.573899


In [36]:
final = df.to_csv('csv/solo_embedding_glove_train.csv')

### Predicciones

In [39]:
# test['target'] = eclf2.predict(normalized_embeddings_test)
# test.drop(columns=['clean_text'], inplace=True)
test.head(10)

Unnamed: 0,id,keyword,location,text,target
0,0,,,Just happened a terrible car crash,0.574705
1,2,,,"Heard about #earthquake is different cities, s...",0.506844
2,3,,,"there is a forest fire at spot pond, geese are...",0.411923
3,9,,,Apocalypse lighting. #Spokane #wildfires,0.630913
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,0.705246
5,12,,,We're shaking...It's an earthquake,0.676676
6,21,,,They'd probably still show more life than Arse...,0.309746
7,22,,,Hey! How are you?,0.296216
8,27,,,What a nice hat?,0.258178
9,29,,,Fuck off!,0.289213


In [40]:
final = test.to_csv('csv/submission_glove.csv')

In [41]:
df = pd.DataFrame(eclf2.predict(normalized_embeddings_test),columns=["glove_score"])
df.head()
final = df.to_csv('csv/solo_embedding_glove_test.csv')