# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.linear_model import LogisticRegression
from gensim.parsing.preprocessing import STOPWORDS
import lightgbm as lgb
from catboost import CatBoostRegressor
import tensorflow_hub as hub
import tensorflow as tf
%matplotlib inline

# Se leen los .csv

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test = pd.read_csv('csv/test.csv')

In [5]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [None]:
!pip install "tensorflow>=1.7.0"
!pip install tensorflow-hub

In [6]:
def clean_text(df):
    
    tweets = df['text'].str.split()
    clean_tweets = []

    for tweet in tweets:
        clean_tweet = []
        for word in tweet:
            clean_word = re.sub('[^a-zA-Z]','', word)
            if(clean_word != '')&(clean_word not in STOPWORDS):
                clean_tweet.append(clean_word.lower())
        clean_tweets.append(" ".join(clean_tweet))

    df['clean_text'] = clean_tweets
    #df.set_index('id', inplace=True)
    df.drop(columns=['keyword','location','text'], inplace=True)
    
    return df

In [7]:
train = clean_text(train)
train.head()

Unnamed: 0,id,target,clean_text
0,1,1,our deeds reason earthquake may allah forgive
1,4,1,forest near la ronge sask canada
2,5,1,all residents asked shelter place notified off...
3,6,1,people receive wildfires evacuation orders cal...
4,7,1,just got sent photo ruby alaska smoke wildfire...


In [12]:
test = clean_text(test)
test.head()

Unnamed: 0,id,clean_text
0,0,just happened terrible car crash
1,2,heard earthquake different cities stay safe
2,3,forest spot pond geese fleeing street i save
3,9,apocalypse lighting spokane wildfires
4,11,typhoon soudelor kills china taiwan


### Embedding de los tweets con ELMo

In [8]:
tf.compat.v1.disable_eager_execution()
elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [9]:
def elmo_vectoring(tweets):
    
    embeddings = elmo(tweets.tolist(), signature="default", as_dict=True)["elmo"]
    
    tf_session = tf.Session()
    tf_session.run(tf.global_variables_initializer())
    tf_session.run(tf.tables_initializer())
    
    return tf_session.run(tf.reduce_mean(embeddings,1))

In [10]:
list_train = [train[i:i+100] for i in range(0,train.shape[0],100)]
list_test = [test[i:i+100] for i in range(0,test.shape[0],100)]

In [11]:
elmo_train = [elmo_vectoring(x['clean_tweet']) for x in list_train]
elmo_test = [elmo_vectoring(x['clean_tweet']) for x in list_test]

NameError: name 'elmo_vectors' is not defined

In [None]:
elmo_train_new = np.concatenate(elmo_train, axis = 0)
elmo_test_new = np.concatenate(elmo_train, axis = 0)

In [None]:
train_embedding_file = open("train_embedding_file.txt", "w")

for i in elmo_train_new:
    np.savetxt(train_embedding_file, i)

train_embedding_file.close()

In [None]:
test_embedding_file = open("test_embedding_file.txt", "w")

for i in elmo_test_new:
    np.savetxt(test_embedding_file, i)

test_embedding_file.close()

### Se entrena el modelo

In [None]:
X, y = elmo_train_new, train.iloc[:,1].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=482)

In [None]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 20, alpha = 10, n_estimators = 80)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

### Predicciones

In [None]:
test['target'] = xg_reg.predict(normalized_embeddings_test).round().astype(int)

In [None]:
test.drop(columns=['clean_text'], inplace=True)
test.set_index('id', inplace=True)
test.head()

In [None]:
final = test.to_csv('csv/submission_elmo.csv')