# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import pickle
import nltk
import spacy
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

# Se leen los .csv

In [6]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [7]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [8]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [9]:
test = pd.read_csv('csv/test.csv')

In [10]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [None]:
# Se leen las abreviaturas 
file = open('abreviaturas.pkl','rb')
abbreviations = pickle.load(file)
file.close()

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [11]:
!pip install "tensorflow>=1.7.0"
!pip install tensorflow-hub



In [54]:
# Devuelve lista de tweets con abreviaturas expandidas
def expand_abbreviations(sentences):
    expanded_sentences = []
    
    for sentence in sentences:
        expanded_sentence = []
        
        for word in sentence:
            if word in abbreviations:
                word = word.replace(word,abbreviations[word])
            expanded_sentence.append(word)
            
        tokenized = nltk.word_tokenize(" ".join(expanded_sentence))
        expanded_sentences.append(tokenized)
        
    return expanded_sentences

In [76]:
#Devuelve los tweets en minuscula eliminando simbolos y numeros
def remove_symbols_and_numbers(sentences):
    clean_sentences = []

    for sentence in sentences:
        clean_sentence = []
    
        for word in sentence:
            clean_word = re.sub('[^a-zA-Z]',' ', word)
            if(clean_word != ' '):
                clean_sentence.append(clean_word.lower())
                
        tokenized = nltk.word_tokenize(" ".join(clean_sentence))
        clean_sentences.append(tokenized)

    return clean_sentences

In [104]:
def remove_stopwords(sentences):
    clean_sentences = []
    for sentence in words:
        clean_sentence = []
        for word in sentence:
            if word not in STOPWORDS:
                clean_sentence.append(word)
        clean_sentences.append(clean_sentence)
    return clean_sentences

In [84]:
# PARA QUE CORRA ESTA FUNCION HAY QUE PONER EN CONSOLA:
# pip install spacy
# spacy download en
# Devuelve los tweets lematizados
def lemmatize_tweets(sentences):
    nlp = spacy.load('en')
    lemmatized = []
    
    for sentence in sentences:
        sentence = ' '.join(sentence)
        doc = nlp(sentence)
        lemmatized.append([token.lemma_ for token in doc])
    
    return lemmatized    

In [86]:
#Devuelve una lista de tweets dejando solo letras y lematizando las palabras
def clean_text(df):
    
    words = df['text'].str.split()
    words = remove_symbols_and_numbers(words)
    words = expand_abbreviations(words)
    words = remove_stopwords(words)
    
    return lemmatize_tweets(words)

In [13]:
train['clean_text'] = clean_text(train)
train.head()

Unnamed: 0,id,target,clean_text
0,1,1,our deeds reason earthquake may allah forgive
1,4,1,forest near la ronge sask canada
2,5,1,all residents asked shelter place notified off...
3,6,1,people receive wildfires evacuation orders cal...
4,7,1,just got sent photo ruby alaska smoke wildfire...


In [14]:
test['clean_text'] = clean_text(test)
test.head()

Unnamed: 0,id,clean_text
0,0,just happened terrible car crash
1,2,heard earthquake different cities stay safe
2,3,forest spot pond geese fleeing street i save
3,9,apocalypse lighting spokane wildfires
4,11,typhoon soudelor kills china taiwan


### Embedding de los tweets con ELMo

In [15]:
# tf.compat.v1.disable_eager_execution()
# elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [16]:
# def elmo_vectoring(tweets):
    
#     embeddings = elmo(tweets.tolist(), signature="default", as_dict=True)["elmo"]
    
#     tf_session = tf.compat.v1.Session()
#     tf_session.run(tf.compat.v1.global_variables_initializer())
#     tf_session.run(tf.compat.v1.tables_initializer())
    
#     return tf_session.run(tf.reduce_mean(embeddings,1))

In [17]:
# list_train = [train[i:i+100] for i in range(0,train.shape[0],100)]
# list_test = [test[i:i+100] for i in range(0,test.shape[0],100)]

In [18]:
# elmo_train = [elmo_vectoring(x['clean_text']) for x in list_train]
# elmo_test = [elmo_vectoring(x['clean_text']) for x in list_test]

In [19]:
# elmo_train_new = np.concatenate(elmo_train, axis = 0)
# elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [20]:
# train_embedding_file = open("train_embedding_file.txt", "w")

# for i in elmo_train_new:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [21]:
# test_embedding_file = open("test_embedding_file.txt", "w")

# for i in elmo_test_new:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

In [22]:
elmo_train = np.loadtxt("train_embedding_file.txt").reshape(len(train.index),1024)
elmo_train.shape

(7613, 1024)

In [23]:
elmo_test = np.loadtxt("test_embedding_file.txt").reshape(len(test.index),1024)
elmo_test.shape

(3263, 1024)

### Se entrena el modelo

In [24]:
X, y = elmo_train, train.iloc[:,1].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [25]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7742782152230971
Precision Score : 0.797752808988764
Recall Score : 0.6435045317220544
F1 Score : 0.7123745819397992


In [26]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=5, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  


Accuracy Score : 0.7637795275590551
Precision Score : 0.7594501718213058
Recall Score : 0.6676737160120846
F1 Score : 0.7106109324758842


In [27]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7598425196850394
Precision Score : 0.8425925925925926
Recall Score : 0.5498489425981873
F1 Score : 0.6654478976234003


In [28]:
catb = CatBoostRegressor(iterations=5, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4371077	total: 2.35s	remaining: 9.39s
1:	learn: 0.3975722	total: 4.8s	remaining: 7.2s
2:	learn: 0.3725510	total: 7.16s	remaining: 4.77s
3:	learn: 0.3535919	total: 9.53s	remaining: 2.38s
4:	learn: 0.3346758	total: 11.9s	remaining: 0us
Accuracy Score : 0.7545931758530183
Precision Score : 0.7608695652173914
Recall Score : 0.6344410876132931
F1 Score : 0.6919275123558485


In [29]:
gb = GradientBoostingRegressor(n_estimators=5, learning_rate=0.1, 
                                max_features=2, max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7191601049868767
Precision Score : 0.806282722513089
Recall Score : 0.4652567975830816
F1 Score : 0.5900383141762452


  y = column_or_1d(y, warn=True)


In [30]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Learning rate set to 0.5
0:	learn: 0.4391880	total: 2.31s	remaining: 9.25s
1:	learn: 0.4003186	total: 4.64s	remaining: 6.96s
2:	learn: 0.3760340	total: 7.06s	remaining: 4.71s
3:	learn: 0.3597639	total: 9.46s	remaining: 2.37s
4:	learn: 0.3446684	total: 11.8s	remaining: 0us
Accuracy Score : 0.8976377952755905
Precision Score : 0.966789667896679
Recall Score : 0.7915407854984894
F1 Score : 0.8704318936877076


In [31]:
df = pd.DataFrame(eclf2.predict(X),columns=["elmo_score"])
df.head()

Unnamed: 0,elmo_score
0,0.522046
1,0.53897
2,0.583167
3,0.65054
4,0.570466


In [32]:
final = df.to_csv('csv/solo_embedding_elmo_train.csv')

### Predicciones

In [33]:
test['target'] = eclf2.predict(elmo_test)

In [37]:
test.drop(columns=['clean_text'], inplace=True)
test.set_index('id', inplace=True)
test.head(10)

Unnamed: 0,id,clean_text,target
0,0,just happened terrible car crash,0.379233
1,2,heard earthquake different cities stay safe,0.367322
2,3,forest spot pond geese fleeing street i save,0.6787
3,9,apocalypse lighting spokane wildfires,0.430616
4,11,typhoon soudelor kills china taiwan,0.584599
5,12,were shakingits earthquake,0.442172
6,21,theyd probably life arsenal yesterday eh eh,0.338432
7,22,hey how,0.224773
8,27,what nice hat,0.254261
9,29,fuck,0.228584


In [38]:
final = test.to_csv('csv/submission_elmo.csv')

In [39]:
df = pd.DataFrame(eclf2.predict(elmo_test),columns=["elmo_score"])
df.head()
final = df.to_csv('csv/solo_embedding_elmo_test.csv')