# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.linear_model import LogisticRegression
from gensim.parsing.preprocessing import STOPWORDS
import lightgbm as lgb
from catboost import CatBoostRegressor
import tensorflow_hub as hub
import tensorflow as tf
%matplotlib inline

# Se leen los .csv

In [2]:
train = pd.read_csv('../csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test = pd.read_csv('../csv/test.csv')

In [5]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

#### Generación de embeddings

In [6]:
# !pip install "tensorflow>=1.7.0"
# !pip install tensorflow-hub

In [7]:
def clean_text(df):
    
    tweets = df['text'].str.split()
    clean_tweets = []

    for tweet in tweets:
        clean_tweet = []
        for word in tweet:
            clean_word = re.sub('[^a-zA-Z]','', word)
            if(clean_word != '')&(clean_word not in STOPWORDS):
                clean_tweet.append(clean_word.lower())
        clean_tweets.append(" ".join(clean_tweet))

    df['clean_text'] = clean_tweets
    #df.set_index('id', inplace=True)
    df.drop(columns=['keyword','location','text'], inplace=True)
    
    return df

In [8]:
train = clean_text(train)
train.head()

Unnamed: 0,id,target,clean_text
0,1,1,our deeds reason earthquake may allah forgive
1,4,1,forest near la ronge sask canada
2,5,1,all residents asked shelter place notified off...
3,6,1,people receive wildfires evacuation orders cal...
4,7,1,just got sent photo ruby alaska smoke wildfire...


In [9]:
test = clean_text(test)
test.head()

Unnamed: 0,id,clean_text
0,0,just happened terrible car crash
1,2,heard earthquake different cities stay safe
2,3,forest spot pond geese fleeing street i save
3,9,apocalypse lighting spokane wildfires
4,11,typhoon soudelor kills china taiwan


### Embedding de los tweets con ELMo

In [10]:
# tf.compat.v1.disable_eager_execution()
# elmo = hub.Module("https://tfhub.dev/google/elmo/2", trainable=True)

In [11]:
# def elmo_vectoring(tweets):
    
#     embeddings = elmo(tweets.tolist(), signature="default", as_dict=True)["elmo"]
    
#     tf_session = tf.compat.v1.Session()
#     tf_session.run(tf.compat.v1.global_variables_initializer())
#     tf_session.run(tf.compat.v1.tables_initializer())
    
#     return tf_session.run(tf.reduce_mean(embeddings,1))

In [12]:
# list_train = [train[i:i+100] for i in range(0,train.shape[0],100)]
# list_test = [test[i:i+100] for i in range(0,test.shape[0],100)]

In [13]:
# elmo_train = [elmo_vectoring(x['clean_text']) for x in list_train]
# elmo_test = [elmo_vectoring(x['clean_text']) for x in list_test]

In [14]:
# elmo_train_new = np.concatenate(elmo_train, axis = 0)
# elmo_test_new = np.concatenate(elmo_test, axis = 0)

In [15]:
# train_embedding_file = open("../train_embedding_file.txt", "w")

# for i in elmo_train_new:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [16]:
# test_embedding_file = open("../test_embedding_file.txt", "w")

# for i in elmo_test_new:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

#### Lectura de embeddings

In [17]:
elmo_train = np.loadtxt("../train_embedding_file_elmo.txt").reshape(len(train.index),1024)
elmo_train.shape

(7613, 1024)

In [18]:
elmo_test = np.loadtxt("../test_embedding_file_elmo.txt").reshape(len(test.index),1024)
elmo_test.shape

(3263, 1024)

### Se entrena el modelo

In [75]:
X, y = elmo_train, train['target'].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [76]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 30)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7979002624671916
Precision Score : 0.8153310104529616
Recall Score : 0.6985074626865672
F1 Score : 0.752411575562701


In [77]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=30, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  


Accuracy Score : 0.7808398950131233
Precision Score : 0.795774647887324
Recall Score : 0.6746268656716418
F1 Score : 0.7302100161550888


In [78]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 30)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7926509186351706
Precision Score : 0.8149466192170819
Recall Score : 0.6835820895522388
F1 Score : 0.7435064935064936


In [79]:
catb = CatBoostRegressor(iterations=30, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4376897	total: 4.66s	remaining: 2m 15s
1:	learn: 0.3980029	total: 9.16s	remaining: 2m 8s
2:	learn: 0.3714696	total: 13.8s	remaining: 2m 4s
3:	learn: 0.3562799	total: 18.4s	remaining: 1m 59s
4:	learn: 0.3411295	total: 23.4s	remaining: 1m 56s
5:	learn: 0.3277970	total: 28.3s	remaining: 1m 53s
6:	learn: 0.3133969	total: 32.8s	remaining: 1m 47s
7:	learn: 0.3040964	total: 37.6s	remaining: 1m 43s
8:	learn: 0.2955155	total: 42.3s	remaining: 1m 38s
9:	learn: 0.2832384	total: 47s	remaining: 1m 33s
10:	learn: 0.2734364	total: 51.9s	remaining: 1m 29s
11:	learn: 0.2666857	total: 56.7s	remaining: 1m 25s
12:	learn: 0.2582159	total: 1m 4s	remaining: 1m 23s
13:	learn: 0.2524102	total: 1m 11s	remaining: 1m 21s
14:	learn: 0.2464651	total: 1m 16s	remaining: 1m 16s
15:	learn: 0.2408896	total: 1m 19s	remaining: 1m 9s
16:	learn: 0.2353470	total: 1m 21s	remaining: 1m 2s
17:	learn: 0.2254909	total: 1m 24s	remaining: 56.2s
18:	learn: 0.2198078	total: 1m 26s	remaining: 50.3s

In [80]:
gb = GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, 
                                max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7637795275590551
Precision Score : 0.7491961414790996
Recall Score : 0.6955223880597015
F1 Score : 0.7213622291021672


In [81]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Learning rate set to 0.5
0:	learn: 0.4371966	total: 2.74s	remaining: 1m 19s
1:	learn: 0.4005837	total: 5.46s	remaining: 1m 16s
2:	learn: 0.3755287	total: 8.23s	remaining: 1m 14s
3:	learn: 0.3587720	total: 10.9s	remaining: 1m 11s
4:	learn: 0.3452809	total: 13.6s	remaining: 1m 7s
5:	learn: 0.3341491	total: 16.3s	remaining: 1m 5s
6:	learn: 0.3212069	total: 19s	remaining: 1m 2s
7:	learn: 0.3146599	total: 21.8s	remaining: 1m
8:	learn: 0.3062450	total: 24.7s	remaining: 57.6s
9:	learn: 0.2955953	total: 27.3s	remaining: 54.6s
10:	learn: 0.2886078	total: 30.1s	remaining: 51.9s
11:	learn: 0.2803598	total: 32.9s	remaining: 49.3s
12:	learn: 0.2733440	total: 35.6s	remaining: 46.5s
13:	learn: 0.2652583	total: 38.4s	remaining: 43.8s
14:	learn: 0.2571470	total: 41.1s	remaining: 41.1s
15:	learn: 0.2528571	total: 43.9s	remaining: 38.4s
16:	learn: 0.2457762	total: 46.7s	remaining: 35.7s
17:	learn: 0.2390287	total: 49.4s	remaining: 32.9s
18:	learn: 0.2353652	total: 52.1s	remaining: 30.2s
19:	learn: 0.2298

In [82]:
df = pd.DataFrame(eclf2.predict(X),columns=["elmo_score"])
df.head()

Unnamed: 0,elmo_score
0,0.715482
1,0.783273
2,0.814854
3,0.838757
4,0.830163


In [83]:
final = df.to_csv('../csv/solo_embedding_elmo_train.csv')

### Predicciones

In [84]:
df = pd.DataFrame(eclf2.predict(elmo_test),columns=["elmo_score"])
df.head()
final = df.to_csv('../csv/solo_embedding_elmo_test.csv')

# Averaging de los 5 árboles

In [85]:
aux = pd.DataFrame()
aux['xgboost'] = xg_reg.predict(elmo_train)
aux['random_forest'] = rf_model.predict(elmo_train)
aux['catboost'] = catb.predict(elmo_train)
aux['gradient_boosting'] = gb.predict(elmo_train)
aux['lightgbm'] = lgb_class.predict(elmo_train)
div = aux.shape[1]

In [86]:
suma = aux.sum(axis=1)
train_preds = (suma/div)      
train_preds

0       0.773026
1       0.684040
2       0.826939
3       0.805688
4       0.813628
          ...   
7608    0.962927
7609    0.898941
7610    0.825236
7611    0.918542
7612    0.962243
Length: 7613, dtype: float64

In [87]:
print('Accuracy Score : ' + str(accuracy_score(train['target'],train_preds.round())))
print('Precision Score : ' + str(precision_score(train['target'],train_preds.round())))
print('Recall Score : ' + str(recall_score(train['target'],train_preds.round())))
print('F1 Score : ' + str(f1_score(train['target'],train_preds.round())))

Accuracy Score : 0.9586234073295679
Precision Score : 0.9808067664281067
Recall Score : 0.9217364720269031
F1 Score : 0.950354609929078


In [88]:
aux = pd.DataFrame()
aux['xgboost'] = xg_reg.predict(elmo_test)
aux['random_forest'] = rf_model.predict(elmo_test)
aux['catboost'] = catb.predict(elmo_test)
aux['gradient_boosting'] = gb.predict(elmo_test)
aux['lightgbm'] = lgb_class.predict(elmo_test)
div = aux.shape[1]

In [89]:
suma = aux.sum(axis=1)
test['target'] = (suma/div)
test_preds = test['target']           
test_preds

0       0.326994
1       0.334088
2       0.750949
3       0.678941
4       0.793246
          ...   
3258    0.524425
3259    0.781783
3260    0.817613
3261    0.779158
3262    0.632328
Name: target, Length: 3263, dtype: float64

In [90]:
aux['target'] = test_preds
aux = aux['target'].to_frame()
aux.to_csv('../csv/avg_elmo_test.csv')