In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.linear_model import LogisticRegression
from gensim.parsing.preprocessing import STOPWORDS
from bert_serving.client import BertClient
import lightgbm as lgb
from catboost import CatBoostRegressor
import tensorflow_hub as hub
import tensorflow as tf
%matplotlib inline

# Se leen los .csv

In [4]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [5]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
test = pd.read_csv('csv/test.csv')

In [7]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

In [8]:
# !pip install bert-serving-server  # server
# !pip install bert-serving-client  # client, independent of `bert-serving-server`

In [10]:
!wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip && unzip uncased_L-12_H-768_A-12.zip

--2020-07-27 04:04:20--  https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
Resolviendo storage.googleapis.com (storage.googleapis.com)... 172.217.172.48, 172.217.172.112, 172.217.172.80, ...
Conectando con storage.googleapis.com (storage.googleapis.com)[172.217.172.48]:443... conectado.
Petición HTTP enviada, esperando respuesta... 200 OK
Longitud: 407727028 (389M) [application/zip]
Guardando como: “uncased_L-12_H-768_A-12.zip”


2020-07-27 04:05:21 (6,38 MB/s) - “uncased_L-12_H-768_A-12.zip” guardado [407727028/407727028]

Archive:  uncased_L-12_H-768_A-12.zip
   creating: uncased_L-12_H-768_A-12/
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.meta  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  
  inflating: uncased_L-12_H-768_A-12/vocab.txt  
  inflating: uncased_L-12_H-768_A-12/bert_model.ckpt.index  
  inflating: uncased_L-12_H-768_A-12/bert_config.json  


In [11]:
!bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -num_worker=2 -max_seq_len 50

  'Feel free to submit an issue at https://github.com/hanxiao/bert-as-service/issues/' % tf.__version__)
usage: /home/gonzams/anaconda3/bin/bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -num_worker=2 -max_seq_len 50
                 ARG   VALUE
__________________________________________________
           ckpt_name = bert_model.ckpt
         config_name = bert_config.json
                cors = *
                 cpu = False
          device_map = []
       do_lower_case = True
  fixed_embed_length = False
                fp16 = False
 gpu_memory_fraction = 0.5
       graph_tmp_dir = None
    http_max_connect = 10
           http_port = None
        mask_cls_sep = False
      max_batch_size = 256
         max_seq_len = 50
           model_dir = uncased_L-12_H-768_A-12/
no_position_embeddings = False
    no_special_token = False
          num_worker = 2
       pooling_layer = [-2]
    pooling_strategy = REDUCE_MEAN
                port = 5555
            port_out = 5556
       

In [94]:
#descargar y descomprimir modelo pre-entrenado https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip pesa 1.2gb
# !bert-serving-start -model_dir uncased_L-24_H-1024_A-16 -num_worker=2 -max_seq_len 50

  'Feel free to submit an issue at https://github.com/hanxiao/bert-as-service/issues/' % tf.__version__)
usage: /home/gonzams/anaconda3/bin/bert-serving-start -model_dir uncased_L-24_H-1024_A-16 -num_worker=2 -max_seq_len 50
                 ARG   VALUE
__________________________________________________
           ckpt_name = bert_model.ckpt
         config_name = bert_config.json
                cors = *
                 cpu = False
          device_map = []
       do_lower_case = True
  fixed_embed_length = False
                fp16 = False
 gpu_memory_fraction = 0.5
       graph_tmp_dir = None
    http_max_connect = 10
           http_port = None
        mask_cls_sep = False
      max_batch_size = 256
         max_seq_len = 50
           model_dir = uncased_L-24_H-1024_A-16
no_position_embeddings = False
    no_special_token = False
          num_worker = 2
       pooling_layer = [-2]
    pooling_strategy = REDUCE_MEAN
                port = 5555
            port_out = 5556
       

### Preparación del set de datos

In [7]:
def clean_text(df):
    
    tweets = df['text'].str.split()
    clean_tweets = []

    for tweet in tweets:
        clean_tweet = []
        for word in tweet:
            clean_word = re.sub('[^a-zA-Z]','', word)
            if(clean_word != '')&(clean_word not in STOPWORDS):
                clean_tweet.append(clean_word.lower())
        clean_tweets.append(" ".join(clean_tweet))

    df['clean_text'] = clean_tweets
    #df.set_index('id', inplace=True)
    df.drop(columns=['keyword','location','text'], inplace=True)
    
    return df

In [8]:
train = clean_text(train)
train.head()

Unnamed: 0,id,target,clean_text
0,1,1,our deeds reason earthquake may allah forgive
1,4,1,forest near la ronge sask canada
2,5,1,all residents asked shelter place notified off...
3,6,1,people receive wildfires evacuation orders cal...
4,7,1,just got sent photo ruby alaska smoke wildfire...


In [9]:
test = clean_text(test)
test.head()

Unnamed: 0,id,clean_text
0,0,just happened terrible car crash
1,2,heard earthquake different cities stay safe
2,3,forest spot pond geese fleeing street i save
3,9,apocalypse lighting spokane wildfires
4,11,typhoon soudelor kills china taiwan


### Embedding de los tweets con bert

In [89]:
bc = BertClient()

KeyboardInterrupt: 

In [13]:
bert_train = bc.encode(train['clean_text'].to_list())
bert_train.shape

In [None]:
bert_test = bc.encode(test['clean_text'].to_list())
bert_test.shape

In [15]:
train_embedding_file = open("train_embedding_file_bert.txt", "w")

for i in bert_train:
    np.savetxt(train_embedding_file, i)

train_embedding_file.close()

In [None]:
test_embedding_file = open("test_embedding_file_bert.txt", "w")

for i in bert_train:
    np.savetxt(test_embedding_file, i)

test_embedding_file.close()

In [17]:
# bert_train = np.loadtxt("train_embedding_file_bert.txt").reshape(len(train.index),1024)
# bert_train.shape

(7613, 1024)

In [None]:
# bert_train = np.loadtxt("train_embedding_file_bert.txt").reshape(len(train.index),1024)
# bert_train.shape

### Se entrena el modelo

In [19]:
X, y = bert_train, train.iloc[:,1].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [20]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7729658792650919
Precision Score : 0.8
Recall Score : 0.6447761194029851
F1 Score : 0.7140495867768595


In [21]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=5, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  


Accuracy Score : 0.7519685039370079
Precision Score : 0.7552447552447552
Recall Score : 0.6447761194029851
F1 Score : 0.6956521739130435


In [22]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7572178477690289
Precision Score : 0.8504672897196262
Recall Score : 0.5432835820895522
F1 Score : 0.6630236794171219


In [23]:
catb = CatBoostRegressor(iterations=5, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4376897	total: 2.71s	remaining: 10.8s
1:	learn: 0.3980029	total: 5.32s	remaining: 7.98s
2:	learn: 0.3714696	total: 8s	remaining: 5.33s
3:	learn: 0.3562799	total: 10.7s	remaining: 2.67s
4:	learn: 0.3411295	total: 13.3s	remaining: 0us
Accuracy Score : 0.7611548556430446
Precision Score : 0.7908745247148289
Recall Score : 0.6208955223880597
F1 Score : 0.6956521739130435


In [24]:
gb = GradientBoostingRegressor(n_estimators=5, learning_rate=0.1, 
                                max_features=2, max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7060367454068242
Precision Score : 0.8066298342541437
Recall Score : 0.43582089552238806
F1 Score : 0.5658914728682171


In [25]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Learning rate set to 0.5
0:	learn: 0.4371966	total: 2.83s	remaining: 11.3s
1:	learn: 0.4005837	total: 5.49s	remaining: 8.23s
2:	learn: 0.3755287	total: 8.15s	remaining: 5.43s
3:	learn: 0.3587720	total: 10.8s	remaining: 2.7s
4:	learn: 0.3452809	total: 13.5s	remaining: 0us
Accuracy Score : 0.9015748031496063
Precision Score : 0.9779411764705882
Recall Score : 0.7940298507462686
F1 Score : 0.8764415156507412


In [33]:
df = pd.DataFrame(eclf2.predict(X),columns=["bert_score"])
df.head()

Unnamed: 0,elmo_score
0,0.555488
1,0.602315
2,0.589332
3,0.623855
4,0.677215


In [34]:
final = df.to_csv('csv/solo_embedding_bert_train.csv')

### Predicciones

In [35]:
test['target'] = eclf2.predict(bert_test)

In [37]:
# test.drop(columns=['clean_text'], inplace=True)
# test.set_index('id', inplace=True)
test.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0.345923
2,0.38682
3,0.667707
9,0.534257
11,0.631251


In [38]:
final = test.to_csv('csv/submission_bert.csv')

In [39]:
df = pd.DataFrame(eclf2.predict(bert_test),columns=["bert_score"])
df.head()
final = df.to_csv('csv/solo_embedding_bert_test.csv')