In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, GradientBoostingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.linear_model import LogisticRegression
from gensim.parsing.preprocessing import STOPWORDS
from bert_serving.client import BertClient
import lightgbm as lgb
from catboost import CatBoostRegressor
import tensorflow_hub as hub
import tensorflow as tf
%matplotlib inline

# Se leen los .csv

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [5]:
test = pd.read_csv('csv/test.csv')

In [6]:
test.head()

Unnamed: 0,id,keyword,location,text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

In [7]:
# !pip install bert-serving-server  # server
# !pip install bert-serving-client  # client, independent of `bert-serving-server`

In [8]:
# !wget https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip && unzip uncased_L-12_H-768_A-12.zip

In [9]:
# !bert-serving-start -model_dir uncased_L-12_H-768_A-12/ -num_worker=2 -max_seq_len 50

In [10]:
#descargar y descomprimir modelo pre-entrenado https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-24_H-1024_A-16.zip pesa 1.2gb
# !bert-serving-start -model_dir uncased_L-24_H-1024_A-16 -num_worker=2 -max_seq_len 50

### Preparación del set de datos

In [11]:
def clean_text(df):
    
    tweets = df['text'].str.split()
    clean_tweets = []

    for tweet in tweets:
        clean_tweet = []
        for word in tweet:
            clean_word = re.sub('[^a-zA-Z]','', word)
            if(clean_word != '')&(clean_word not in STOPWORDS):
                clean_tweet.append(clean_word.lower())
        clean_tweets.append(" ".join(clean_tweet))

    df['clean_text'] = clean_tweets
    #df.set_index('id', inplace=True)
    df.drop(columns=['keyword','location','text'], inplace=True)
    
    return df

In [12]:
train = clean_text(train)
train.head()

Unnamed: 0,id,target,clean_text
0,1,1,our deeds reason earthquake may allah forgive
1,4,1,forest near la ronge sask canada
2,5,1,all residents asked shelter place notified off...
3,6,1,people receive wildfires evacuation orders cal...
4,7,1,just got sent photo ruby alaska smoke wildfire...


In [13]:
test = clean_text(test)
test.head()

Unnamed: 0,id,clean_text
0,0,just happened terrible car crash
1,2,heard earthquake different cities stay safe
2,3,forest spot pond geese fleeing street i save
3,9,apocalypse lighting spokane wildfires
4,11,typhoon soudelor kills china taiwan


### Embedding de los tweets con bert

In [14]:
# bc = BertClient()

In [15]:
# bert_train = bc.encode(train['clean_text'].to_list())
# bert_train.shape

In [16]:
# bert_test = bc.encode(test['clean_text'].to_list())
# bert_test.shape

In [17]:
# train_embedding_file = open("train_embedding_file_bert.txt", "w")

# for i in bert_train:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [18]:
# test_embedding_file = open("test_embedding_file_bert.txt", "w")

# for i in bert_test:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

In [19]:
bert_train = np.loadtxt("train_embedding_file_bert.txt").reshape(len(train.index),768)
bert_train.shape

(7613, 768)

In [20]:
bert_test = np.loadtxt("test_embedding_file_bert.txt").reshape(len(test.index),768)
bert_test.shape

(3263, 768)

In [21]:
type(bert_test[0])

numpy.ndarray

### Se entrena el modelo

In [23]:
X, y = bert_train, train.loc[:,['target']]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [24]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7847769028871391
Precision Score : 0.8104089219330854
Recall Score : 0.6586102719033232
F1 Score : 0.7266666666666667


In [25]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=5, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  


Accuracy Score : 0.7572178477690289
Precision Score : 0.7517241379310344
Recall Score : 0.6586102719033232
F1 Score : 0.7020933977455716


In [26]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7519685039370079
Precision Score : 0.8480392156862745
Recall Score : 0.5226586102719033
F1 Score : 0.6467289719626169


In [27]:
catb = CatBoostRegressor(iterations=5, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4296792	total: 1.74s	remaining: 6.94s
1:	learn: 0.3926107	total: 3.45s	remaining: 5.18s
2:	learn: 0.3688837	total: 5.11s	remaining: 3.41s
3:	learn: 0.3517549	total: 6.76s	remaining: 1.69s
4:	learn: 0.3401956	total: 8.42s	remaining: 0us
Accuracy Score : 0.7965879265091863
Precision Score : 0.8165467625899281
Recall Score : 0.6858006042296072
F1 Score : 0.7454844006568144


In [28]:
gb = GradientBoostingRegressor(n_estimators=5, learning_rate=0.1, 
                                max_features=2, max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7060367454068242
Precision Score : 0.8128654970760234
Recall Score : 0.4199395770392749
F1 Score : 0.5537848605577689


  y = column_or_1d(y, warn=True)


In [29]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

  y = column_or_1d(y, warn=True)


Learning rate set to 0.5
0:	learn: 0.4311727	total: 1.73s	remaining: 6.91s
1:	learn: 0.3964615	total: 3.46s	remaining: 5.19s
2:	learn: 0.3720319	total: 5.19s	remaining: 3.46s
3:	learn: 0.3529097	total: 6.9s	remaining: 1.73s
4:	learn: 0.3368528	total: 8.63s	remaining: 0us
Accuracy Score : 0.9094488188976378
Precision Score : 0.9548611111111112
Recall Score : 0.8308157099697885
F1 Score : 0.888529886914378


In [30]:
df = pd.DataFrame(eclf2.predict(X),columns=["bert_score"])
df.head()

Unnamed: 0,bert_score
0,0.56796
1,0.548946
2,0.589782
3,0.602543
4,0.531318


In [31]:
final = df.to_csv('csv/solo_embedding_bert_train.csv')

### Predicciones

In [32]:
test['target'] = eclf2.predict(bert_test)

In [33]:
test.head()

Unnamed: 0,id,clean_text,target
0,0,just happened terrible car crash,0.482696
1,2,heard earthquake different cities stay safe,0.520214
2,3,forest spot pond geese fleeing street i save,0.455055
3,9,apocalypse lighting spokane wildfires,0.6421
4,11,typhoon soudelor kills china taiwan,0.644486


In [36]:
# test.drop(columns=['keyword','location','text'], inplace=True)
# test.set_index('id', inplace=True)
test.head(10)

Unnamed: 0,id,clean_text,target
0,0,just happened terrible car crash,0.482696
1,2,heard earthquake different cities stay safe,0.520214
2,3,forest spot pond geese fleeing street i save,0.455055
3,9,apocalypse lighting spokane wildfires,0.6421
4,11,typhoon soudelor kills china taiwan,0.644486
5,12,were shakingits earthquake,0.426919
6,21,theyd probably life arsenal yesterday eh eh,0.265076
7,22,hey how,0.304337
8,27,what nice hat,0.221768
9,29,fuck,0.290656


In [37]:
final = test.to_csv('csv/submission_bert.csv')

In [38]:
df = pd.DataFrame(eclf2.predict(bert_test),columns=["bert_score"])
df.head()
final = df.to_csv('csv/solo_embedding_bert_test.csv')