# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

# Se lee el .csv

In [2]:
train = pd.read_csv('../csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test = pd.read_csv('../csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


# Se cargan los embeddings pre-entrenados

In [5]:
# #descargar de aca https://s3.amazonaws.com/dl4j-distribution/GoogleNews-vectors-negative300.bin.gz, pesa 1.5gb
# EMBEDDING_FILE = '../GoogleNews-vectors-negative300.bin.gz'
# word2vec = KeyedVectors.load_word2vec_format(EMBEDDING_FILE, binary=True)

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [6]:
long_embedding = 300

#### Generación de embeddings

In [7]:
#Devuelve una lista de tweets dejando solo caracteres alfanumericos
def clean_text(df):
    words = df['text'].str.split()
    clean_words = []

    for sentence in words:
        clean_sentence = []
        for word in sentence:
            clean_word = re.sub('[^a-zA-Z]','', word)
            if(clean_word != '')&(clean_word not in STOPWORDS):
                clean_sentence.append(clean_word.lower())
        clean_words.append(clean_sentence)
    
    return clean_words3

In [8]:
# def tweets_embeddings(df):

#     embeddings = np.zeros((len(df.index),long_embedding), dtype='float32') 
#     normalized_embeddings = np.zeros((len(df.index),long_embedding),dtype='float32')

#     contador = 0
#     for tweet in df['clean_text']:
#         cant_palabras = 0
#         embedded_tweet = np.zeros((long_embedding,), dtype='float32')

#         for word in tweet:
#             if word in word2vec.vocab:
#                 embedded_tweet = np.add(word2vec[word],embedded_tweet)
#                 cant_palabras += 1     

#         embeddings[contador] = embedded_tweet

#         if cant_palabras!=0:
#             normalized_embeddings[contador] = np.divide(embedded_tweet,cant_palabras)
#         else:
#             normalized_embeddings[contador] = embedded_tweet

#         contador += 1    
        
#     return embeddings, normalized_embeddings


In [9]:
# # Devuelve un dataframe con todos los features considerados para el modelo
# def generate_embeddings(df):
    
#     df['clean_text'] = clean_text(df)
#     df.drop(columns=['keyword','location','text'], inplace=True)
#     df.set_index('id', inplace=True)
    
#     return tweets_embeddings(df)
    

In [10]:
# embeddings_train, normalized_embeddings_train = generate_embeddings(train)

In [11]:
# train_embedding_file = open("../train_embedding_file_w2v.txt", "w")

# for i in embeddings_train:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [12]:
# train_embedding_file = open("../train_embedding_file_w2v(norm).txt", "w")

# for i in normalized_embeddings_train:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [13]:
# embeddings_test, normalized_embeddings_test = generate_embeddings(test)

In [14]:
# test_embedding_file = open("../test_embedding_file_w2v.txt", "w")

# for i in embeddings_test:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

In [15]:
# test_embedding_file = open("../test_embedding_file_w2v(norm).txt", "w")

# for i in normalized_embeddings_test:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

#### Lectura de embeddings

In [16]:
# embeddings_train = np.loadtxt("../train_embedding_file_w2v.txt").reshape(len(train.index),300)
# embeddings_train.shape

In [17]:
normalized_embeddings_train = np.loadtxt("../train_embedding_file_w2v(norm).txt").reshape(len(train.index),300)
normalized_embeddings_train.shape

(7613, 300)

In [18]:
# embeddings_test = np.loadtxt("../test_embedding_file_w2v.txt").reshape(len(test.index),300)
# embeddings_test.shape

In [19]:
normalized_embeddings_test = np.loadtxt("../test_embedding_file_w2v(norm).txt").reshape(len(test.index),300)
normalized_embeddings_test.shape

(3263, 300)

### Entrenamiento del set de datos considerando sólo 'embeddings'

In [56]:
X, y = normalized_embeddings_train, train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=13)

In [57]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 15, n_estimators = 30)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7863397548161121
Precision Score : 0.8007159904534606
Recall Score : 0.6764112903225806
F1 Score : 0.7333333333333333


In [58]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=30, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7802101576182137
Precision Score : 0.7951807228915663
Recall Score : 0.6653225806451613
F1 Score : 0.7244785949506038


In [59]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 30)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7841506129597198
Precision Score : 0.8031591737545565
Recall Score : 0.6663306451612904
F1 Score : 0.7283746556473829


In [60]:
catb = CatBoostRegressor(iterations=30, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4317063	total: 679ms	remaining: 19.7s
1:	learn: 0.3910841	total: 1.38s	remaining: 19.3s
2:	learn: 0.3602000	total: 2.08s	remaining: 18.7s
3:	learn: 0.3377070	total: 2.77s	remaining: 18s
4:	learn: 0.3200682	total: 3.45s	remaining: 17.3s
5:	learn: 0.3046475	total: 4.18s	remaining: 16.7s
6:	learn: 0.2915204	total: 4.93s	remaining: 16.2s
7:	learn: 0.2847071	total: 5.61s	remaining: 15.4s
8:	learn: 0.2772680	total: 6.32s	remaining: 14.8s
9:	learn: 0.2670316	total: 7.03s	remaining: 14.1s
10:	learn: 0.2573363	total: 7.79s	remaining: 13.5s
11:	learn: 0.2497790	total: 8.58s	remaining: 12.9s
12:	learn: 0.2427693	total: 9.33s	remaining: 12.2s
13:	learn: 0.2360570	total: 10.1s	remaining: 11.5s
14:	learn: 0.2315891	total: 10.8s	remaining: 10.8s
15:	learn: 0.2278511	total: 11.6s	remaining: 10.1s
16:	learn: 0.2218301	total: 12.4s	remaining: 9.46s
17:	learn: 0.2150525	total: 13.1s	remaining: 8.76s
18:	learn: 0.2074904	total: 13.9s	remaining: 8.05s
19:	learn: 0.20168

In [61]:
gb = GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, 
                                max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.7530647985989493
Precision Score : 0.7426303854875284
Recall Score : 0.6602822580645161
F1 Score : 0.6990394877267876


In [62]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4339145	total: 934ms	remaining: 27.1s
1:	learn: 0.3949470	total: 1.79s	remaining: 25s
2:	learn: 0.3702802	total: 2.67s	remaining: 24s
3:	learn: 0.3491851	total: 3.57s	remaining: 23.2s
4:	learn: 0.3334044	total: 4.37s	remaining: 21.9s
5:	learn: 0.3215564	total: 5.2s	remaining: 20.8s
6:	learn: 0.3112869	total: 5.99s	remaining: 19.7s
7:	learn: 0.3024755	total: 6.85s	remaining: 18.8s
8:	learn: 0.2970200	total: 7.67s	remaining: 17.9s
9:	learn: 0.2897154	total: 8.5s	remaining: 17s
10:	learn: 0.2823977	total: 9.41s	remaining: 16.3s
11:	learn: 0.2775503	total: 10.3s	remaining: 15.4s
12:	learn: 0.2740458	total: 11s	remaining: 14.4s
13:	learn: 0.2697189	total: 11.9s	remaining: 13.6s
14:	learn: 0.2629776	total: 12.7s	remaining: 12.7s
15:	learn: 0.2574541	total: 13.5s	remaining: 11.8s
16:	learn: 0.2496999	total: 14.4s	remaining: 11s
17:	learn: 0.2446372	total: 15.2s	remaining: 10.1s
18:	learn: 0.2395233	total: 16s	remaining: 9.27s
19:	learn: 0.2330959	total: 16

In [63]:
df = pd.DataFrame(eclf2.predict(X),columns=["word2vec_score"])
df.head()

Unnamed: 0,word2vec_score
0,0.667608
1,0.855877
2,0.892648
3,0.946222
4,0.829571


In [64]:
final = df.to_csv('../csv/solo_embedding_word2vec_train.csv')

### Predicciones

In [65]:
normalized_embeddings_test.shape

(3263, 300)

In [66]:
df = pd.DataFrame(eclf2.predict(normalized_embeddings_test),columns=["word2vec_score"])
df.head()
final = df.to_csv('../csv/solo_embedding_word2vec_test.csv')

# Averaging de los 5 árboles

In [67]:
aux = pd.DataFrame()
aux['xgboost'] = xg_reg.predict(normalized_embeddings_train)
aux['random_forest'] = rf_model.predict(normalized_embeddings_train)
aux['catboost'] = catb.predict(normalized_embeddings_train)
aux['gradient_boosting'] = gb.predict(normalized_embeddings_train)
aux['lightgbm'] = lgb_class.predict(normalized_embeddings_train)
div = aux.shape[1]

In [68]:
suma = aux.sum(axis=1)
train_preds = (suma/div)      
train_preds

0       0.682808
1       0.835731
2       0.929766
3       0.977826
4       0.830327
          ...   
7608    0.933017
7609    0.929151
7610    0.812793
7611    0.922264
7612    0.973906
Length: 7613, dtype: float64

In [69]:
print('Accuracy Score : ' + str(accuracy_score(train['target'],train_preds.round())))
print('Precision Score : ' + str(precision_score(train['target'],train_preds.round())))
print('Recall Score : ' + str(recall_score(train['target'],train_preds.round())))
print('F1 Score : ' + str(f1_score(train['target'],train_preds.round())))

Accuracy Score : 0.9197425456456062
Precision Score : 0.9427430093209055
Recall Score : 0.8657902782023846
F1 Score : 0.9026294820717131


In [70]:
aux = pd.DataFrame()
aux['xgboost'] = xg_reg.predict(normalized_embeddings_test)
aux['random_forest'] = rf_model.predict(normalized_embeddings_test)
aux['catboost'] = catb.predict(normalized_embeddings_test)
aux['gradient_boosting'] = gb.predict(normalized_embeddings_test)
aux['lightgbm'] = lgb_class.predict(normalized_embeddings_test)
div = aux.shape[1]

In [71]:
suma = aux.sum(axis=1)
test['target'] = (suma/div)
test_preds = test['target']           
test_preds

0       0.672410
1       0.674629
2       0.651223
3       0.822961
4       0.872475
          ...   
3258    0.725044
3259    0.688839
3260    0.742857
3261    0.799997
3262    0.788248
Name: target, Length: 3263, dtype: float64

In [72]:
aux['target'] = test_preds
aux = aux['target'].to_frame()
aux.to_csv('../csv/avg_w2v_test.csv')