# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, VotingRegressor
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
from gensim.models import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec
from gensim.parsing.preprocessing import STOPWORDS
%matplotlib inline

# Se lee el .csv

In [2]:
train = pd.read_csv('../csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test = pd.read_csv('../csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


# Se cargan los embeddings pre-entrenados

In [5]:
# # descargar glove.twitter.27B.zip de aca https://nlp.stanford.edu/projects/glove/ pesa 1.75gb
# glove_input = '../glove.twitter.27B.200d.txt'
# word2vec_output = '../glove.twitter.27B.200d.txt.word2vec'
# glove2word2vec(glove_input, word2vec_output)

In [6]:
# glove = KeyedVectors.load_word2vec_format(word2vec_output, binary=False)

# Elaboración del modelo

Para este modelo se aplicará NLP para el procesamiento de los tweets y se utilizarán distintos árboles de decisión para realizar las predicciones:

### Preparación del set de datos

In [7]:
long_embedding = 200

#### Generación de embeddings

In [8]:
# #Devuelve una lista de tweets dejando solo caracteres alfanumericos
# def clean_text(df):
#     words = df['text'].str.split()
#     clean_words = []

#     for sentence in words:
#         clean_sentence = []
#         for word in sentence:
#             clean_word = re.sub('[^a-zA-Z]','', word)
#             if(clean_word != '')&(clean_word not in STOPWORDS):
#                 clean_sentence.append(clean_word.lower())
#         clean_words.append(clean_sentence)
    
#     return clean_words

In [9]:
# def tweets_embeddings(df):

#     embeddings = np.zeros((len(df.index),long_embedding), dtype='float32') 
#     normalized_embeddings = np.zeros((len(df.index),long_embedding),dtype='float32')

#     contador = 0
#     for tweet in df['clean_text']:
#         cant_palabras = 0
#         embedded_tweet = np.zeros((long_embedding,), dtype='float32')

#         for word in tweet:
#             if word in glove.vocab:
#                 embedded_tweet = np.add(glove[word],embedded_tweet)
#                 cant_palabras += 1     

#         embeddings[contador] = embedded_tweet

#         if cant_palabras!=0:
#             normalized_embeddings[contador] = np.divide(embedded_tweet,cant_palabras)
#         else:
#             normalized_embeddings[contador] = embedded_tweet

#         contador += 1    
        
#     return embeddings, normalized_embeddings


In [10]:
# # Devuelve un dataframe con todos los features considerados para el modelo
# def generate_embeddings(df):
    
#     df['clean_text'] = clean_text(df)
#     df.drop(columns=['keyword','location','text'], inplace=True)
#     df.set_index('id', inplace=True)
    
#     return tweets_embeddings(df)
    

In [11]:
# embeddings_train, normalized_embeddings_train = generate_embeddings(train)

In [12]:
# train_embedding_file = open("../train_embedding_file_glove.txt", "w")

# for i in embeddings_train:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [13]:
# train_embedding_file = open("../train_embedding_file_glove(norm).txt", "w")

# for i in normalized_embeddings_train:
#     np.savetxt(train_embedding_file, i)

# train_embedding_file.close()

In [14]:
# embeddings_test, normalized_embeddings_test = generate_embeddings(test)

In [15]:
# test_embedding_file = open("../test_embedding_file_glove.txt", "w")

# for i in embeddings_test:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

In [16]:
# test_embedding_file = open("../test_embedding_file_glove(norm).txt", "w")

# for i in normalized_embeddings_test:
#     np.savetxt(test_embedding_file, i)

# test_embedding_file.close()

#### Lectura de embeddings

In [17]:
# embeddings_train = np.loadtxt("../train_embedding_file_glove.txt").reshape(len(train.index),long_embedding)
# embeddings_train.shape

In [18]:
normalized_embeddings_train = np.loadtxt("../train_embedding_file_glove(norm).txt").reshape(len(train.index),long_embedding)
normalized_embeddings_train.shape

(7613, 200)

In [19]:
# embeddings_test = np.loadtxt("../test_embedding_file_glove.txt").reshape(len(test.index),long_embedding)
# embeddings_test.shape

In [20]:
normalized_embeddings_test = np.loadtxt("../test_embedding_file_glove(norm).txt").reshape(len(test.index),long_embedding)
normalized_embeddings_test.shape

(3263, 200)

### Entrenamiento del set de datos considerando sólo 'embeddings'

In [21]:
X, y = normalized_embeddings_train, train['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [68]:
xg_reg = xgb.XGBRegressor(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 10, alpha = 15, n_estimators = 30)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.8241469816272966
Precision Score : 0.8501742160278746
Recall Score : 0.7283582089552239
F1 Score : 0.7845659163987138


In [69]:
rf_model = RandomForestRegressor(random_state=13, n_estimators=30, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.8149606299212598
Precision Score : 0.8464285714285714
Recall Score : 0.7074626865671642
F1 Score : 0.7707317073170731


In [70]:
lgb_class = lgb.LGBMRegressor(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 30)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.8241469816272966
Precision Score : 0.8477508650519031
Recall Score : 0.7313432835820896
F1 Score : 0.7852564102564102


In [71]:
catb = CatBoostRegressor(iterations=30, depth=10)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4278340	total: 489ms	remaining: 14.2s
1:	learn: 0.3873108	total: 1s	remaining: 14s
2:	learn: 0.3582024	total: 1.49s	remaining: 13.4s
3:	learn: 0.3391276	total: 1.98s	remaining: 12.9s
4:	learn: 0.3244853	total: 2.48s	remaining: 12.4s
5:	learn: 0.3146681	total: 2.97s	remaining: 11.9s
6:	learn: 0.3046408	total: 3.46s	remaining: 11.4s
7:	learn: 0.2958682	total: 3.95s	remaining: 10.9s
8:	learn: 0.2881414	total: 4.43s	remaining: 10.3s
9:	learn: 0.2818054	total: 4.93s	remaining: 9.85s
10:	learn: 0.2754219	total: 5.42s	remaining: 9.36s
11:	learn: 0.2672088	total: 5.89s	remaining: 8.84s
12:	learn: 0.2614297	total: 6.38s	remaining: 8.35s
13:	learn: 0.2559497	total: 6.88s	remaining: 7.86s
14:	learn: 0.2502706	total: 7.37s	remaining: 7.37s
15:	learn: 0.2453379	total: 7.86s	remaining: 6.87s
16:	learn: 0.2393525	total: 8.34s	remaining: 6.38s
17:	learn: 0.2356625	total: 8.83s	remaining: 5.89s
18:	learn: 0.2304089	total: 9.33s	remaining: 5.4s
19:	learn: 0.2254712	t

In [72]:
gb = GradientBoostingRegressor(n_estimators=30, learning_rate=0.1, 
                                 max_depth = 10, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.8018372703412073
Precision Score : 0.8309352517985612
Recall Score : 0.6895522388059702
F1 Score : 0.7536704730831975


In [74]:
eclf2 = VotingRegressor(estimators=[
         ('xgb', xg_reg), ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])

eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Learning rate set to 0.5
0:	learn: 0.4303131	total: 484ms	remaining: 14s
1:	learn: 0.3886305	total: 967ms	remaining: 13.5s
2:	learn: 0.3634779	total: 1.48s	remaining: 13.3s
3:	learn: 0.3458650	total: 1.98s	remaining: 12.9s
4:	learn: 0.3319806	total: 2.5s	remaining: 12.5s
5:	learn: 0.3231858	total: 3.06s	remaining: 12.2s
6:	learn: 0.3126361	total: 3.58s	remaining: 11.8s
7:	learn: 0.3035727	total: 4.09s	remaining: 11.3s
8:	learn: 0.2960302	total: 4.6s	remaining: 10.7s
9:	learn: 0.2903933	total: 5.11s	remaining: 10.2s
10:	learn: 0.2829280	total: 5.63s	remaining: 9.72s
11:	learn: 0.2763947	total: 6.17s	remaining: 9.25s
12:	learn: 0.2713997	total: 6.7s	remaining: 8.76s
13:	learn: 0.2671406	total: 7.22s	remaining: 8.25s
14:	learn: 0.2613672	total: 7.7s	remaining: 7.7s
15:	learn: 0.2565396	total: 8.16s	remaining: 7.14s
16:	learn: 0.2500948	total: 8.63s	remaining: 6.6s
17:	learn: 0.2457449	total: 9.12s	remaining: 6.08s
18:	learn: 0.2398815	total: 9.61s	remaining: 5.57s
19:	learn: 0.2357633	tot

In [75]:
df = pd.DataFrame(eclf2.predict(X),columns=["glove_score"])
df.head()

Unnamed: 0,glove_score
0,0.635028
1,0.877394
2,0.870452
3,0.860857
4,0.758754


In [76]:
final = df.to_csv('../csv/solo_embedding_glove_train.csv')

### Predicciones

In [77]:
df = pd.DataFrame(eclf2.predict(normalized_embeddings_test),columns=["glove_score"])
df.head()
final = df.to_csv('../csv/solo_embedding_glove_test.csv')

# Averaging de los 5 árboles

In [78]:
aux = pd.DataFrame()
aux['xgboost'] = xg_reg.predict(normalized_embeddings_train)
aux['random_forest'] = rf_model.predict(normalized_embeddings_train)
aux['catboost'] = catb.predict(normalized_embeddings_train)
aux['gradient_boosting'] = gb.predict(normalized_embeddings_train)
aux['lightgbm'] = lgb_class.predict(normalized_embeddings_train)
div = aux.shape[1]

In [79]:
suma = aux.sum(axis=1)
train_preds = (suma/div)      
train_preds

0       0.695463
1       0.871951
2       0.832953
3       0.723936
4       0.708244
          ...   
7608    0.964543
7609    0.951521
7610    0.899861
7611    0.896397
7612    0.958406
Length: 7613, dtype: float64

In [80]:
print('Accuracy Score : ' + str(accuracy_score(train['target'],train_preds.round())))
print('Precision Score : ' + str(precision_score(train['target'],train_preds.round())))
print('Recall Score : ' + str(recall_score(train['target'],train_preds.round())))
print('F1 Score : ' + str(f1_score(train['target'],train_preds.round())))

Accuracy Score : 0.9527124655195061
Precision Score : 0.9727184150698278
Recall Score : 0.9156221339040049
F1 Score : 0.9433070866141732


In [81]:
aux = pd.DataFrame()
aux['xgboost'] = xg_reg.predict(normalized_embeddings_test)
aux['random_forest'] = rf_model.predict(normalized_embeddings_test)
aux['catboost'] = catb.predict(normalized_embeddings_test)
aux['gradient_boosting'] = gb.predict(normalized_embeddings_test)
aux['lightgbm'] = lgb_class.predict(normalized_embeddings_test)
div = aux.shape[1]

In [82]:
suma = aux.sum(axis=1)
test['target'] = (suma/div)
test_preds = test['target']           
test_preds

0       0.613861
1       0.710942
2       0.469730
3       0.829392
4       0.924767
          ...   
3258    0.620309
3259    0.767147
3260    0.658395
3261    0.789026
3262    0.404147
Name: target, Length: 3263, dtype: float64

In [83]:
aux['target'] = test_preds
aux = aux['target'].to_frame()
aux.to_csv('../csv/avg_glove_test.csv')