In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV,cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import lightgbm as lgb
from catboost import CatBoostClassifier
%matplotlib inline

# Se leen los archivos

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
test = pd.read_csv('csv/test.csv')

In [5]:
elmo_train = np.loadtxt("train_embedding_file.txt").reshape(len(train.index),1024)
elmo_test = np.loadtxt("test_embedding_file.txt").reshape(len(test.index),1024)

In [6]:
w2v_train = np.loadtxt("train_embedding_file_w2v(norm).txt").reshape(len(train.index),300)
w2v_test = np.loadtxt("test_embedding_file_w2v(norm).txt").reshape(len(test.index),300)

# Preparación de los features

En este caso se aplicará el algoritmo PCA de reducción de dimensiones a los embeddings generados con word2vec y ELMo para luego evaluar los resultados obtenidos con distintos árboles de decisión. 

In [7]:
def pca_embedding(embedding):
    pca = PCA(n_components=40)
    aux = StandardScaler().fit_transform(embedding)
    return pca.fit_transform(aux)

In [8]:
training_set = pd.DataFrame(np.concatenate([pca_embedding(elmo_train),pca_embedding(w2v_train)], axis=1))
training_set['target'] = train['target']
training_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,71,72,73,74,75,76,77,78,79,target
0,-13.736309,2.007282,0.144576,3.720196,-4.546878,-0.522302,-0.000929,-2.227781,1.283995,-0.824285,...,1.567848,-1.104088,0.520965,2.00156,2.933543,-0.002646,-0.069838,-0.792198,-0.29075,1
1,-11.810253,-1.97245,8.758817,0.464498,-1.751047,2.073969,0.467077,-0.397649,-2.411398,-0.810757,...,0.737749,-0.671624,-2.100406,-2.281841,-1.101396,-0.105134,1.687549,0.614608,2.088953,1
2,-9.389612,-6.768568,-9.392008,-1.860543,-4.532383,-2.499498,7.636814,4.596518,-8.892878,-2.213954,...,0.594961,-0.084156,0.123039,-1.773619,-0.608197,0.435749,-0.32475,-0.118877,0.118378,1
3,-15.600199,-5.578937,0.688331,1.796353,-3.036748,1.024565,0.980655,0.189601,-3.846012,2.078942,...,1.633815,-0.452019,0.923838,-1.338613,-1.718735,-1.569163,-0.316705,-1.49181,0.163558,1
4,-8.027637,-2.679847,-2.919748,1.455481,1.920094,4.269371,-1.396675,-0.055702,-1.214513,4.984164,...,-0.101811,-0.690781,0.102939,-0.245875,-0.170754,-1.082677,0.02499,-0.494026,-0.292226,1


In [9]:
test_set = pd.DataFrame(np.concatenate([pca_embedding(elmo_test),pca_embedding(w2v_test)], axis=1))
#test_set.set_index(test['id'],inplace=True)
test_set.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,70,71,72,73,74,75,76,77,78,79
0,-15.456838,1.844249,-0.764046,1.395083,3.186081,2.001357,0.873796,2.265477,-3.385583,2.247599,...,1.132952,-2.051666,0.216574,1.763921,1.427776,0.371148,1.077794,-0.317192,0.416856,-0.523773
1,-15.17631,-0.772809,-4.685967,0.477827,-1.863495,3.745633,1.308749,-0.3853,0.347107,-0.840948,...,1.350942,0.364163,-0.19394,-0.511294,0.353892,0.600745,0.707569,-1.08014,0.32096,0.384011
2,-4.354521,-0.600519,-2.571853,0.981579,1.851692,8.54953,-2.254208,1.173279,6.531805,0.037489,...,0.615997,0.095573,2.618621,0.334979,1.059441,0.137227,-0.60337,-0.706712,0.710091,-1.025594
3,-14.889525,-3.705396,1.735795,1.814816,0.896484,3.098697,-0.342755,-3.575621,-1.355512,0.730867,...,3.557524,0.031584,0.903291,3.368354,0.047525,0.685191,-2.889472,-3.571731,2.431178,-0.399387
4,-9.656657,-5.667418,4.43318,3.45859,0.963097,4.513907,-4.159752,-2.056735,1.123739,-0.303383,...,-4.632974,-1.018867,-0.863002,1.741258,2.803662,-0.677797,-1.259002,2.842066,1.170445,2.801125


# Se entrena el modelo

In [10]:
X, y = training_set.iloc[:,0:-1], training_set.iloc[:,-1].to_frame()
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

In [11]:
xg_reg = xgb.XGBClassifier(objective ='binary:logistic', 
                colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 20, alpha = 10, n_estimators = 2)

xg_reg.fit(X_train,y_train)
preds = xg_reg.predict(X_test)
print('Accuracy Score : ' + str(accuracy_score(y_test,preds)))
print('Precision Score : ' + str(precision_score(y_test,preds)))
print('Recall Score : ' + str(recall_score(y_test,preds)))
print('F1 Score : ' + str(f1_score(y_test,preds)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7703412073490814
Precision Score : 0.8368200836820083
Recall Score : 0.5952380952380952
F1 Score : 0.6956521739130433


In [12]:
rf_model = RandomForestClassifier(random_state=13, n_estimators=5, max_depth=10)
rf_model.fit(X_train, y_train)
preds = rf_model.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds)))
print('Precision Score : ' + str(precision_score(y_test,preds)))
print('Recall Score : ' + str(recall_score(y_test,preds)))
print('F1 Score : ' + str(f1_score(y_test,preds)))

  


Accuracy Score : 0.7729658792650919
Precision Score : 0.7859649122807018
Recall Score : 0.6666666666666666
F1 Score : 0.7214170692431561


In [13]:
lgb_class = lgb.LGBMClassifier(learning_rate = 0.1,
                max_depth = 10, alpha = 10, n_estimators = 5)
lgb_class.fit(X_train, y_train)
preds = lgb_class.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds)))
print('Precision Score : ' + str(precision_score(y_test,preds)))
print('Recall Score : ' + str(recall_score(y_test,preds)))
print('F1 Score : ' + str(f1_score(y_test,preds)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Accuracy Score : 0.7716535433070866
Precision Score : 0.8491379310344828
Recall Score : 0.5863095238095238
F1 Score : 0.693661971830986


In [14]:
catb = CatBoostClassifier(iterations=5)
catb.fit(X_train, y_train)
preds = catb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds)))
print('Precision Score : ' + str(precision_score(y_test,preds)))
print('Recall Score : ' + str(recall_score(y_test,preds)))
print('F1 Score : ' + str(f1_score(y_test,preds)))

Learning rate set to 0.5
0:	learn: 0.5512509	total: 85.6ms	remaining: 342ms
1:	learn: 0.5092646	total: 123ms	remaining: 185ms
2:	learn: 0.4807054	total: 159ms	remaining: 106ms
3:	learn: 0.4649020	total: 204ms	remaining: 51.1ms
4:	learn: 0.4512209	total: 240ms	remaining: 0us
Accuracy Score : 0.7979002624671916
Precision Score : 0.837037037037037
Recall Score : 0.6726190476190477
F1 Score : 0.7458745874587459


In [15]:
gb = GradientBoostingClassifier(n_estimators=5, learning_rate=0.1, 
                                max_features=2, max_depth = 2, random_state = 0)
gb.fit(X_train, y_train)
preds = gb.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds.round())))
print('Precision Score : ' + str(precision_score(y_test,preds.round())))
print('Recall Score : ' + str(recall_score(y_test,preds.round())))
print('F1 Score : ' + str(f1_score(y_test,preds.round())))

Accuracy Score : 0.5616797900262467
Precision Score : 0.6
Recall Score : 0.017857142857142856
F1 Score : 0.03468208092485549


  y = column_or_1d(y, warn=True)


In [21]:
eclf2 = VotingClassifier(estimators=[
         ('rf', rf_model), ('catb', catb), ('gb', gb), ('lgbm',lgb_class)])
eclf2 = eclf2.fit(X, y)
preds = eclf2.predict(X_test)

print('Accuracy Score : ' + str(accuracy_score(y_test,preds)))
print('Precision Score : ' + str(precision_score(y_test,preds)))
print('Recall Score : ' + str(recall_score(y_test,preds)))
print('F1 Score : ' + str(f1_score(y_test,preds)))

  y = column_or_1d(y, warn=True)
  y = column_or_1d(y, warn=True)


Learning rate set to 0.5
0:	learn: 0.5454116	total: 35.7ms	remaining: 143ms
1:	learn: 0.5030436	total: 81.2ms	remaining: 122ms
2:	learn: 0.4740884	total: 122ms	remaining: 81.3ms
3:	learn: 0.4628224	total: 174ms	remaining: 43.6ms
4:	learn: 0.4477594	total: 233ms	remaining: 0us
Accuracy Score : 0.8044619422572179
Precision Score : 0.9698492462311558
Recall Score : 0.5744047619047619
F1 Score : 0.7214953271028036


# Predicciones

In [23]:
test['target'] = eclf2.predict(test_set.iloc[:,:-1]).astype(int)

In [24]:
test.set_index('id', inplace=True)
test.drop(columns=['keyword','location','text'], inplace=True)
test.head()

Unnamed: 0_level_0,target
id,Unnamed: 1_level_1
0,0
2,1
3,0
9,1
11,0


In [25]:
final = test.to_csv('csv/submission_modelo6(pca40).csv')