# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
%matplotlib inline

# Se lee el .csv

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [4]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
vocab = pd.read_csv('negative-words.txt', header=None)
vocab = vocab[0]
vocab

0         abnormal
1          abolish
2       abominable
3       abominably
4        abominate
           ...    
4776          zaps
4777        zealot
4778       zealous
4779     zealously
4780        zombie
Name: 0, Length: 4781, dtype: object

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(vocabulary= vocab, min_df=2, stop_words= {'english'}, max_df= 0.60)
x = v.fit_transform(train['text'])
tf = pd.DataFrame.sparse.from_spmatrix(x)
tf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4771,4772,4773,4774,4775,4776,4777,4778,4779,4780
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [7]:
cols = v.get_feature_names()

In [8]:
tf

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4771,4772,4773,4774,4775,4776,4777,4778,4779,4780
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
palabras = []
for i in range (0, 4780):
    if tf[i].mean() == 0:
        del tf[i]
    else: 
        palabras.append(i)

In [10]:
col_rename = {}
type(col_rename)
for palabra in palabras: 
    col_rename[palabra] = cols[palabra]

In [11]:
tf.rename(columns=col_rename, inplace=True)
tf

Unnamed: 0,abomination,absence,absurd,absurdly,abuse,abused,abuses,accuses,aching,adamantly,...,worst,worthless,wound,wounds,wreak,wreck,wrinkled,wrong,wrought,4780
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7608,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7609,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7610,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [12]:
test = pd.read_csv('csv/test.csv')

In [13]:
res_w2v = pd.read_csv('csv/solo_embedding.csv')
del res_w2v['Unnamed: 0']
res_w2v.head()

Unnamed: 0,0
0,0.771458
1,0.606724
2,0.879399
3,0.944014
4,0.894664


In [14]:
res_w2v_test = pd.read_csv('csv/solo_embedding_test.csv')
del res_w2v_test['Unnamed: 0']
res_w2v_test.head()

Unnamed: 0,0
0,0.86946
1,0.561893
2,0.709307
3,0.601553
4,0.532795


In [15]:
train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [16]:
#train['res_w2v'] = res_w2v.iloc[:, 0]
train = pd.concat([train, tf], axis=1, sort=False)

In [17]:
#test['res_w2v'] = res_w2v_test.iloc[:, 0]
train.head()

Unnamed: 0,id,keyword,location,text,target,abomination,absence,absurd,absurdly,abuse,...,worst,worthless,wound,wounds,wreak,wreck,wrinkled,wrong,wrought,4780
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,,,Forest fire near La Ronge Sask. Canada,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,,,All residents asked to 'shelter in place' are ...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [18]:
res_elmo = pd.read_csv('csv/elvin_train.csv')
del res_elmo['Unnamed: 0']
res_elmo.head()

Unnamed: 0,elmo_score
0,0.379383
1,0.585851
2,0.628948
3,0.647089
4,0.677196


In [19]:
#train['res_elmo'] = res_w2v.iloc[:, 0]
#train.set_index('id')

In [20]:
train.head()

Unnamed: 0,id,keyword,location,text,target,abomination,absence,absurd,absurdly,abuse,...,worst,worthless,wound,wounds,wreak,wreck,wrinkled,wrong,wrought,4780
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,,,Forest fire near La Ronge Sask. Canada,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,,,All residents asked to 'shelter in place' are ...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
res_elmo_test = pd.read_csv('csv/elvin_train.csv')
del res_elmo_test['Unnamed: 0']
res_elmo_test.head()

Unnamed: 0,elmo_score
0,0.379383
1,0.585851
2,0.628948
3,0.647089
4,0.677196


In [22]:
#test['res_elmo'] = res_elmo_test.iloc[:, 0]
test.tail()

Unnamed: 0,id,keyword,location,text
3258,10861,,,EARTHQUAKE SAFETY LOS ANGELES ÛÒ SAFETY FASTE...
3259,10865,,,Storm in RI worse than last hurricane. My city...
3260,10868,,,Green Line derailment in Chicago http://t.co/U...
3261,10874,,,MEG issues Hazardous Weather Outlook (HWO) htt...
3262,10875,,,#CityofCalgary has activated its Municipal Eme...


# Elaboración del modelo

Para este modelo se utilizarán árboles de decisión para realizar las predicciones y se tendrán en cuenta los siguientes features extraídos del análisis exploratorio, los cuales fueron seleccionados por el grupo como aquellos que podrían resultan más interesantes:

- Palabras con mayor y menor porcentaje de veracidad.
- Pares de 2 palabras con mayor y menor porcentaje de veracidad.
- Tweets que contienen @, ¿?, ¡! tienden a ser falsos.
- Keywords con mayor y menor porcentaje de veracidad.
- Locaciones más y menos veraces.
- Longitud.

### Preparación del set de datos

#### Parte inicial

In [23]:
#Palabras con mayor y menor porcentaje de veracidad
min_repetitions = ((0.2/100)*len(train.index))
words = train['text'].str.split()
clean_words = []

for sentence in words:
    clean_sentence = []
    for word in sentence:
        clean_word = re.sub('[^A-Za-z0-9]+','', word)
        if(clean_word != ''):
            clean_sentence.append(clean_word.lower())
    clean_words.append(clean_sentence)

In [24]:
train_aux = train
train_aux['words'] = clean_words
train_aux = train_aux.explode('words')
train_aux.head()

Unnamed: 0,id,keyword,location,text,target,abomination,absence,absurd,absurdly,abuse,...,worthless,wound,wounds,wreak,wreck,wrinkled,wrong,wrought,4780,words
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,our
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,deeds
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,are
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,the
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,reason


In [25]:
#Palabras con mayor y menor porcentaje de veracidad (TEST, CREO QUE SEPARA LOS TWEETS POR PALABRAS)***********
min_repetitions = ((0.2/100)*len(test.index))
words_test = test['text'].str.split()
clean_words_test = []

for sentence in words_test:
   clean_sentence_test = []
   for word in sentence:
       clean_word_test = re.sub('[^A-Za-z0-9]+','', word)
       if(clean_word_test != ''):
           clean_sentence_test.append(clean_word_test.lower())
   clean_words_test.append(clean_sentence_test)

test['words'] = clean_words_test
test.head()

Unnamed: 0,id,keyword,location,text,words
0,0,,,Just happened a terrible car crash,"[just, happened, a, terrible, car, crash]"
1,2,,,"Heard about #earthquake is different cities, s...","[heard, about, earthquake, is, different, citi..."
2,3,,,"there is a forest fire at spot pond, geese are...","[there, is, a, forest, fire, at, spot, pond, g..."
3,9,,,Apocalypse lighting. #Spokane #wildfires,"[apocalypse, lighting, spokane, wildfires]"
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,"[typhoon, soudelor, kills, 28, in, china, and,..."


In [26]:
train_aux = train_aux.groupby('words').agg({'target':['sum','count']})
train_aux = train_aux[train_aux['target']['count']>min_repetitions]
train_aux['veracity'] = train_aux['target']['sum']/train_aux['target']['count']
train_aux.head()

Unnamed: 0_level_0,target,target,veracity
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
words,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
0,9,12,0.75
5,14,18,0.777778
6,7,10,0.7
1,29,64,0.453125
10,9,23,0.391304


In [27]:
train_aux.reset_index(inplace=True) 

In [28]:
#Pares de 2 palabras con mayor y menor porcentaje de veracidad.
train_aux2 = train
train_aux2['words'] = clean_words
words_pairs = []

for sentence in train_aux2['words']:
    pairs = []
    for i in range(len(sentence)-1):
        pairs.append(sentence[i] + ' ' + sentence[i+1])
    words_pairs.append(pairs)
    
train_aux2['words'] = words_pairs
train_aux2 = train_aux2.explode('words')
train_aux2.head()

Unnamed: 0,id,keyword,location,text,target,abomination,absence,absurd,absurdly,abuse,...,worthless,wound,wounds,wreak,wreck,wrinkled,wrong,wrought,4780,words
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,our deeds
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,deeds are
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,are the
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,the reason
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,reason of


In [29]:
train_aux2 = train_aux2.groupby('words').agg({'target':['sum','count']})
train_aux2 = train_aux2[train_aux2['target']['count']>min_repetitions]
train_aux2['veracity'] = train_aux2['target']['sum']/train_aux2['target']['count']

In [30]:
#las palabras que son '' la cagan
train_aux2.head(10)

Unnamed: 0_level_0,target,target,veracity
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
words,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
05 at,13,13,1.0
11yearold boy,20,20,1.0
12000 nigerian,11,12,0.916667
15 in,13,13,1.0
16yr old,28,28,1.0
2 spos,9,9,1.0
3 words,0,8,0.0
30 fires,7,7,1.0
300w curved,1,7,0.142857
320 ir,0,7,0.0


In [31]:
train_aux2.reset_index(inplace=True)

In [32]:
# Keywords más y menos veraces
train_aux3 = train
train_aux3 = train_aux3.groupby('keyword').agg({'target':['sum','count']})
train_aux3 = train_aux3[train_aux3['target']['count']>min_repetitions]
train_aux3['veracity'] = train_aux3['target']['sum']/train_aux3['target']['count']
train_aux3.head()

Unnamed: 0_level_0,target,target,veracity
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
keyword,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
ablaze,13,36,0.361111
accident,24,35,0.685714
aftershock,0,34,0.0
airplane%20accident,30,35,0.857143
ambulance,20,38,0.526316


In [33]:
train_aux3.reset_index(inplace=True)

In [34]:
# Locaciones más y menos veraces
train_aux4 = train
train_aux4 = train_aux4.groupby('location').agg({'target':['sum','count']})
train_aux4 = train_aux4[train_aux4['target']['count']>min_repetitions]
train_aux4['veracity'] = train_aux4['target']['sum']/train_aux4['target']['count']
train_aux4.head()

Unnamed: 0_level_0,target,target,veracity
Unnamed: 0_level_1,sum,count,Unnamed: 3_level_1
location,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
304,0,9,0.0
"Atlanta, GA",5,10,0.5
"Austin, TX",3,7,0.428571
Australia,9,18,0.5
California,7,17,0.411765


In [35]:
train_aux4.reset_index(inplace=True)

#### Parte final

In [36]:
top_words = train_aux[train_aux['veracity']>0.9]['words']
worst_words = train_aux[train_aux['veracity']<0.1]['words']

In [37]:
top_words_pairs = train_aux2[train_aux2['veracity']>0.9]['words']
worst_words_pairs = train_aux2[train_aux2['veracity']<0.1]['words']

In [38]:
top_keywords = train_aux3[train_aux3['veracity']>0.9]['keyword']
worst_keywords = train_aux3[train_aux3['veracity']<0.1]['keyword']

In [39]:
top_locations = train_aux4[train_aux4['veracity']>0.9]['location']
worst_locations = train_aux4[train_aux4['veracity']<0.1]['location']

In [40]:
train = train.drop(columns='words')
train['long(char)'] = train['text'].str.len()

aux = []
for i in words:
    aux.append(len(i))

train['long(word)'] = aux
train.head()

Unnamed: 0,id,keyword,location,text,target,abomination,absence,absurd,absurdly,abuse,...,wound,wounds,wreak,wreck,wrinkled,wrong,wrought,4780,long(char),long(word)
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,69,13
1,4,,,Forest fire near La Ronge Sask. Canada,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,38,7
2,5,,,All residents asked to 'shelter in place' are ...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,133,22
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,65,8
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,88,16


In [41]:
test = test.drop(columns='words')

In [42]:
test['long(char)'] = test['text'].str.len()

aux = []

for i in words_test:
    aux.append(len(i))

test['long(word)'] = aux
test.head()

Unnamed: 0,id,keyword,location,text,long(char),long(word)
0,0,,,Just happened a terrible car crash,34,6
1,2,,,"Heard about #earthquake is different cities, s...",64,9
2,3,,,"there is a forest fire at spot pond, geese are...",96,19
3,9,,,Apocalypse lighting. #Spokane #wildfires,40,4
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45,8


In [43]:
def count_text_ocurrences(texts, words):
    ocurrences = []
    for text in texts:
        count = 0
        for word in words:
            if word in text:
                count = count+1
        ocurrences.append(count)
    return ocurrences

def count_ocurrences(searched, series):
    ocurrences = []
    for element in searched:
        count = 0
        if element in series:
            count = count+1
        ocurrences.append(count)
    return ocurrences

def contains_char(texts, char1, char2=''):
    ocurrences = []
    for text in texts:
        count = 0
        if char1 in text:
            count = 1
        if (count==0)&(char2!=''):
            if char2 in text:
                count = 1
        ocurrences.append(count)
    return ocurrences

In [44]:
train['top_words'] = count_text_ocurrences(train['text'], top_words)
train['worst_words'] = count_text_ocurrences(train['text'], worst_words)
train['top_words_pairs'] = count_text_ocurrences(train['text'], top_words_pairs)
train['worst_words_pairs'] = count_text_ocurrences(train['text'], worst_words_pairs)
train.head()

Unnamed: 0,id,keyword,location,text,target,abomination,absence,absurd,absurdly,abuse,...,wrinkled,wrong,wrought,4780,long(char),long(word),top_words,worst_words,top_words_pairs,worst_words_pairs
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,69,13,1,1,0,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,38,7,1,1,0,0
2,5,,,All residents asked to 'shelter in place' are ...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,133,22,4,1,0,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,65,8,2,1,1,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,88,16,2,1,0,0


In [45]:
test['top_words'] = count_text_ocurrences(test['text'], top_words)
test['worst_words'] = count_text_ocurrences(test['text'], worst_words)
test['top_words_pairs'] = count_text_ocurrences(test['text'], top_words_pairs)
test['worst_words_pairs'] = count_text_ocurrences(test['text'], worst_words_pairs)
test.head()

Unnamed: 0,id,keyword,location,text,long(char),long(word),top_words,worst_words,top_words_pairs,worst_words_pairs
0,0,,,Just happened a terrible car crash,34,6,3,0,0,0
1,2,,,"Heard about #earthquake is different cities, s...",64,9,1,2,0,0
2,3,,,"there is a forest fire at spot pond, geese are...",96,19,1,1,0,0
3,9,,,Apocalypse lighting. #Spokane #wildfires,40,4,2,1,0,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45,8,1,0,0,0


In [46]:
train['top_keywords'] = count_ocurrences(train['keyword'], top_keywords)
train['worst_keywords'] = count_ocurrences(train['keyword'], worst_keywords)
train['top_locations'] = count_ocurrences(train['location'], top_locations)
train['worst_locations'] = count_ocurrences(train['location'], worst_locations)
train.head()

Unnamed: 0,id,keyword,location,text,target,abomination,absence,absurd,absurdly,abuse,...,long(char),long(word),top_words,worst_words,top_words_pairs,worst_words_pairs,top_keywords,worst_keywords,top_locations,worst_locations
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,69,13,1,1,0,0,0,0,0,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,0.0,0.0,0.0,0.0,0.0,...,38,7,1,1,0,0,0,0,0,0
2,5,,,All residents asked to 'shelter in place' are ...,1,0.0,0.0,0.0,0.0,0.0,...,133,22,4,1,0,0,0,0,0,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0.0,0.0,0.0,0.0,0.0,...,65,8,2,1,1,0,0,0,0,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0.0,0.0,0.0,0.0,0.0,...,88,16,2,1,0,0,0,0,0,0


In [47]:
test['top_keywords'] = count_ocurrences(test['keyword'], top_keywords)
test['worst_keywords'] = count_ocurrences(test['keyword'], worst_keywords)
test['top_locations'] = count_ocurrences(test['location'], top_locations)
test['worst_locations'] = count_ocurrences(test['location'], worst_locations)
test.head()

Unnamed: 0,id,keyword,location,text,long(char),long(word),top_words,worst_words,top_words_pairs,worst_words_pairs,top_keywords,worst_keywords,top_locations,worst_locations
0,0,,,Just happened a terrible car crash,34,6,3,0,0,0,0,0,0,0
1,2,,,"Heard about #earthquake is different cities, s...",64,9,1,2,0,0,0,0,0,0
2,3,,,"there is a forest fire at spot pond, geese are...",96,19,1,1,0,0,0,0,0,0
3,9,,,Apocalypse lighting. #Spokane #wildfires,40,4,2,1,0,0,0,0,0,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45,8,1,0,0,0,0,0,0,0


In [48]:
train['arroba'] = contains_char(train['text'],'@')
train['singnosDeInterrogacion'] = contains_char(train['text'],'\?', '¿')
train['signosDeExclamacion'] = contains_char(train['text'],'!','¡')
train.head()

Unnamed: 0,id,keyword,location,text,target,abomination,absence,absurd,absurdly,abuse,...,worst_words,top_words_pairs,worst_words_pairs,top_keywords,worst_keywords,top_locations,worst_locations,arroba,singnosDeInterrogacion,signosDeExclamacion
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
1,4,,,Forest fire near La Ronge Sask. Canada,1,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
2,5,,,All residents asked to 'shelter in place' are ...,1,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [49]:
test['arroba'] = contains_char(test['text'],'@')
test['singnosDeInterrogacion'] = contains_char(test['text'],'\?', '¿')
test['signosDeExclamacion'] = contains_char(test['text'],'!','¡')
test.head()

Unnamed: 0,id,keyword,location,text,long(char),long(word),top_words,worst_words,top_words_pairs,worst_words_pairs,top_keywords,worst_keywords,top_locations,worst_locations,arroba,singnosDeInterrogacion,signosDeExclamacion
0,0,,,Just happened a terrible car crash,34,6,3,0,0,0,0,0,0,0,0,0,0
1,2,,,"Heard about #earthquake is different cities, s...",64,9,1,2,0,0,0,0,0,0,0,0,0
2,3,,,"there is a forest fire at spot pond, geese are...",96,19,1,1,0,0,0,0,0,0,0,0,0
3,9,,,Apocalypse lighting. #Spokane #wildfires,40,4,2,1,0,0,0,0,0,0,0,0,0
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan,45,8,1,0,0,0,0,0,0,0,0,0,0


In [50]:
train = train.drop(columns=['keyword','location','text'])
train = train.set_index('id')
train.head(20)

Unnamed: 0_level_0,target,abomination,absence,absurd,absurdly,abuse,abused,abuses,accuses,aching,...,worst_words,top_words_pairs,worst_words_pairs,top_keywords,worst_keywords,top_locations,worst_locations,arroba,singnosDeInterrogacion,signosDeExclamacion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
6,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
7,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
8,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0,0,0,0,0,0,0,0,0
10,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
13,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
14,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
15,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
test = test.drop(columns=['keyword','location','text'])
test = test.set_index('id')
test.head(20)

Unnamed: 0_level_0,long(char),long(word),top_words,worst_words,top_words_pairs,worst_words_pairs,top_keywords,worst_keywords,top_locations,worst_locations,arroba,singnosDeInterrogacion,signosDeExclamacion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,34,6,3,0,0,0,0,0,0,0,0,0,0
2,64,9,1,2,0,0,0,0,0,0,0,0,0
3,96,19,1,1,0,0,0,0,0,0,0,0,0
9,40,4,2,1,0,0,0,0,0,0,0,0,0
11,45,8,1,0,0,0,0,0,0,0,0,0,0
12,34,4,2,2,0,0,0,0,0,0,0,0,0
21,72,12,1,1,0,0,0,0,0,0,0,0,0
22,17,4,1,0,0,0,0,0,0,0,0,0,1
27,16,4,0,1,0,0,0,0,0,0,0,0,0
29,9,2,0,0,0,0,0,0,0,0,0,0,1


In [52]:
col_rename = {'long(char)': 'longEnChars', 'long(word)': 'longEnWords', 'top_words': 'topWords', 'worst_words': 'worstWords', 'top_words_pairs': 'topWordsPairs',  
             'worst_words_pairs': 'worstWordsPairs', 'top_keywords': 'topKeywords', 'worst_keywords': 'worstKeywords', 'top_locations': 'topLocations', 
             'worst_locations': 'worstLocations'}
train = train.rename(columns= col_rename)
train.head()

Unnamed: 0_level_0,target,abomination,absence,absurd,absurdly,abuse,abused,abuses,accuses,aching,...,worstWords,topWordsPairs,worstWordsPairs,topKeywords,worstKeywords,topLocations,worstLocations,arroba,singnosDeInterrogacion,signosDeExclamacion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
4,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
5,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
6,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
7,1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0


In [53]:
col_rename = {'long(char)': 'longEnChars', 'long(word)': 'longEnWords', 'top_words': 'topWords', 'worst_words': 'worstWords', 'top_words_pairs': 'topWordsPairs',  
             'worst_words_pairs': 'worstWordsPairs', 'top_keywords': 'topKeywords', 'worst_keywords': 'worstKeywords', 'top_locations': 'topLocations', 
             'worst_locations': 'worstLocations'}
test = test.rename(columns= col_rename)
test.head()

Unnamed: 0_level_0,longEnChars,longEnWords,topWords,worstWords,topWordsPairs,worstWordsPairs,topKeywords,worstKeywords,topLocations,worstLocations,arroba,singnosDeInterrogacion,signosDeExclamacion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
0,34,6,3,0,0,0,0,0,0,0,0,0,0
2,64,9,1,2,0,0,0,0,0,0,0,0,0
3,96,19,1,1,0,0,0,0,0,0,0,0,0
9,40,4,2,1,0,0,0,0,0,0,0,0,0
11,45,8,1,0,0,0,0,0,0,0,0,0,0


### Entrenamiento del set de datos

In [74]:
X, y = train.iloc[:,train.columns != 'target'], train.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train

Unnamed: 0_level_0,abomination,absence,absurd,absurdly,abuse,abused,abuses,accuses,aching,adamantly,...,worstWords,topWordsPairs,worstWordsPairs,topKeywords,worstKeywords,topLocations,worstLocations,arroba,singnosDeInterrogacion,signosDeExclamacion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
9064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0,0,0,0,0,0,1,0,0
1587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
9325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,0,0,0,0,1,0,0
7691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0,2,0,0,0,0,0,0,0
1242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
10862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [55]:
from sklearn.feature_selection import SelectPercentile, chi2
selec = SelectPercentile(chi2, percentile=10).fit(X_train, y_train)
X_new = selec.transform(X_train)
seleccionadas = pd.DataFrame(data=X_new)
seleccionadas



Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,85,86,87,88,89,90,91,92,93,94
0,0,0,0,0,0,0,0,0,0,0,...,0,0,139,19,3,0,0,0,1,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,140,14,1,2,0,0,1,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,82,10,1,1,1,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,126,16,0,0,0,0,0,1
4,0,0,0,0,0,0,0,0,0,0,...,0,0,99,17,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6466,0,0,0,0,0,0,0,0,0,0,...,0,0,96,16,1,1,0,1,1,0
6467,0,0,0.524628,0,0,0,0,0,0,0,...,0,0,132,28,1,2,0,2,0,0
6468,0,0,0,0,0,0,0,0,0,0,...,0,0,121,13,0,1,0,0,0,0
6469,0,0,0,0,0,0,0,0,0,0,...,0,0,136,20,2,0,0,0,0,0


In [56]:
cols = selec.get_support(indices=True)
cols
features = X.columns[cols]
features

Index(['alarm', 'apocalypse', 'attack', 'avalanche', 'bad', 'bitch',
       'bleeding', 'bloody', 'bomb', 'burned', 'catastrophic', 'collapse',
       'costlier', 'crash', 'crime', 'crisis', 'crush', 'crushed', 'damage',
       'damaged', 'dead', 'delays', 'deluge', 'demolish', 'desolate',
       'desolation', 'devastation', 'died', 'disaster', 'displaced', 'drought',
       'dust', 'explode', 'failure', 'fatal', 'fear', 'fears', 'freak', 'fuck',
       'fucking', 'harm', 'haunting', 'injury', 'inundated', 'issues',
       'killed', 'killing', 'kills', 'lack', 'limited', 'massacre', 'meltdown',
       'miss', 'murder', 'murderer', 'obliterate', 'obliterated', 'offensive',
       'outbreak', 'outrage', 'pandemonium', 'panic', 'panicking', 'provoke',
       'rail', 'rocky', 'ruin', 'severe', 'smoke', 'sorry', 'struggles', 'sue',
       'suicide', 'sunk', 'suspect', 'terror', 'terrorism', 'threatening',
       'wrong', 'wrought', 'longEnChars', 'longEnWords', 'topWords',
       'worstWord

In [57]:
X_train[X_train.index.duplicated()]

Unnamed: 0_level_0,abomination,absence,absurd,absurdly,abuse,abused,abuses,accuses,aching,adamantly,...,worstWords,topWordsPairs,worstWordsPairs,topKeywords,worstKeywords,topLocations,worstLocations,arroba,singnosDeInterrogacion,signosDeExclamacion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1


In [58]:
X_train = X_train.filter(items = features)
X_test = X_test.filter(items = features)

#### Red

In [59]:
import pandas as pd
import re
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras import layers
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [75]:
X_train

Unnamed: 0_level_0,abomination,absence,absurd,absurdly,abuse,abused,abuses,accuses,aching,adamantly,...,worstWords,topWordsPairs,worstWordsPairs,topKeywords,worstKeywords,topLocations,worstLocations,arroba,singnosDeInterrogacion,signosDeExclamacion
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
9064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0,0,0,0,0,0,1,0,0
1587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,1,0,0,0,0,0,0,0,0
797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,1
9325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,1,0,0,0,0,1,0,0
7691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2,0,2,0,0,0,0,0,0,0
1242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1,0,0,0,0,0,0,0,0,0
10862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0


In [97]:
model = Sequential()
model.add(layers.Dense(1024, input_dim=946, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(512, input_dim=946, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [98]:
optimizer=Adam(learning_rate=5e-5)
model.compile(loss='binary_crossentropy', 
              optimizer=optimizer, 
              metrics=['accuracy'])
model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_27 (Dense)             (None, 1024)              969728    
_________________________________________________________________
dropout_20 (Dropout)         (None, 1024)              0         
_________________________________________________________________
dense_28 (Dense)             (None, 512)               524800    
_________________________________________________________________
dropout_21 (Dropout)         (None, 512)               0         
_________________________________________________________________
dense_29 (Dense)             (None, 1)                 513       
Total params: 1,495,041
Trainable params: 1,495,041
Non-trainable params: 0
_________________________________________________________________


In [99]:
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)

In [101]:
history = model.fit(X_train, y_train,
                    epochs=500,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [None]:
dssdss
test.head()

In [None]:
#predigo con el modelo entrenago con cross validation
a = test.iloc[:,:]
a = xgb.DMatrix(a)
preds = model.predict(a)

In [None]:
final = test['arroba'].to_frame()
final['target'] = preds.round().astype(int)
final.drop(columns=['arroba'], inplace=True)
final.head()

In [None]:
final.to_csv('csv/submission_modelo_7.csv')

In [None]:
anterior = pd.read_csv('csv/submission_modelo5_voting(1).csv')
nueva = pd.read_csv('csv/submission_modelo_7.csv')

In [None]:
print(len(nueva.index))
diferencias = np.absolute(anterior['target']-nueva['target'])
a = sum(diferencias)
a