# Se importan las bibliotecas necesarias

In [1]:
import pandas as pd
import re
import numpy as np
import xgboost as xgb
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.ensemble import RandomForestRegressor, VotingRegressor, GradientBoostingClassifier
from sklearn.metrics import accuracy_score,recall_score,precision_score,f1_score
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from catboost import CatBoostRegressor
%matplotlib inline

# Se lee el .csv

In [2]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [3]:
ids_with_target_error = [328,443,513,2619,3640,3900,4342,5781,6552,6554,6570,6701,6702,6729,6861,7226]
train.loc[train['id'].isin(ids_with_target_error),'target'] = 0

In [4]:
col_rename = {'id': 'tp_datos_id', 'keyword': 'tp_datos_keyword', 'location': 'tp_datos_location', 'text': 'tp_datos_text'}
train = train.rename(columns= col_rename)
train.head()

Unnamed: 0,tp_datos_id,tp_datos_keyword,tp_datos_location,tp_datos_text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [5]:
train.head()

Unnamed: 0,tp_datos_id,tp_datos_keyword,tp_datos_location,tp_datos_text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [6]:
def clean_text(df):
    words = df['tp_datos_text'].str.split()
    clean_words = []
    clean_texts = []

    for sentence in words:
        clean_sentence = []
        for word in sentence:
            clean_word = re.sub('[^A-Za-z0-9]+','', word)
            clean_word = re.sub('[0-9]+', '', clean_word)
            if(clean_word != ''):
                clean_sentence.append(clean_word.lower())
        clean_words.append(clean_sentence)
    for words in clean_words:
        clean_text = ''
        for word in words:
            #word_without_num = ''.join([i for i in word if not i.isdigit()])
            #if(word_without_num != ''):
                clean_text += ' ' + word
             #   clean_text += ' ' + word_without_num
        clean_texts.append(clean_text)
    return clean_texts
train['clean_text']= clean_text(train)
train.head()

Unnamed: 0,tp_datos_id,tp_datos_keyword,tp_datos_location,tp_datos_text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are b...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as ...


In [7]:
vocab = pd.read_csv('negative-words.txt', header=None)
vocab = vocab[0]
vocab

0         abnormal
1          abolish
2       abominable
3       abominably
4        abominate
           ...    
4776          zaps
4777        zealot
4778       zealous
4779     zealously
4780        zombie
Name: 0, Length: 4781, dtype: object

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer(min_df=2, stop_words= {'english'}, max_df= 0.60, ngram_range= (1,3))
x = v.fit_transform(train['clean_text'])
tf = pd.DataFrame.sparse.from_spmatrix(x)
tf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22654,22655,22656,22657,22658,22659,22660,22661,22662,22663
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
rango = tf.shape[1] 

In [10]:
cols = v.get_feature_names()

In [11]:
palabras = []
for i in range (0, rango):
    if tf[i].mean() == 0:
        del tf[i]
    else: 
        palabras.append(i)
len(palabras)

22664

In [12]:
col_rename_tf = {}
type(col_rename)
for palabra in palabras: 
    col_rename[palabra] = str(cols[palabra])

In [13]:
train.head()

Unnamed: 0,tp_datos_id,tp_datos_keyword,tp_datos_location,tp_datos_text,target,clean_text
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake m...
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are b...
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as ...


In [14]:
tf.rename(columns=col_rename, inplace=True)
columnas_tf = tf.columns
tf.head()

Unnamed: 0,aa,aba,aba as,aba as woman,abandon,abandoned,abandoned aircraft,abandoned aircraft at,abbott,abbswinston,...,zone,zone coming,zone coming soon,zone of,zouma,zouma has,zouma has just,zss,zss vs,zss vs sws
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
train = pd.concat([train, tf], axis=1, sort=False)

In [16]:
test = pd.read_csv('csv/test.csv')

In [17]:
col_rename = {'id': 'tp_datos_id', 'keyword': 'tp_datos_keyword', 'location': 'tp_datos_location', 'text': 'tp_datos_text'}
test = test.rename(columns= col_rename)
test.head()

Unnamed: 0,tp_datos_id,tp_datos_keyword,tp_datos_location,tp_datos_text
0,0,,,Just happened a terrible car crash
1,2,,,"Heard about #earthquake is different cities, s..."
2,3,,,"there is a forest fire at spot pond, geese are..."
3,9,,,Apocalypse lighting. #Spokane #wildfires
4,11,,,Typhoon Soudelor kills 28 in China and Taiwan


In [18]:
test['clean_text']= clean_text(test)
x = v.transform(test['clean_text'])
tf = pd.DataFrame.sparse.from_spmatrix(x)
tf.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,22654,22655,22656,22657,22658,22659,22660,22661,22662,22663
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [19]:
tf.columns = columnas_tf

In [20]:
tf.head()

Unnamed: 0,aa,aba,aba as,aba as woman,abandon,abandoned,abandoned aircraft,abandoned aircraft at,abbott,abbswinston,...,zone,zone coming,zone coming soon,zone of,zouma,zouma has,zouma has just,zss,zss vs,zss vs sws
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
test = pd.concat([test, tf], axis=1, sort=False)
del test['tp_datos_keyword']
del test['tp_datos_location']
del test['tp_datos_text']
del test['clean_text']
test.set_index('tp_datos_id', inplace=True)
test.head()

Unnamed: 0_level_0,aa,aba,aba as,aba as woman,abandon,abandoned,abandoned aircraft,abandoned aircraft at,abbott,abbswinston,...,zone,zone coming,zone coming soon,zone of,zouma,zouma has,zouma has just,zss,zss vs,zss vs sws
tp_datos_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [22]:
train.head()

Unnamed: 0,tp_datos_id,tp_datos_keyword,tp_datos_location,tp_datos_text,target,clean_text,aa,aba,aba as,aba as woman,...,zone,zone coming,zone coming soon,zone of,zouma,zouma has,zouma has just,zss,zss vs,zss vs sws
0,1,,,Our Deeds are the Reason of this #earthquake M...,1,our deeds are the reason of this earthquake m...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,4,,,Forest fire near La Ronge Sask. Canada,1,forest fire near la ronge sask canada,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,5,,,All residents asked to 'shelter in place' are ...,1,all residents asked to shelter in place are b...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,6,,,"13,000 people receive #wildfires evacuation or...",1,people receive wildfires evacuation orders in...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1,just got sent this photo from ruby alaska as ...,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [23]:
del train['tp_datos_keyword']
del train['tp_datos_location']
del train['tp_datos_text']
del train['clean_text']
train.set_index('tp_datos_id', inplace=True)

### Entrenamiento del set de datos

In [24]:
X, y = train.iloc[:,train.columns != 'target'], train.iloc[:,0]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15, random_state=42)
X_train

Unnamed: 0_level_0,aa,aba,aba as,aba as woman,abandon,abandoned,abandoned aircraft,abandoned aircraft at,abbott,abbswinston,...,zone,zone coming,zone coming soon,zone of,zouma,zouma has,zouma has just,zss,zss vs,zss vs sws
tp_datos_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [25]:
from sklearn.feature_selection import SelectPercentile, chi2
selec = SelectPercentile(chi2, percentile=30).fit(X_train, y_train)
X_new = selec.transform(X_train)
seleccionadas = pd.DataFrame(data=X_new)
seleccionadas

Unnamed: 0,0
0,"(0, 10)\t0.15690021369002807\n (0, 857)\t0...."
1,"(0, 193)\t0.09925984979465946\n (0, 800)\t0..."
2,"(0, 380)\t0.4164093981802417\n (0, 580)\t0...."
3,"(0, 542)\t0.2681729469829434\n (0, 3924)\t0..."
4,"(0, 934)\t0.1542489013456045\n (0, 941)\t0...."
...,...
6466,"(0, 173)\t0.20777574288690842\n (0, 353)\t0..."
6467,"(0, 144)\t0.18655071053405575\n (0, 147)\t0..."
6468,"(0, 213)\t0.12607098955469676\n (0, 679)\t0..."
6469,"(0, 73)\t0.10366425834141525\n (0, 137)\t0...."


In [26]:
cols = selec.get_support(indices=True)
cols
features = X.columns[cols]
features

Index(['abandoned', 'abandoned aircraft', 'abbswinston', 'abbswinston zionist',
       'abbswinston zionist terrorist', 'abc', 'abc news', 'abc online',
       'ablaze in', 'abomb',
       ...
       'yyc', 'yyc abstorm', 'yyc yycstorm', 'yycstorm', 'zionism', 'zionist',
       'zionist terrorist', 'zionist terrorist demolish', 'zone', 'zouma'],
      dtype='object', length=6799)

In [27]:
X_train = X_train.filter(items = features)
X_test = X_test.filter(items = features)

#### Red

In [28]:
import pandas as pd
import re
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras import layers
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split

In [29]:
X_train

Unnamed: 0_level_0,abandoned,abandoned aircraft,abbswinston,abbswinston zionist,abbswinston zionist terrorist,abc,abc news,abc online,ablaze in,abomb,...,yyc,yyc abstorm,yyc yycstorm,yycstorm,zionism,zionist,zionist terrorist,zionist terrorist demolish,zone,zouma
tp_datos_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3334,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9064,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
797,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9325,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7470,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7691,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1242,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10862,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [30]:
rango = X_train.shape[1] 
model = Sequential()
model.add(layers.Dense(1024, input_dim=rango, activation='relu'))
model.add(Dropout(0.25))
#model.add(layers.Dense(512, input_dim=rango, activation='relu'))
#model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [31]:
optimizer=Adam(learning_rate=1e-5)
model.compile(loss='binary_crossentropy', 
              optimizer=optimizer, 
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              6963200   
_________________________________________________________________
dropout (Dropout)            (None, 1024)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 1025      
Total params: 6,964,225
Trainable params: 6,964,225
Non-trainable params: 0
_________________________________________________________________


In [32]:
X_train = np.asarray(X_train).astype(np.float32)
X_test = np.asarray(X_test).astype(np.float32)

In [33]:
history = model.fit(X_train, y_train,
                    epochs=25,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


In [34]:
X = X.filter(items = features)
train_pred = model.predict(X)
train_pred_int = train_pred

In [35]:
train['tfidf_net_tri'] = train_pred_int
train = train['tfidf_net_tri'].to_frame()
train.head()

Unnamed: 0_level_0,tfidf_net_tri
tp_datos_id,Unnamed: 1_level_1
1,0.607627
4,0.934687
5,0.70597
6,0.953801
7,0.558009


In [36]:
train['tfidf_net_tri'].value_counts()

0.412476    60
0.992747    24
0.998060    20
0.997498    18
0.194395    17
            ..
0.356716     1
0.385207     1
0.341092     1
0.760315     1
0.070312     1
Name: tfidf_net_tri, Length: 6721, dtype: int64

In [37]:
train.to_csv('csv/train_TFIDF_trigramas.csv')

In [38]:
test = test.filter(items = features)
test.head()

Unnamed: 0_level_0,abandoned,abandoned aircraft,abbswinston,abbswinston zionist,abbswinston zionist terrorist,abc,abc news,abc online,ablaze in,abomb,...,yyc,yyc abstorm,yyc yycstorm,yycstorm,zionism,zionist,zionist terrorist,zionist terrorist demolish,zone,zouma
tp_datos_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [39]:
test_pred = model.predict(test)
test_pred_int = test_pred

In [40]:
test['tfidf_net_tri'] = test_pred_int
test = test['tfidf_net_tri'].to_frame()
test.head()

Unnamed: 0_level_0,tfidf_net_tri
tp_datos_id,Unnamed: 1_level_1
0,0.610922
2,0.454667
3,0.786749
9,0.559148
11,0.948272


In [41]:
test['tfidf_net_tri'].value_counts()

0.412476    31
0.533806    11
0.992747    10
0.279990    10
0.194395     8
            ..
0.093291     1
0.419251     1
0.178375     1
0.417295     1
0.220703     1
Name: tfidf_net_tri, Length: 3012, dtype: int64

In [42]:
test.to_csv('csv/test_TFIDF_trigamas.csv')