In [1]:
import pandas as pd
import re
import numpy as np
import tensorflow_hub as hub
import tensorflow as tf
from keras.models import Sequential
from keras.layers import Embedding, LSTM,Dense, SpatialDropout1D, Dropout
from keras import layers
from keras.optimizers import Adam
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [2]:
np.random.seed(13)

In [3]:
train = pd.read_csv('csv/train.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


In [4]:
duplicates_train = train[train.duplicated(['text'])].index #index de duplicados

In [5]:
train_sin_duplicados = train.drop_duplicates(['text'],keep='first')
train_sin_duplicados.shape

(7503, 5)

In [6]:
test = pd.read_csv('csv/test.csv')
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3263 entries, 0 to 3262
Data columns (total 4 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        3263 non-null   int64 
 1   keyword   3237 non-null   object
 2   location  2158 non-null   object
 3   text      3263 non-null   object
dtypes: int64(1), object(3)
memory usage: 102.1+ KB


In [7]:
len_elmo = 1024
len_bert = 768
len_w2v = 300
len_glove = 200

#### BERT

In [8]:
bert_train = np.loadtxt("train_embedding_file_bert.txt").reshape(len(train.index),len_bert)
bert_train.shape

(7613, 768)

In [9]:
bert_train_sin_dupl = np.delete(bert_train, duplicates_train, axis=0)

In [10]:
bert_train_sin_dupl.shape

(7503, 768)

In [11]:
bert_test = np.loadtxt("test_embedding_file_bert.txt").reshape(len(test.index),len_bert)
bert_test.shape

(3263, 768)

#### ELMo

In [12]:
elmo_train = np.loadtxt("train_embedding_file_elmo.txt").reshape(len(train.index),len_elmo)
elmo_train.shape

(7613, 1024)

In [13]:
elmo_train_sin_dupl = np.delete(elmo_train, duplicates_train, axis=0)

In [24]:
elmo_train_sin_dupl.shape

(7503, 1024)

In [15]:
elmo_test = np.loadtxt("test_embedding_file_elmo.txt").reshape(len(test.index),len_elmo)
elmo_test.shape

(3263, 1024)

#### W2V

In [16]:
elmo_train = np.loadtxt("train_embedding_file_w2v(norm).txt").reshape(len(train.index),len_elmo)
w2v_train.shape

(7613, 300)

In [17]:
w2v_train_sin_dupl = np.delete(w2v_train, duplicates_train, axis=0)

In [18]:
w2v_train_sin_dupl.shape

(7503, 300)

In [19]:
w2v_test = np.loadtxt("test_embedding_file_w2v(norm).txt").reshape(len(test.index),len_w2v)
w2v_test.shape

(3263, 300)

#### Glove

In [20]:
glove_train = np.loadtxt("train_embedding_file_glove.txt").reshape(len(train.index),len_glove)
glove_train.shape

(7613, 200)

In [21]:
glove_train_sin_dupl = np.delete(glove_train, duplicates_train, axis=0)

In [23]:
glove_train_sin_dupl.shape

(7503, 200)

In [25]:
glove_test = np.loadtxt("test_embedding_file_glove.txt").reshape(len(test.index),len_glove)
glove_test.shape

(3263, 200)

# BERT

In [26]:
train = pd.read_csv('csv/train.csv')
test = pd.read_csv('csv/test.csv')

In [27]:
X, y = bert_train_sin_dupl, train_sin_duplicados['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [28]:
model = Sequential()
# model.add(layers.Dense(round(len_bert*.75), input_dim=len_bert, activation='relu'))
# model.add(Dropout(0.25))
# model.add(layers.Dense(round(len_bert/2), input_dim=len_bert, activation='relu'))
# model.add(Dropout(0.25))
model.add(layers.Dense(1, input_dim=len_bert, activation='sigmoid'))

In [29]:
optimizer=Adam(learning_rate=5e-5)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1)                 769       
Total params: 769
Trainable params: 769
Non-trainable params: 0
_________________________________________________________________


In [30]:
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [31]:
test_pred = model.predict(bert_test)
test_pred_int = test_pred.round().astype('int')

In [32]:
test['bert_net'] = test_pred
test = test['bert_net'].to_frame()
test.head()

Unnamed: 0,bert_net
0,0.955032
1,0.966348
2,0.712088
3,0.996635
4,0.971992


In [33]:
test.to_csv('csv/test_red_bert_sin_dupl.csv')

In [34]:
train_pred = model.predict(bert_train)
train_pred_int = train_pred.round().astype('int')

In [35]:
train['bert_net'] = train_pred
train = train['bert_net'].to_frame()
train.to_csv('csv/train_red_bert_sin_dupl.csv')

# ELMo

In [36]:
train = pd.read_csv('csv/train.csv')
test = pd.read_csv('csv/test.csv')
X, y = elmo_train_sin_dupl, train_sin_duplicados['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [37]:
model = Sequential()
model.add(layers.Dense(round(len_elmo/2), input_dim=len_elmo, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [38]:
optimizer=Adam(learning_rate=5e-5)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
dropout (Dropout)            (None, 512)               0         
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 513       
Total params: 525,313
Trainable params: 525,313
Non-trainable params: 0
_________________________________________________________________


In [39]:
history = model.fit(X_train, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [40]:
test_pred = model.predict(elmo_test)
test_pred_int = test_pred.round().astype('int')

In [41]:
test['elmo_net'] = test_pred
test = test['elmo_net'].to_frame()
test.head()

Unnamed: 0,elmo_net
0,0.524576
1,0.816625
2,0.596729
3,0.923364
4,0.947302


In [42]:
test.to_csv('csv/test_red_elmo_sin_dupl.csv')

In [43]:
train_pred = model.predict(elmo_train)
train_pred_int = train_pred.round().astype('int')

In [44]:
train['elmo_net'] = train_pred
train = train['elmo_net'].to_frame()
train.to_csv('csv/train_red_elmo_dupl.csv')

# W2V

In [45]:
train = pd.read_csv('csv/train.csv')
test = pd.read_csv('csv/test.csv')
X, y = w2v_train_sin_dupl, train_sin_duplicados['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [46]:
model = Sequential()
model.add(layers.Dense(round(len_w2v/2), input_dim=len_w2v, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(round(len_w2v/4), input_dim=len_w2v, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [47]:
optimizer=Adam(learning_rate=5e-5)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_3 (Dense)              (None, 150)               45150     
_________________________________________________________________
dropout_1 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense_4 (Dense)              (None, 75)                11325     
_________________________________________________________________
dropout_2 (Dropout)          (None, 75)                0         
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 76        
Total params: 56,551
Trainable params: 56,551
Non-trainable params: 0
_________________________________________________________________


In [48]:
history = model.fit(X_train, y_train,
                    epochs=10,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [49]:
test_pred = model.predict(w2v_test)
test_pred_int = test_pred.round().astype('int')

In [50]:
test['w2v_net'] = test_pred
test = test['w2v_net'].to_frame()
test.head()

Unnamed: 0,w2v_net
0,0.977497
1,0.88483
2,0.912146
3,0.999208
4,0.999996


In [51]:
test.to_csv('csv/test_red_w2v_sin_dupl.csv')

In [52]:
train_pred = model.predict(w2v_train)
train_pred_int = train_pred.round().astype('int')

In [53]:
train['w2v_net'] = train_pred
train = train['w2v_net'].to_frame()
train.to_csv('csv/train_red_w2v_sin_dupl.csv')

# GLOve

In [54]:
train = pd.read_csv('csv/train.csv')
test = pd.read_csv('csv/test.csv')
X, y = glove_train_sin_dupl, train_sin_duplicados['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=13)

In [55]:
model = Sequential()
model.add(layers.Dense(round(len_glove/2), input_dim=len_glove, activation='relu'))
model.add(Dropout(0.25))
model.add(layers.Dense(1, activation='sigmoid'))

In [56]:
optimizer=Adam(learning_rate=5e-5)
model.compile(loss='binary_crossentropy', 
              optimizer='adam', 
              metrics=['accuracy'])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense_6 (Dense)              (None, 100)               20100     
_________________________________________________________________
dropout_3 (Dropout)          (None, 100)               0         
_________________________________________________________________
dense_7 (Dense)              (None, 1)                 101       
Total params: 20,201
Trainable params: 20,201
Non-trainable params: 0
_________________________________________________________________


In [57]:
history = model.fit(X_train, y_train,
                    epochs=5,
                    verbose=True,
                    validation_data=(X_test, y_test),
                    batch_size=10)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [58]:
test_pred = model.predict(glove_test)
test_pred_int = test_pred.round().astype('int')

In [59]:
test['glove_net'] = test_pred
test = test['glove_net'].to_frame()
test.head()

Unnamed: 0,glove_net
0,0.629792
1,0.84402
2,0.600754
3,0.94331
4,0.998743


In [60]:
test.to_csv('csv/test_red_glove_sin_dupl.csv')

In [61]:
train_pred = model.predict(glove_train)
train_pred_int = train_pred.round().astype('int')

In [62]:
train['glove_net'] = train_pred
train = train['glove_net'].to_frame()
train.to_csv('csv/train_red_glove_sin_dupl.csv')