In [2]:
# Print the installed TensorFlow version so the run environment is recorded.
import tensorflow as tf
print(tf.__version__)

2.3.0


In [3]:
# All general imports
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer 

import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Input, Embedding, Reshape, Conv2D, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Bidirectional, GlobalAveragePooling1D, GRU, GlobalMaxPooling1D, concatenate
from keras.optimizers import Adam
from keras.layers import LSTM, GRU, Conv1D, MaxPool1D, Activation

from keras.models import Model, Sequential
from keras.layers.core import SpatialDropout1D

from keras.engine.topology import Layer
from keras.layers import Dense, Input, Embedding, Dropout, Activation, Conv1D, Softmax
from keras import initializers, regularizers, constraints, optimizers, layers
from keras import backend as K

from keras.callbacks import EarlyStopping

from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import classification_report, accuracy_score
import io, os, gc

In [5]:
#################### Importing ByteDance Datasets ####################
# Later cells filter on an integer-encoded `bd_label` column in both frames.
# The column listings printed by this cell show the CSVs exposing `label`
# (train) and `Expected` (test) rather than `bd_label`, so those are used as
# the source when `bd_label` is absent instead of failing with a KeyError.
# NOTE(review): confirm which label column the CSVs on disk actually carry.

# Train set
train_df = pd.read_csv('../ByteDance_Dataset/train.csv')
print(train_df.columns)
le = LabelEncoder()
train_label_col = 'bd_label' if 'bd_label' in train_df.columns else 'label'
train_df['bd_label'] = le.fit_transform(train_df[train_label_col])

# Test set: reuse the encoder fitted on train so class ids agree across splits.
test_df = pd.read_csv('../ByteDance_Dataset/test_merged.csv')
print(test_df.columns)
test_label_col = 'bd_label' if 'bd_label' in test_df.columns else 'Expected'
test_df['bd_label'] = le.transform(test_df[test_label_col])
test_df.head()

Index(['id', 'tid1', 'tid2', 'title1_zh', 'title2_zh', 'title1_en',
       'title2_en', 'label'],
      dtype='object')
Index(['id', 'tid1', 'tid2', 'title1_zh', 'title2_zh', 'title1_en',
       'title2_en', 'Expected', 'Weight', 'Usage'],
      dtype='object')


Unnamed: 0,id,tid1,tid2,title1_zh,title2_zh,title1_en,title2_en,Expected,Weight,Usage
0,321187,167562,59521,萨拉赫人气爆棚!埃及总统大选未参选获百万选票 现任总统压力山大,辟谣！里昂官方否认费基尔加盟利物浦，难道是价格没谈拢？,egypt 's presidential election failed to win m...,Lyon! Lyon officials have denied that Felipe F...,2,0.0625,Private
1,321190,167564,91315,萨达姆被捕后告诫美国的一句话，发人深思,10大最让美国人相信的荒诞谣言，如蜥蜴人掌控着美国,A message from Saddam Hussein after he was cap...,The Top 10 Americans believe that the Lizard M...,2,0.0625,Public
2,321189,167563,167564,萨达姆此项计划没有此国破坏的话，美国还会对伊拉克发动战争吗,萨达姆被捕后告诫美国的一句话，发人深思,Will the United States wage war on Iraq withou...,A message from Saddam Hussein after he was cap...,2,0.0625,Private
3,321193,167564,160994,萨达姆被捕后告诫美国的一句话，发人深思,被绞刑处死的萨达姆是替身？他的此男人举动击破替身谣言！,A message from Saddam Hussein after he was cap...,The hanging Saddam is a surrogate? This man's ...,2,0.0625,Public
4,321191,167564,15084,萨达姆被捕后告诫美国的一句话，发人深思,中国川贝枇杷膏在美国受到热捧？纯属谣言！,A message from Saddam Hussein after he was cap...,Chinese loquat loquat plaster in America? Pure...,2,0.0625,Public


In [6]:
# Collect the distinct English titles of both columns, for train and test;
# `total_dataset` is the corpus the tokenizer is fitted on.
#
# De-duplication uses dict.fromkeys (keeps first-seen order) instead of set():
# iteration order of a set of strings can differ between interpreter sessions
# (string hash randomisation), which would change the order of the corpus and
# therefore, presumably, how the tokenizer orders equally frequent words.
train_lst_1 = train_df['title1_en'].tolist()
print(len(train_lst_1))
train_lst_2 = train_df['title2_en'].tolist()
print(len(train_lst_2))
uq_tr_1 = list(dict.fromkeys(train_lst_1))
uq_tr_2 = list(dict.fromkeys(train_lst_2))
print(len(uq_tr_1))
print(len(uq_tr_2))
# A title occurring in both columns is intentionally kept once per column.
train_merged = uq_tr_1 + uq_tr_2
print('Train Length is', len(train_merged))

test_lst_1 = test_df['title1_en'].tolist()
test_lst_2 = test_df['title2_en'].tolist()
uq_ts_1 = list(dict.fromkeys(test_lst_1))
uq_ts_2 = list(dict.fromkeys(test_lst_2))
test_merged = uq_ts_1 + uq_ts_2
print('Test merged', len(test_merged))

total_dataset = train_merged + test_merged
print('Dataset length is', len(total_dataset))

320552
320552
67869
136111
Train Length is 203980
Test merged 68725
Dataset length is 272705


In [7]:
# Defining the tokenizer
def get_tokenizer(vocabulary_size, texts=None):
  """Create a Keras ``Tokenizer`` and fit it on a corpus of sentences.

  Args:
    vocabulary_size: forwarded to ``Tokenizer(num_words=...)``.
    texts: optional list of strings to fit on. When omitted, the notebook-level
      ``total_dataset`` list is used, which is what existing callers rely on.

  Returns:
    The fitted ``Tokenizer``.
  """
  print('Training tokenizer...')
  if texts is None:
    # Backward-compatible default: fall back to the notebook-level corpus.
    texts = total_dataset
  tokenizer = Tokenizer(num_words=vocabulary_size)
  print('Read {} Sentences'.format(len(texts)))
  tokenizer.fit_on_texts(texts)
  return tokenizer

In [8]:
# For getting the embedding matrix
def get_embeddings():
  """Load GloVe vectors and build an embedding matrix indexed like the tokenizer.

  Reads ``../resources/glove.6B.300d.txt``; each line is split on whitespace
  into a word followed by its vector components. The number of vectors read is
  used both as the tokenizer's ``num_words`` and as the number of matrix rows.
  Rows of words that have no vector in the file stay all-zero.

  Returns:
    ``(embedding_matrix, tokenizer)``: a ``(vocabulary_size, dim)`` NumPy array
    and the tokenizer returned by ``get_tokenizer``.
  """
  print('Generating embeddings matrix...')
  glove_path = '../resources/glove.6B.300d.txt'
  glove_vectors = {}
  with open(glove_path, 'r', encoding="utf-8") as handle:
    for raw_line in handle:
      parts = raw_line.split()
      glove_vectors[parts[0]] = np.asarray(parts[1:], "float32")

  # Matrix dimensions come from the embeddings file itself.
  vocabulary_size = len(glove_vectors)
  embedding_dim = list(glove_vectors.values())[0].shape[0]
  print('Vocabulary = {}, embeddings = {}'.format(vocabulary_size, embedding_dim))

  tokenizer = get_tokenizer(vocabulary_size)
  embedding_matrix = np.zeros((vocabulary_size, embedding_dim))

  word_index = tokenizer.word_index
  total = len(word_index.items())
  matched = 0
  for word, index in word_index.items():
    if index >= vocabulary_size:
      # Index would fall outside the matrix: report the word and skip it.
      print(word, index)
      continue
    vector = glove_vectors.get(word)
    if vector is None:
      continue
    embedding_matrix[index] = vector
    matched += 1

  print('Considered ', matched, 'Left ', total - matched)
  return embedding_matrix, tokenizer

In [9]:
def get_data(tokenizer, MAX_LENGTH, input_df, train=True):
  """Turn the two English title columns of a frame into padded id matrices.

  Args:
    tokenizer: fitted tokenizer providing ``texts_to_sequences``.
    MAX_LENGTH: ``maxlen`` handed to ``pad_sequences`` for both columns.
    input_df: frame with ``title1_en`` and ``title2_en`` plus a label column.
    train: when truthy labels are read from ``label``, otherwise from
      ``Expected``.

  Returns:
    ``(X1, X2, Y)``: the padded sequences of ``title1_en`` and ``title2_en``,
    and the label column as a NumPy array.
  """
  print('Loading data')
  titles_1 = input_df['title1_en'].tolist()
  titles_2 = input_df['title2_en'].tolist()
  label_column = 'label' if train else 'Expected'
  labels = input_df[label_column].tolist()

  assert len(titles_1) == len(titles_2) == len(labels)

  # Same tokenise-then-pad treatment for both title columns.
  X1, X2 = (
      pad_sequences(tokenizer.texts_to_sequences(titles), maxlen=MAX_LENGTH)
      for titles in (titles_1, titles_2)
  )
  return X1, X2, np.array(labels)

In [10]:
# Build the embedding matrix and the fitted tokenizer used by every later cell.
embedding_matrix, tokenizer = get_embeddings()

Generating embeddings matrix...
Vocabulary = 400000, embeddings = 300
Training tokenizer...
Read 272705 Sentences
Considered  32590 Left  14439


In [11]:
# Padding length handed to get_data (used as pad_sequences' maxlen).
MAX_LENGTH = 20
# Tokenise and pad both title columns of the training frame; with the default
# train=True, Y comes from the 'label' column.
X1, X2, Y = get_data(tokenizer, MAX_LENGTH, train_df)

Loading data


In [12]:
# Same preprocessing for the test frame; train=False makes get_data read the
# labels from the 'Expected' column.
X1_test, X2_test, Y_test = get_data(tokenizer, MAX_LENGTH, test_df, train = False)

Loading data


In [13]:
# Sanity check: shape of the training label array returned by get_data.
print(Y.shape)

(320552,)


In [14]:
# Sanity check: container type and shape of the padded title-1 matrix.
print(type(X1))
X1.shape

<class 'numpy.ndarray'>


(320552, 20)

In [15]:
# Binarise the labels of the full training set and show the encoded matrix.
encoder = LabelBinarizer()
Y_enc = encoder.fit_transform(Y)
print(Y_enc)

[[0 0 1]
 [0 0 1]
 [0 0 1]
 ...
 [0 0 1]
 [0 0 1]
 [0 0 1]]


In [16]:
# Rows whose encoded label is 2 are removed from both the train and the test
# arrays, so that only the two remaining classes are modelled.

# --- Train ---
result = np.where(train_df['bd_label'] == 2)
train_drop_idx = result[0]
reduced_X1, reduced_X2 = (
    np.delete(matrix, train_drop_idx, axis=0) for matrix in (X1, X2)
)
print('Train shape', reduced_X1.shape)
reduced_train_labels = np.delete(train_df['bd_label'].values, train_drop_idx)
print('Train labels', reduced_train_labels)
assert len(reduced_X1) == len(reduced_X2) == len(reduced_train_labels)

# --- Test (per-row weights are filtered with the same indices) ---
result_test = np.where(test_df['bd_label']==2)
test_drop_idx = result_test[0]
reduced_X1_test, reduced_X2_test = (
    np.delete(matrix, test_drop_idx, axis=0) for matrix in (X1_test, X2_test)
)
print('Test shape', reduced_X1_test.shape)
reduced_test_labels = np.delete(test_df['bd_label'].values, test_drop_idx)
reduced_test_weights = np.delete(test_df['Weight'].values, test_drop_idx)
print('Test labels', reduced_test_labels)
assert len(reduced_X1_test) == len(reduced_X2_test) == len(reduced_test_labels)

Train shape (101239, 20)
Train labels [0 0 0 ... 1 0 0]
Test shape (28746, 20)
Test labels [0 0 0 ... 0 0 0]


In [17]:
# Inspect the filtered training labels, then binarise train and test labels
# with one encoder fitted on the training labels.
print(type(reduced_train_labels), reduced_train_labels.shape, sep='\n')
encoder = LabelBinarizer()
Y_enc = encoder.fit_transform(reduced_train_labels)
Y_enc_test = encoder.transform(reduced_test_labels)
print(Y_enc, Y_enc_test, sep='\n')

<class 'numpy.ndarray'>
(101239,)
[[0]
 [0]
 [0]
 ...
 [1]
 [0]
 [0]]
[[0]
 [0]
 [0]
 ...
 [0]
 [0]
 [0]]


In [18]:
# One-hot encode the filtered labels for the softmax output.
# The number of columns is fixed from the training labels and passed
# explicitly, so train and test targets always have the same width even if the
# highest class id happens to be missing from the test labels.
num_label_classes = int(reduced_train_labels.max()) + 1
y_train = keras.utils.to_categorical(reduced_train_labels, num_classes=num_label_classes)
print(y_train)
y_test = keras.utils.to_categorical(reduced_test_labels, num_classes=num_label_classes)
print(y_test)

[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [0. 1.]
 [1. 0.]
 [1. 0.]]
[[1. 0.]
 [1. 0.]
 [1. 0.]
 ...
 [1. 0.]
 [1. 0.]
 [1. 0.]]


In [19]:
from sklearn.model_selection import train_test_split
VALIDATION_RATIO = 0.1
RANDOM_STATE = 9527
# Hold out VALIDATION_RATIO of the filtered training pairs for validation.
# - The two title matrices are split together: the second array must be
#   reduced_X2, otherwise x2_train / x2_val are just copies of title 1 and the
#   model never sees the second title during training.
# - The one-hot targets are rebuilt from reduced_train_labels here instead of
#   being read from y_train, because this cell rebinds y_train; re-running it
#   would otherwise try to split an already-split array and fail on a length
#   mismatch.
x1_train, x1_val, \
x2_train, x2_val, \
y_train, y_val = \
    train_test_split(
        reduced_X1, reduced_X2,
        keras.utils.to_categorical(reduced_train_labels),
        test_size=VALIDATION_RATIO,
        random_state=RANDOM_STATE
)

In [20]:
# Report the shapes of every split. The test section lists the filtered test
# arrays so the "Test Set" heading is no longer left empty.
print("Training Set")
print("-" * 10)
print(f"x1_train: {x1_train.shape}")
print(f"x2_train: {x2_train.shape}")
print(f"y_train : {y_train.shape}")

print("-" * 10)
print(f"x1_val:   {x1_val.shape}")
print(f"x2_val:   {x2_val.shape}")
print(f"y_val :   {y_val.shape}")
print("-" * 10)
print("Test Set")
print("-" * 10)
print(f"x1_test:  {reduced_X1_test.shape}")
print(f"x2_test:  {reduced_X2_test.shape}")
print(f"y_test :  {y_test.shape}")

Training Set
----------
x1_train: (91115, 20)
x2_train: (91115, 20)
y_train : (91115, 2)
----------
x1_val:   (10124, 20)
x2_val:   (10124, 20)
y_val :   (10124, 2)
----------
Test Set


In [21]:
# Model hyper-parameters.

# Width of the softmax output layer.
NUM_CLASSES = 2

# Input length of the model. It has to equal the padding length used when the
# sequences were built, so it is taken from MAX_LENGTH rather than repeating
# the literal (the two would silently drift apart if only one were edited).
MAX_SEQUENCE_LENGTH = MAX_LENGTH

NUM_LSTM_UNITS = 128

# Embedding table dimensions follow the prepared embedding matrix.
MAX_NUM_WORDS = embedding_matrix.shape[0]

NUM_EMBEDDING_DIM = embedding_matrix.shape[1]

In [22]:
# Two-input classifier: both titles pass through the SAME embedding layer and
# the SAME LSTM (shared weights); the two encodings are concatenated and fed
# to a softmax layer with NUM_CLASSES units.

top_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ),
    dtype='int32')
bm_input = Input(
    shape=(MAX_SEQUENCE_LENGTH, ),
    dtype='int32')

# Initialise the shared embedding from embedding_matrix. Without this the
# matrix built by get_embeddings() is never used and the layer starts from
# random weights. The layer is left trainable (Keras default), as before.
# NOTE(review): consider trainable=False to freeze the pretrained vectors.
embedding_layer = Embedding(
    MAX_NUM_WORDS, NUM_EMBEDDING_DIM,
    embeddings_initializer=initializers.Constant(embedding_matrix))
top_embedded = embedding_layer(
    top_input)
bm_embedded = embedding_layer(
    bm_input)

# One LSTM instance applied to both inputs.
shared_lstm = LSTM(NUM_LSTM_UNITS)
top_output = shared_lstm(top_embedded)
bm_output = shared_lstm(bm_embedded)

merged = concatenate(
    [top_output, bm_output],
    axis=-1)

dense =  Dense(
    units=NUM_CLASSES,
    activation='softmax')
predictions = dense(merged)

model = Model(
    inputs=[top_input, bm_input],
    outputs=predictions)

model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, 20)]         0                                            
__________________________________________________________________________________________________
embedding (Embedding)           (None, 20, 300)      120000000   input_1[0][0]                    
                                                                 input_2[0][0]                    
__________________________________________________________________________________________________
lstm (LSTM)                     (None, 128)          219648      embedding[0][0]       

In [23]:
from keras.optimizers import Adam
# Adam with an explicit learning rate and decay.
lr = 1e-3
opt = Adam(lr=lr, decay=lr/50)
# Pass the configured optimizer object. The string 'adam' would build a fresh
# default Adam and silently discard the learning-rate/decay settings above.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'])

In [24]:
BATCH_SIZE = 512
NUM_EPOCHS = 50
# EarlyStopping's `patience` is a number of epochs; a fractional value such as
# 0.001 can only ever mean "stop at the first epoch without improvement".
# The 0.001 threshold is therefore expressed as `min_delta` (smallest val_loss
# change that counts as an improvement) and patience is an explicit integer
# that keeps the stop-at-first-non-improving-epoch behaviour.
EARLY_STOP_MIN_DELTA = 0.001
EARLY_STOP_PATIENCE = 1
stop = [EarlyStopping(monitor='val_loss',
                      min_delta=EARLY_STOP_MIN_DELTA,
                      patience=EARLY_STOP_PATIENCE)]
history = model.fit(x=[x1_train, x2_train],
                    y=y_train,
                    batch_size=BATCH_SIZE,
                    epochs=NUM_EPOCHS,
                    validation_data=(
                      [x1_val, x2_val],
                      y_val
                    ),
                    shuffle=True,
                    callbacks=stop,
          )

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50


In [25]:
# `metrics` is used by the scoring cell that follows.
from sklearn import metrics
from sklearn.metrics import classification_report
# Run the model on the filtered test pairs; the output layer is a softmax, so
# `predictions` holds one row of class scores per pair.
predictions = model.predict(
    [reduced_X1_test, reduced_X2_test])
# Weighted accuracy and the classification report are computed in the next cell.

In [None]:
# Pick the highest-scoring class per test pair, then report accuracy (as a
# percentage) and the per-class report, both weighted by the test-row weights.
y_pred = list(np.argmax(predictions, axis=1))
print('Accuracy is')
weighted_accuracy = metrics.accuracy_score(
    reduced_test_labels,
    y_pred,
    sample_weight = reduced_test_weights,
)
print(weighted_accuracy*100)
report = classification_report(
    reduced_test_labels,
    y_pred,
    target_names = ['agreed', 'disagreed'],
    sample_weight = reduced_test_weights,
)
print(report)