In [0]:
# import necessary libraries
import numpy as np
import json
import csv
import pandas as pd
import tensorflow as tf
from tensorflow.python.keras.preprocessing import text, sequence
from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Activation, Dropout, Dense
from tensorflow.python.keras.models import load_model

In [0]:
# Read the training file and turn it into the two arrays the network needs:
#   x : 1-D array holding the raw "body" text of every sample
#   y : (n_samples, 12) multi-hot matrix, one column per emotion

with open("/content/sample_data/nlp_train.json") as data_file:
  data = json.load(data_file)

# Column order of the label matrix. It mirrors `list_of_classes` from the
# parameters cell so that prediction columns line up with the training labels.
emotion_columns = ["anger", "anticipation", "disgust", "fear", "joy", "love",
                   "optimism", "pessimism", "sadness", "surprise", "trust", "neutral"]

# Collect texts and label dicts in file order. The previous `insert(-1, ...)`
# placed each new item *before* the last element, which rotated the first
# sample to the end of the list; plain appending keeps the natural order.
x_pandas_df = [v["body"] for v in data.values()]
y_pandas_df = [v["emotion"] for v in data.values()]

# One row per sample, sized from the data itself rather than a hard-coded
# 1493, so a training file of any length works. A column is 1 when the
# corresponding emotion flag is truthy and 0 otherwise.
ydf = [[1 if a[emotion] else 0 for emotion in emotion_columns] for a in y_pandas_df]

x = np.asarray(x_pandas_df)
y = np.asarray(ydf)

In [3]:
# Sanity check on the label array before it is handed to the network:
# report its Python type, rank, shape, element count, dtype and bytes per element.
print("properties of y")
y_report = "type : {}, dimensions : {}, shape : {}, total no. of elements : {}, data type of each element: {}, size of each element {} bytes"
y_facts = (type(y), y.ndim, y.shape, y.size, y.dtype, y.itemsize)
print(y_report.format(*y_facts))

properties of y
type : <class 'numpy.ndarray'>, dimensions : 2, shape : (1493, 12), total no. of elements : 17916, data type of each element: int64, size of each element 8 bytes


In [0]:
# Model hyper-parameters. The values are arbitrary starting points; tune them
# freely to improve learning accuracy.

# Emotion labels, in the column order used for the targets and predictions.
list_of_classes = [
  'anger', 'anticipation', 'disgust', 'fear',
  'joy', 'love', 'optimism', 'pessimism',
  'sadness', 'surprise', 'trust', 'neutral',
]

# Text representation
max_features = 20000     # tokenizer `num_words` and embedding input size
max_text_length = 400    # every sequence is padded to this many tokens
embedding_dims = 50      # embedding output size

# Network shape
num_filters_1 = num_filters_2 = 250   # Conv1D filters / hidden Dense units
filter_size = 3                       # Conv1D kernel size

# Training loop
batch_size = 32
epochs = 5

In [0]:
# Fit a tokenizer on the training texts (vocabulary limited through
# `num_words=max_features`), convert every text to a sequence of word indices,
# and pad the sequences to `max_text_length` so they form a rectangular array.
x_tokenizer = text.Tokenizer(num_words=max_features)
x_tokenizer.fit_on_texts(texts=list(x))
x_tokenized = x_tokenizer.texts_to_sequences(texts=x)
x_train_val = sequence.pad_sequences(
  sequences=x_tokenized,
  maxlen=max_text_length,
)

In [0]:
# Model definition, listed layer by layer in the order data flows through it:
# embedding -> dropout -> 1-D convolution -> global max pooling
# -> hidden dense layer (dropout, then ReLU) -> 12 sigmoid outputs.
model = Sequential([
  # Token indices -> dense vectors
  Embedding(max_features, embedding_dims, input_length=max_text_length),
  Dropout(0.2),

  # Convolution over the token axis, then keep the maximum of each filter
  Conv1D(filters=num_filters_1, kernel_size=filter_size, padding='valid', activation='relu', strides=1),
  GlobalMaxPooling1D(),

  # Hidden fully-connected layer
  Dense(num_filters_2),
  Dropout(0.2),
  Activation('relu'),

  # One independent sigmoid output per emotion class
  Dense(12),
  Activation('sigmoid'),
])

In [7]:
# Configure training: binary cross-entropy loss (paired with the sigmoid
# outputs), Adam optimizer, accuracy reported as the metric. Then print the
# layer-by-layer summary.
model.compile(
  loss='binary_crossentropy',
  optimizer='adam',
  metrics=['accuracy'],
)
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 400, 50)           1000000   
_________________________________________________________________
dropout (Dropout)            (None, 400, 50)           0         
_________________________________________________________________
conv1d (Conv1D)              (None, 398, 250)          37750     
_________________________________________________________________
global_max_pooling1d (Global (None, 250)               0         
_________________________________________________________________
dense (Dense)                (None, 250)               62750     
_________________________________________________________________
dropout_1 (Dropout)          (None, 250)               0         
_________________________________________________________________
activation (Activation)      (None, 250)               0

In [8]:
# Train on the padded training sequences and their multi-hot emotion labels,
# using the batch size and epoch count from the parameters cell.
model.fit(
  x=x_train_val,
  y=y,
  epochs=epochs,
  batch_size=batch_size,
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<tensorflow.python.keras.callbacks.History at 0x7f175f787a20>

In [0]:
# Persist the trained model to disk. Training is slow, so the test step
# reloads this file instead of re-running the training cells.
model.save(filepath='/content/sample_data/emo_class_keras_model.h5')

In [0]:
# our favorite part. testing the model with data
# output will be saved in solution.csv

def predict2(test_set,
             model_path='/content/sample_data/emo_class_keras_model.h5',
             submission_template="/content/sample_data/submission.csv",
             output_path="/content/sample_data/solution.csv"):
  """Predict emotion scores for a test file and write them to a CSV.

  The saved Keras model is loaded from `model_path`, the "body" text of every
  entry in `test_set` is tokenised with the notebook-level `x_tokenizer`
  (fitted on the training data) and padded to `max_text_length`, and the
  predicted scores are stored in the `list_of_classes` columns of the
  submission template before it is written to `output_path`.

  Args:
    test_set: path to a JSON file whose values are dicts with a "body" key.
    model_path: path of the saved model to load.
    submission_template: CSV used as the frame for the predictions.
      NOTE(review): assumed to list the test entries in the same order as
      `test_set` -- confirm against the data.
    output_path: where the filled-in CSV is written.

  Raises:
    FileNotFoundError: if any of the input files does not exist.
    ValueError: if the template's row count differs from the number of
      predictions.
  """
  model2 = load_model(model_path)

  with open(test_set) as data_file:
    data = json.load(data_file)

  # Keep the texts in file order. The earlier `insert(-1, ...)` put each new
  # item before the last element, so the first document ended up last and all
  # others moved up one row -- predictions no longer matched the template rows.
  x_test2 = [v["body"] for v in data.values()]
  print(len(x_test2))

  x_test_tokenized2 = x_tokenizer.texts_to_sequences(x_test2)
  print(type(x_test_tokenized2))
  x_testing2 = sequence.pad_sequences(x_test_tokenized2, maxlen=max_text_length)
  y_testing2 = model2.predict(x_testing2, verbose = 1)

  sample_submission = pd.read_csv(submission_template)
  # Fail with a clear message rather than a pandas shape error when the
  # template does not describe the same number of documents.
  if len(sample_submission) != len(y_testing2):
    raise ValueError(
      "submission template has {} rows but {} predictions were produced".format(
        len(sample_submission), len(y_testing2)))
  sample_submission[list_of_classes] = y_testing2
  sample_submission.to_csv(output_path, index=False)

In [11]:
predict2("/content/sample_data/nlp_test.json")

374
<class 'list'>


FileNotFoundError: ignored