In [47]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import PorterStemmer
from keras.preprocessing import text as keras_text, sequence as keras_seq
from keras.utils import np_utils
from keras.layers import Conv1D, GlobalMaxPooling1D
from keras.layers import Dense, Activation, Convolution1D, MaxPooling1D, Dropout, Flatten
from keras.models import Sequential, save_model
from sklearn.model_selection import train_test_split
from keras.layers import Embedding
from sklearn.metrics import classification_report,confusion_matrix

In [48]:
data_new=pd.read_csv('english_dataset.tsv', sep = '\t', encoding="latin-1")

In [49]:
data_new=data_new.dropna()

In [50]:
data_new.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5852 entries, 0 to 5851
Data columns (total 5 columns):
text_id    5852 non-null object
text       5852 non-null object
task_1     5852 non-null object
task_2     5852 non-null object
task_3     5852 non-null object
dtypes: object(5)
memory usage: 274.3+ KB


In [5]:
#data_new = data_new[data_new["task_2"] != "NONE"]

In [51]:
data_new.text=data_new.text.astype(str)

In [52]:
df2=data_new['text']
labels=data_new['task_1']

In [53]:
data_new=data_new[['text','task_1']]

In [54]:
from keras.preprocessing.text import one_hot

In [55]:
#df_text = data_new['text']
data_new

Unnamed: 0,text,task_1
0,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,NOT
1,@politico No. We should remember very clearly ...,HOF
2,@cricketworldcup Guess who would be the winner...,NOT
3,Corbyn is too politically intellectual for #Bo...,NOT
4,All the best to #TeamIndia for another swimmin...,NOT
5,@kellymiller513 @TheRealOJ32 I hope you rememb...,NOT
6,@ICC Latest design of #WC2019 trophy. #CWC2019...,NOT
7,#ADOS #trendingnow #blacklivesmatter #justice ...,HOF
8,Thanks for your support! Wow 600k. Graffiti ha...,NOT
9,By wearing the #BalidaanBadge over his gloves ...,NOT


In [56]:
data_new['task_1'] = data_new['task_1'].map({'NOT':0,'HOF': 1})
data_new

Unnamed: 0,text,task_1
0,#DhoniKeepsTheGlove | WATCH: Sports Minister K...,0
1,@politico No. We should remember very clearly ...,1
2,@cricketworldcup Guess who would be the winner...,0
3,Corbyn is too politically intellectual for #Bo...,0
4,All the best to #TeamIndia for another swimmin...,0
5,@kellymiller513 @TheRealOJ32 I hope you rememb...,0
6,@ICC Latest design of #WC2019 trophy. #CWC2019...,0
7,#ADOS #trendingnow #blacklivesmatter #justice ...,1
8,Thanks for your support! Wow 600k. Graffiti ha...,0
9,By wearing the #BalidaanBadge over his gloves ...,0


In [57]:
vocab_size = 4000000
# encode full sentence into vector
encoded_docs=[one_hot(d,vocab_size) for d in df2]
print (encoded_docs)

[[1860167, 1608835, 1447584, 2109541, 3013774, 2848127, 3198876, 2251471, 2969255, 1670938, 3851044, 2329086, 320872, 2239857, 45330, 3259750, 208706, 663141, 3609718, 376712, 1559807, 1118068, 1502871, 3719322, 1561944, 2334843, 3835657, 376712, 685495, 983675, 721201, 3720326, 398824, 3362676, 3922097, 1360942, 2006463, 3864345], [3805978, 672770, 1510605, 1427622, 1958293, 3056857, 663211, 1005718, 1060539, 3814395, 3253807, 208706, 915066, 1184859, 2637311, 137270], [3509699, 1193868, 724957, 2069389, 3168189, 376712, 2968923, 560197, 1137945, 3772252, 3146871, 724957, 1349261, 2352482, 2326398, 3375376, 376712, 251145, 1852086, 763763, 3174954, 3668033, 1502871], [219748, 398824, 580545, 2967330, 833176, 198823, 1068051, 2230279, 478859, 1753391, 3922097, 1360942, 2006463, 873], [2072288, 376712, 3319282, 208706, 1592176, 198823, 106082, 1621764, 2730450, 2375708, 3350630, 2856307, 1330318, 141004, 3174954, 3772252, 2867735, 2651329, 3922097, 1360942, 2006463, 2668292], [766506, 2

In [58]:
from keras.preprocessing.sequence import pad_sequences
# pad documents to a max length of 4 words
max_length = 100
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[1860167 1608835 1447584 ...       0       0       0]
 [3805978  672770 1510605 ...       0       0       0]
 [3509699 1193868  724957 ...       0       0       0]
 ...
 [2659634  243693 2344845 ...       0       0       0]
 [2399366 2664984 1024878 ...       0       0       0]
 [1843559 1137945 3168189 ...       0       0       0]]


In [59]:
from keras.utils import to_categorical
labels=data_new['task_1']
labels= to_categorical(labels,2)

In [60]:
x_train, x_test, y_train, y_test = train_test_split(padded_docs,labels, test_size=0.20, random_state=42)

In [61]:
model = Sequential()
model.add(Embedding(vocab_size, 8, input_length=max_length))
model.add( Convolution1D(8, 4 ))
model.add( Activation('relu') )

model.add( Convolution1D(16,4) )

model.add( Activation('relu'))
model.add(Convolution1D(64,4))
model.add( MaxPooling1D( pool_size=(4) ) )
model.add(Activation("relu"))
model.add(Convolution1D(64,4))
model.add(Activation("relu"))

model.add( Flatten() )

model.add( Dense(2) )

model.add( Activation('sigmoid') )

model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 100, 8)            32000000  
_________________________________________________________________
conv1d_9 (Conv1D)            (None, 97, 8)             264       
_________________________________________________________________
activation_11 (Activation)   (None, 97, 8)             0         
_________________________________________________________________
conv1d_10 (Conv1D)           (None, 94, 16)            528       
_________________________________________________________________
activation_12 (Activation)   (None, 94, 16)            0         
_________________________________________________________________
conv1d_11 (Conv1D)           (None, 91, 64)            4160      
_________________________________________________________________
max_pooling1d_3 (MaxPooling1 (None, 22, 64)            0         
__________

In [62]:
from keras.callbacks import ModelCheckpoint
filepath = "task1_cnn_onehot_100d_last.hdf5"
checkpoint = ModelCheckpoint(filepath, monitor="val_acc", verbose=1, save_best_only=True, mode='max')
callbacks_list = [checkpoint]

In [63]:
model.compile(loss='categorical_crossentropy',optimizer='adam',metrics=['acc'])

In [64]:
#hist=model.fit(x_train,y_train,epochs=100,batch_size=32,validation_data=(x_test, y_test))
hist=model.fit(x_train,y_train,epochs=100,batch_size=32,validation_data=(x_test, y_test),callbacks=callbacks_list)

Train on 4681 samples, validate on 1171 samples
Epoch 1/100

Epoch 00001: val_acc improved from -inf to 0.62681, saving model to task1_cnn_onehot_100d_last.hdf5
Epoch 2/100

Epoch 00002: val_acc improved from 0.62681 to 0.66610, saving model to task1_cnn_onehot_100d_last.hdf5
Epoch 3/100

Epoch 00003: val_acc did not improve from 0.66610
Epoch 4/100

Epoch 00004: val_acc did not improve from 0.66610
Epoch 5/100

Epoch 00005: val_acc did not improve from 0.66610
Epoch 6/100

Epoch 00006: val_acc did not improve from 0.66610
Epoch 7/100

Epoch 00007: val_acc did not improve from 0.66610
Epoch 8/100

Epoch 00008: val_acc did not improve from 0.66610
Epoch 9/100

Epoch 00009: val_acc did not improve from 0.66610
Epoch 10/100

Epoch 00010: val_acc did not improve from 0.66610
Epoch 11/100

Epoch 00011: val_acc did not improve from 0.66610
Epoch 12/100

Epoch 00012: val_acc did not improve from 0.66610
Epoch 13/100

Epoch 00013: val_acc did not improve from 0.66610
Epoch 14/100

Epoch 00014:


Epoch 00043: val_acc did not improve from 0.66610
Epoch 44/100

Epoch 00044: val_acc did not improve from 0.66610
Epoch 45/100

Epoch 00045: val_acc did not improve from 0.66610
Epoch 46/100

Epoch 00046: val_acc did not improve from 0.66610
Epoch 47/100

Epoch 00047: val_acc did not improve from 0.66610
Epoch 48/100

Epoch 00048: val_acc did not improve from 0.66610
Epoch 49/100

Epoch 00049: val_acc did not improve from 0.66610
Epoch 50/100

Epoch 00050: val_acc did not improve from 0.66610
Epoch 51/100

Epoch 00051: val_acc did not improve from 0.66610
Epoch 52/100

Epoch 00052: val_acc did not improve from 0.66610
Epoch 53/100

Epoch 00053: val_acc did not improve from 0.66610
Epoch 54/100

Epoch 00054: val_acc did not improve from 0.66610
Epoch 55/100

Epoch 00055: val_acc did not improve from 0.66610
Epoch 56/100

Epoch 00056: val_acc did not improve from 0.66610
Epoch 57/100

Epoch 00057: val_acc did not improve from 0.66610
Epoch 58/100

Epoch 00058: val_acc did not improve fr


Epoch 00086: val_acc did not improve from 0.66610
Epoch 87/100

Epoch 00087: val_acc did not improve from 0.66610
Epoch 88/100

Epoch 00088: val_acc did not improve from 0.66610
Epoch 89/100

Epoch 00089: val_acc did not improve from 0.66610
Epoch 90/100

Epoch 00090: val_acc did not improve from 0.66610
Epoch 91/100

Epoch 00091: val_acc did not improve from 0.66610
Epoch 92/100

Epoch 00092: val_acc did not improve from 0.66610
Epoch 93/100

Epoch 00093: val_acc did not improve from 0.66610
Epoch 94/100

Epoch 00094: val_acc did not improve from 0.66610
Epoch 95/100

Epoch 00095: val_acc did not improve from 0.66610
Epoch 96/100

Epoch 00096: val_acc did not improve from 0.66610
Epoch 97/100

Epoch 00097: val_acc did not improve from 0.66610
Epoch 98/100

Epoch 00098: val_acc did not improve from 0.66610
Epoch 99/100

Epoch 00099: val_acc did not improve from 0.66610
Epoch 100/100

Epoch 00100: val_acc did not improve from 0.66610


In [65]:
from keras.models import load_model
# load weights into new model
loaded_model= load_model("task1_cnn_onehot_100d_last.hdf5")

In [71]:
w=model.predict_classes(x_test)

In [72]:
from sklearn.metrics import classification_report,confusion_matrix

In [73]:
Y_actual=[]
for ix in y_test:
    Y_actual.append(np.argmax(ix))

In [74]:
print(classification_report(Y_actual,w))

             precision    recall  f1-score   support

          0       0.67      0.67      0.67       734
          1       0.44      0.43      0.44       437

avg / total       0.58      0.58      0.58      1171



In [75]:
data_test=pd.read_csv('hasoc2019_en_test.tsv', sep = '\t', encoding="latin-1")

In [76]:
data_test.text=data_test.text.astype(str)
df_text=data_test['text']

In [77]:
vocab_size = 4000000
# encode full sentence into vector
encoded_docs_test=[one_hot(d,vocab_size) for d in df_text]
print (encoded_docs_test)

[[3890726, 1689115, 1889232, 3386187, 3037361, 3916376, 552039, 208706, 535390, 639269, 346446, 3835657, 1302802, 560197, 1181521, 3563073, 2322285, 983675, 134438, 711352, 2375708, 3056920], [2187008, 2670265, 229687, 1842694, 3522308, 1707644, 3051076, 208706, 231651, 399524, 1395403, 311660, 1466232, 3922097, 226379, 2737409, 2421770, 2955722, 2156347], [3887262, 3591028, 3887262, 3571613, 1510605, 243693, 3306886, 28335, 376712, 3617159, 1726061, 3380822], [827109, 336520, 285371, 1490199, 926333, 3238572, 2598569, 81912, 3795868, 376712, 3104680, 1005718, 243693, 3141436, 1468833, 1137051, 2101607, 194879, 532178, 2814987, 125859], [2016915, 3887262, 2079867, 1864705, 208706, 376712, 3474313, 2299270, 3887262, 2062110, 2953808, 2953041, 2289980, 1780442, 1125741, 3719322, 1492170, 3982656, 607205, 639269], [2305281, 2671813, 1954284, 2678758, 2439620, 3487617, 208706, 3191103, 2638030, 1451898, 707934, 198823, 33119, 3719322, 446793, 3835657, 285371, 3297915, 3922097, 2793913, 689

In [78]:
from keras.preprocessing.sequence import pad_sequences
# pad documents to a max length of 4 words
max_length = 100
padded_docs_test = pad_sequences(encoded_docs_test, maxlen=max_length, padding='post')
print(padded_docs_test)

[[3890726 1689115 1889232 ...       0       0       0]
 [2187008 2670265  229687 ...       0       0       0]
 [3887262 3591028 3887262 ...       0       0       0]
 ...
 [2236561  560197  886026 ...       0       0       0]
 [ 376712 3478592 2511558 ...       0       0       0]
 [  33337 3010654  208706 ...       0       0       0]]


In [84]:
w_test=loaded_model.predict_classes(padded_docs_test)

In [86]:
print(len(w_test)-np.count_nonzero(w_test))
np.count_nonzero(w_test)

927


226

In [87]:
output = pd.DataFrame()
#out_result['text_id'] = 

In [88]:
output['text_id'] = data_test['text_id']

In [89]:
ans_dict = {0:'NOT',1:'HOF'}
w2 = np.vectorize(ans_dict.get)(w_test)
output['result'] = w2

In [90]:
output.to_csv('new_Kirti Kumari_English_task_1_run_1.tsv',header=True,sep='\t',index=False)