In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import pandas as pd
import numpy as np
import re

from sklearn.preprocessing import LabelEncoder

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Dropout, Embedding, Flatten, GlobalAveragePooling1D, LSTM, Bidirectional, Conv1D, MaxPooling1D
import tensorflow as tf

from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score, confusion_matrix, classification_report

In [None]:
'''embedding_vectors = {}
f = open("E:\zz\INT\piblitz\summarizer\glove.6B\glove.6B.100d.txt", encoding="utf8")
for line in f:
    value = line.split(" ")
    word = value[0]
    coef = np.array(value[1:], dtype='float32')
    embedding_vectors[word] = coef
print("Total word vectors: ", len(embedding_vectors))

Total word vectors:  400001


In [3]:
df_tr = pd.read_csv('/content/drive/My Drive/dataset/kannada_sentiment_full_train.tsv', sep='\t')
print(df_tr.shape)
df_tr.head()

(6212, 2)


Unnamed: 0,text,category
0,ಒಂದು ದೇಶದ ಮುಂದುವರಿಯುವುದು ಅದರ ಆರ್ಥಿಕ ಸ್ಥಿತಿಯನ್ನ...,Negative
1,ಕನ್ನಡದಲ್ಲಿ ಡೈಲಿ ಟೆಕ್ ಅಪ್ಡೇಟ್ಸ್ ಪಡೆಯಲು ಸಬ್ಸ್ಕ್ರ...,Positive
2,Super sar song,not-Kannada
3,Tiktokers present situation... nನೋಡುವವರು ಯಾರು ...,Negative
4,Super ಸಾಂಗ್ ವೆರಿ ನೈಸ್....,Positive


In [4]:
print(df_tr['category'].value_counts())
df_tr['category'].unique()

Positive          2823
Negative          1188
not-Kannada        916
unknown state      711
Mixed feelings     574
Name: category, dtype: int64


array(['Negative', 'Positive', 'not-Kannada', 'Mixed feelings',
       'unknown state'], dtype=object)

In [5]:
categories = pd.get_dummies(df_tr.category)

In [6]:
df_tr = pd.concat([df_tr, categories], axis=1)
df_tr.head()

Unnamed: 0,text,category,Mixed feelings,Negative,Positive,not-Kannada,unknown state
0,ಒಂದು ದೇಶದ ಮುಂದುವರಿಯುವುದು ಅದರ ಆರ್ಥಿಕ ಸ್ಥಿತಿಯನ್ನ...,Negative,0,1,0,0,0
1,ಕನ್ನಡದಲ್ಲಿ ಡೈಲಿ ಟೆಕ್ ಅಪ್ಡೇಟ್ಸ್ ಪಡೆಯಲು ಸಬ್ಸ್ಕ್ರ...,Positive,0,0,1,0,0
2,Super sar song,not-Kannada,0,0,0,1,0
3,Tiktokers present situation... nನೋಡುವವರು ಯಾರು ...,Negative,0,1,0,0,0
4,Super ಸಾಂಗ್ ವೆರಿ ನೈಸ್....,Positive,0,0,1,0,0


In [7]:
def demoji(text):
	emoji_pattern = re.compile("["
		u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002702-\U000027B0"
        u"\U000024C2-\U0001F251"
        u"\U00010000-\U0010ffff"
	"]+", flags=re.UNICODE)
	return(emoji_pattern.sub(r'', text))

In [8]:
df_tr['text'] = df_tr['text'].apply(lambda x: demoji(x))
df_tr['text'] = df_tr['text'].str.replace('\d+', '')
df_tr.head(15)

Unnamed: 0,text,category,Mixed feelings,Negative,Positive,not-Kannada,unknown state
0,ಒಂದು ದೇಶದ ಮುಂದುವರಿಯುವುದು ಅದರ ಆರ್ಥಿಕ ಸ್ಥಿತಿಯನ್ನ...,Negative,0,1,0,0,0
1,ಕನ್ನಡದಲ್ಲಿ ಡೈಲಿ ಟೆಕ್ ಅಪ್ಡೇಟ್ಸ್ ಪಡೆಯಲು ಸಬ್ಸ್ಕ್ರ...,Positive,0,0,1,0,0
2,Super sar song,not-Kannada,0,0,0,1,0
3,Tiktokers present situation... nನೋಡುವವರು ಯಾರು ...,Negative,0,1,0,0,0
4,Super ಸಾಂಗ್ ವೆರಿ ನೈಸ್....,Positive,0,0,1,0,0
5,Varshakke thagadu movie madi industry haal ma...,Negative,0,1,0,0,0
6,Tickets amount adru mosa illa ... Love you all,Positive,0,0,1,0,0
7,Super super super film I can't explain,Positive,0,0,1,0,0
8,@Wild Rex ಕಟ್ಟಬೇಕು bronಖಂಡಿತಾ ಕಟ್ಟುತ್ತೆ bro,Mixed feelings,1,0,0,0,0
9,shankaragouda desaigoudra super,Positive,0,0,1,0,0


In [9]:
X_train, y_train = df_tr['text'], df_tr[['Mixed feelings', 'Negative', 'Positive', 'not-Kannada', 'unknown state']]

In [10]:
df_tt = pd.read_csv('/content/drive/My Drive/dataset/kannada_sentiment_full_dev.tsv', sep='\t')
print(df_tt.shape)
df_tt.head()

(691, 2)


Unnamed: 0,text,category
0,Binduge saryagi ugithidira good go ahead we a...,Mixed feelings
1,yen song guru ...super,Positive
2,my fevorat story,not-Kannada
3,Super ತೋಗರಿ ತೀಪ್ಪ,Positive
4,ನಿಮ್ಮ ಮಾತುಗಳು ಅಕ್ಷರಶಃ ಸತ್ಯ... ನಿಮ್ಮ ಈ ಸಾಮಾನ್ಯ ...,Positive


In [11]:
df_tt['category'].value_counts()

Positive          321
Negative          139
not-Kannada       110
unknown state      69
Mixed feelings     52
Name: category, dtype: int64

In [12]:
categories = pd.get_dummies(df_tt.category)
df_tt = pd.concat([df_tt, categories], axis=1)
df_tt.head()

Unnamed: 0,text,category,Mixed feelings,Negative,Positive,not-Kannada,unknown state
0,Binduge saryagi ugithidira good go ahead we a...,Mixed feelings,1,0,0,0,0
1,yen song guru ...super,Positive,0,0,1,0,0
2,my fevorat story,not-Kannada,0,0,0,1,0
3,Super ತೋಗರಿ ತೀಪ್ಪ,Positive,0,0,1,0,0
4,ನಿಮ್ಮ ಮಾತುಗಳು ಅಕ್ಷರಶಃ ಸತ್ಯ... ನಿಮ್ಮ ಈ ಸಾಮಾನ್ಯ ...,Positive,0,0,1,0,0


In [13]:
df_tt['text'] = df_tt['text'].apply(lambda x: demoji(x))
df_tt['text'] = df_tt['text'].str.replace('\d+', '')

In [14]:
X_test, y_test = df_tt['text'], df_tt[['Mixed feelings', 'Negative', 'Positive', 'not-Kannada', 'unknown state']]

In [15]:
max_len = 50
oov_tok = "<OOV>"

tk = Tokenizer(oov_token=oov_tok)
tk.fit_on_texts(X_train)

In [16]:
seq_tr = tk.texts_to_sequences(X_train)
pad_seq_tr = pad_sequences(seq_tr, maxlen=max_len, padding='post', truncating='post')
vocab_size = len(tk.word_index) + 1

seq_tt = tk.texts_to_sequences(X_test)
pad_seq_tt = pad_sequences(seq_tt, maxlen=max_len, padding='post', truncating='post')

In [17]:
embedding_dim = 256
n_lstm = 100
drop_value = 0.2
n_dense = 24

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len, trainable=True))
model.add(Bidirectional(LSTM(units=n_lstm, dropout=drop_value, return_sequences=True)))
model.add(Conv1D(filters=64, kernel_size=10, activation='relu'))
# model.add(Dropout(drop_value))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(64, activation='relu'))
# model.add(Dropout(drop_value))
# model.add(Dense(32, activation='relu'))
model.add(Dropout(drop_value))
# model.add(Flatten())
# model.add(Dense(5, activation='softmax'))
model.add(Dense(5, activation='softmax'))

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 50, 256)           3997184   
_________________________________________________________________
bidirectional (Bidirectional (None, 50, 200)           285600    
_________________________________________________________________
conv1d (Conv1D)              (None, 41, 64)            128064    
_________________________________________________________________
max_pooling1d (MaxPooling1D) (None, 20, 64)            0         
_________________________________________________________________
flatten (Flatten)            (None, 1280)              0         
_________________________________________________________________
dense (Dense)                (None, 64)                81984     
_________________________________________________________________
dropout (Dropout)            (None, 64)                0

In [18]:
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

cb = tf.keras.callbacks.EarlyStopping(monitor='val_accuracy', mode='max', min_delta=0.001, patience=5, restore_best_weights=True)
num_epochs = 15
history = model.fit(pad_seq_tr, y_train, epochs=num_epochs, validation_data=(pad_seq_tt, y_test), callbacks=[cb])

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15


In [19]:
model.evaluate(pad_seq_tt, y_test, verbose=0)

[1.0221061706542969, 0.6309695839881897]

In [20]:
len(tk.word_index)

15613

In [None]:
'''embedding_vectors = {}
f = open("E:\zz\INT\piblitz\summarizer\glove.6B\glove.6B.100d.txt", encoding="utf8")
for line in f:
    value = line.split(" ")
    word = value[0]
    coef = np.array(value[1:], dtype='float32')
    embedding_vectors[word] = coef
print("Total word vectors: ", len(embedding_vectors))

Total word vectors:  400001


In [None]:
'''# creating a matrix only of the words present in our corpus and their vectors
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tk.word_index.items():
    embedding_value = embedding_vectors.get(word)
    if embedding_value is not None:
        embedding_matrix[i] = embedding_value

In [None]:
'''filter_size = 32
kernel_size = 10
drop_value = 0.2
n_dense = 32

model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=100, weights=[embedding_matrix], input_length=max_len, trainable=False))
model.add(Bidirectional(LSTM(units=100, dropout=drop_value, return_sequences=True)))
# model.add(Dropout(drop_value))
model.add(GlobalAveragePooling1D())
model.add(Flatten())
model.add(Dense(128, activation='relu'))
model.add(Dropout(drop_value))
# model.add(Dense(64, activation='relu'))
# model.add(Dropout(drop_value))
model.add(Dense(32, activation='relu'))
model.add(Dropout(drop_value))
model.add(Dense(5, activation='softmax'))

model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           6674700   
_________________________________________________________________
bidirectional_1 (Bidirection (None, 50, 200)           160800    
_________________________________________________________________
global_average_pooling1d_1 ( (None, 200)               0         
_________________________________________________________________
flatten_1 (Flatten)          (None, 200)               0         
_________________________________________________________________
dense (Dense)                (None, 128)               25728     
_________________________________________________________________
dropout (Dropout)            (None, 128)               0         
_________________________________________________________________
dense_1 (Dense)              (None, 64)               

In [None]:
'''model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

num_epochs = 5
history = model.fit(pad_seq_tr, y_train, epochs=num_epochs, validation_data=(pad_seq_tt, y_test))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:
'''model.evaluate(pad_seq_tt, y_test, verbose=0)

[1.076276421546936, 0.6052498817443848]

In [22]:
y_pred = model.predict(pad_seq_tt)

f = f1_score(y_test, y_pred.round(), average='weighted')
print(f)
a = accuracy_score(y_test, y_pred.round())
print(a)


0.55226773051479
0.5180897250361794


In [24]:
cr = classification_report(y_test, y_pred.round())
print(cr)

              precision    recall  f1-score   support

           0       0.00      0.00      0.00        52
           1       0.68      0.37      0.48       139
           2       0.71      0.75      0.73       321
           3       0.75      0.52      0.61       110
           4       0.70      0.10      0.18        69

   micro avg       0.71      0.52      0.60       691
   macro avg       0.57      0.35      0.40       691
weighted avg       0.66      0.52      0.55       691
 samples avg       0.52      0.52      0.52       691



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [28]:
cm = confusion_matrix(y_test.values.argmax(axis=1), y_pred.argmax(axis=1))
print(cm)

[[  5  10  29   3   5]
 [  5  71  57   1   5]
 [  4  21 269  25   2]
 [  0   4  24  69  13]
 [  4   5  32   6  22]]
