### Converting the preprocessed text into vectors using fastText embeddings

In [0]:
from google.colab import drive
import numpy as np
import pandas as pd

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed in
# 0.23. Keep using it where it still exists (so previously saved artefacts load
# exactly as before) and fall back to the standalone package otherwise.
try:
    from sklearn.externals import joblib
except ImportError:
    import joblib

# Make the project folder on Google Drive reachable under /content/gdrive.
drive.mount('/content/gdrive')

# SECURITY NOTE: `pd.read_pickle` and `joblib.load` both unpickle their input,
# which can execute arbitrary code. Only load files from a trusted source.
bengali_news_after_preprocessing = pd.read_pickle('/content/gdrive/My Drive/Projects/Bengali Text Classification/Bengali_Text_after_preprocessing.pkl')

# Previously trained embedding model, serialized with joblib.
filename = '/content/gdrive/My Drive/Projects/Bengali Text Classification/fastText_Bangla_content_full.sav'
loaded_model = joblib.load(filename)

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [0]:
import keras.backend as K
import numpy as np

# Dimensions used below for the slice of the token table and, later in the
# notebook, for the shape of the feature tensor and the network input.
number_of_sample = 40000
max_number_of_words = 50
word_vector_size = 32

# `.loc` slices by label and includes the end label, hence the `- 1`.
last_row_label = number_of_sample - 1
last_column_label = max_number_of_words - 1
temp = bengali_news_after_preprocessing.loc[:last_row_label, :last_column_label]

In [0]:
# Replace a fixed list of tokens before the embedding lookup.
# NOTE(review): passing `None` as the replacement value is version-sensitive in
# pandas -- older releases presumably treat it as "no value given" and fill the
# match from the preceding row, while newer ones may insert None instead.
# Confirm against the pandas version this notebook is run with.
temp = temp.replace(['ঘস', 'ফগ', 'ঝবঃ', 'ঋন', 'ঊঘ', '\u09e4', 'ওৎ', 'গথ', 'খঢ', 'ঝ’', ' ং', 'ঔ', 'ডড', 'গঘ'], None)
# Zero-initialised feature tensor: one (max_number_of_words, word_vector_size)
# matrix per sample, stored with the Keras default float type.
X = np.zeros((number_of_sample, max_number_of_words, word_vector_size), dtype=K.floatx())
# `i` is an index label of `temp` and is also used as the row position in `X`;
# this assumes the labels are the integers 0..number_of_sample-1 -- TODO confirm.
for i in temp.index:
  # Presumably returns one vector per token of row `i` -- verify against the
  # type of `loaded_model`.
  X[i,:,:] = loaded_model.wv[temp.loc[i,:]]

### Preparing labels from the pickled article dataset

In [0]:
# Pickled article collection that carries the category label of every sample.
labels_pickle_path = '/content/gdrive/My Drive/Projects/Bengali Text Classification/40k_bangla_newspaper_article.p'
bengali_news = pd.read_pickle(labels_pickle_path)
bengali_news_dataframe = pd.DataFrame(data=bengali_news)

# Classification target: the `category` column of the article table.
y = bengali_news_dataframe.loc[:, 'category']

In [0]:
from sklearn import preprocessing
import keras
import numpy as np

# Encode from the original label column instead of from `y`: `y` is overwritten
# below with the one-hot matrix, so fitting the encoder on `y` would fail as
# soon as this cell is executed a second time.
category_labels = bengali_news_dataframe['category']

# String category -> integer class id (kept in `le` for decoding predictions).
le = preprocessing.LabelEncoder()
enc = le.fit_transform(category_labels)

# Integer class id -> one-hot row, as required by categorical cross-entropy.
y = keras.utils.to_categorical(enc)

### Train, Validation and Test Split

In [0]:
from sklearn.model_selection import train_test_split

# Fixed seed so the split, and therefore every reported metric, is reproducible
# across kernel restarts.
SPLIT_SEED = 42

# Step 1: hold out 12.5% of the samples for validation + test.
X_train, X_holdout, y_train, y_holdout = train_test_split(
    X, y, shuffle=True, test_size=0.125, random_state=SPLIT_SEED)

# Step 2: split the hold-out set 80/20 into validation and test. The hold-out
# set has its own names so `X_val`/`y_val` are not overwritten with a different
# meaning.
X_val, X_test, y_val, y_test = train_test_split(
    X_holdout, y_holdout, shuffle=True, test_size=0.20, random_state=SPLIT_SEED)

### Training CNN

In [0]:
from keras.models import Sequential
from keras.layers import Conv1D, Dropout, Dense, Flatten, LSTM, MaxPooling1D, Bidirectional
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, TensorBoard

# Width of the output layer is taken from the one-hot targets instead of being
# hard-coded, so the network keeps matching the data if the label set changes.
num_classes = y_train.shape[1]

model = Sequential()

# Feature extractor: three stacked 1-D convolutions over the token axis;
# `padding='same'` keeps the sequence length unchanged until the pooling layer.
model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same',
                 input_shape=(max_number_of_words, word_vector_size)))
model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same'))
model.add(Conv1D(32, kernel_size=3, activation='relu', padding='same'))
model.add(MaxPooling1D(pool_size=3))
model.add(Flatten())

# Classifier head: three sigmoid dense layers, each followed by dropout.
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.2))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.25))
model.add(Dense(256, activation='sigmoid'))
model.add(Dropout(0.25))

# One softmax unit per class.
model.add(Dense(num_classes, activation='softmax'))

model.compile(loss='categorical_crossentropy', optimizer=Adam(lr=0.0001, decay=1e-6), metrics=['accuracy'])

In [54]:
# Print the layer-by-layer architecture with output shapes and parameter counts.
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_4 (Conv1D)            (None, 50, 32)            3104      
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 50, 32)            3104      
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 50, 32)            3104      
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 16, 32)            0         
_________________________________________________________________
flatten_6 (Flatten)          (None, 512)               0         
_________________________________________________________________
dense_26 (Dense)             (None, 256)               131328    
_________________________________________________________________
dropout_10 (Dropout)         (None, 256)               0         
__________

In [55]:
# Fit the network; the returned object keeps the per-epoch metrics that are
# exported at the end of the notebook.
history = model.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=100,
    batch_size=500,
    shuffle=True,
)

Train on 35000 samples, validate on 4000 samples
Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/10

In [0]:
# Model outputs for the held-out test set (one row per sample; the final layer
# of the network is a softmax over the classes).
predicts = model.predict(X_test)
import numpy as np
def decode(le, one_hot):
    """Map one-hot or probability rows back to their original class labels.

    Args:
        le: Fitted encoder exposing ``inverse_transform``.
        one_hot: 2-D array-like with one row per sample and one column per
            class; the column holding the row maximum is taken as the class.

    Returns:
        The result of ``le.inverse_transform`` applied to the per-row argmax
        indices.
    """
    return le.inverse_transform(np.argmax(one_hot, axis=1))
# `y_test` is replaced in place by its decoded labels. Guard on the number of
# dimensions so re-running this cell does not try to argmax the already decoded
# 1-D label array along axis 1.
if np.ndim(y_test) == 2:
    y_test = decode(le, y_test)
y_preds = decode(le, predicts)

In [57]:
from sklearn import metrics

# Overall accuracy, confusion matrix and per-class report, printed in that order.
for evaluate in (metrics.accuracy_score,
                 metrics.confusion_matrix,
                 metrics.classification_report):
    print(evaluate(y_test, y_preds))

0.806
[[  0   0   0   0   0   1   0   1   0   8   0   0]
 [  0 245   0  11   2   2   5   1   0  33   1   2]
 [  0   2   0   0   0   1   0   0   0   2   0   0]
 [  0   5   0  93   0   1   2   0   0   9   0   2]
 [  0   1   0   1   7   1   0   0   0   3   0   0]
 [  0   6   0   0   0  55   1   0   0   1   1   0]
 [  0   5   0   0   0   2  27   0   0  10   1   1]
 [  0   1   0   2   0   2   0  11   0   5   1   2]
 [  0   0   0   0   0   0   2   0   0   3   0   0]
 [  0  19   0   7   0   2   5   4   0 239   1   2]
 [  0   0   0   0   0   1   2   0   0   1  78   0]
 [  0   2   0   2   0   0   0   1   0   2   0  51]]
                    precision    recall  f1-score   support

art-and-literature       0.00      0.00      0.00        10
        bangladesh       0.86      0.81      0.83       302
       durporobash       0.00      0.00      0.00         5
           economy       0.80      0.83      0.82       112
         education       0.78      0.54      0.64        13
     entertainment  

  'precision', 'predicted', average, warn_for)


In [0]:
# Export the per-epoch training and validation accuracy as a two-column CSV.
training_log = history.history

# Older Keras logs the metric as "acc"/"val_acc", newer releases as
# "accuracy"/"val_accuracy"; accept either.
acc_key = "acc" if "acc" in training_log else "accuracy"

# `reshape(-1, 1)` turns each curve into a column without hard-coding the
# number of epochs, so this also works when `epochs` changes or training
# stops early.
accuracy = np.array(training_log[acc_key]).reshape(-1, 1)
val_accuracy = np.array(training_log["val_" + acc_key]).reshape(-1, 1)

# Column 0: training accuracy, column 1: validation accuracy.
accuracies = np.concatenate((accuracy, val_accuracy), axis=1)
np.savetxt('/content/gdrive/My Drive/Projects/Bengali Text Classification/temp.csv',accuracies,delimiter=",")