In [None]:
# Install every third-party package the import cell needs. The original list
# omitted scikit-learn, tabulate and torchsummary, which are all imported.
# %pip installs into the environment of the running kernel.
%pip install matplotlib numpy torch pandas opencv-python scikit-learn tabulate torchsummary

In [None]:
from matplotlib import pyplot as plt
import numpy as np
import torch
import pandas as pd
import torch.nn as nn
from torch.nn import Sequential as seq
from torch.nn import Conv1d as c1d
from torch.nn import Linear as l_n
from torch.nn import ReLU as rel
from torch.nn import Dropout as drop
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.optim as optim
from torch.nn.functional import nll_loss as nll_loss
from torch.autograd import Variable
import cv2
from torch.nn import MaxPool1d as m1d
import os
from sklearn.metrics import f1_score,accuracy_score,precision_score,recall_score
import tabulate
from torchsummary import summary

In [None]:
class CharacterConvolutionNetwork(nn.Module):
    """Character-level 1-D convolutional classifier.

    Six ``Conv1d`` + ``ReLU`` stages (the first, second and sixth followed by
    ``MaxPool1d(3)``) feed three fully connected layers. ``forward`` returns
    the raw output of the last linear layer; no softmax is applied.

    Args:
        n_classes: Output width of the final linear layer.
        input_length: Number of character positions per sample.
        input_dim: Size of each per-character feature vector (conv input channels).

    Raises:
        ValueError: If ``input_length`` leaves no positions after the
            convolution / pooling stack.
    """

    def __init__(self, n_classes=4, input_length=1014, input_dim=68):
        super(CharacterConvolutionNetwork, self).__init__()
        self.ker_siz = [7, 7, 3, 3, 3, 3]

        # Attribute names and Sequential layouts are kept as-is so state_dict
        # keys (e.g. "conv1.0.weight", "fc1.0.weight") still match saved weights.
        self.conv1 = nn.Sequential(
            nn.Conv1d(input_dim, 256, kernel_size=self.ker_siz[0]), nn.ReLU(), nn.MaxPool1d(3))
        self.conv2 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=self.ker_siz[1], padding=0), nn.ReLU(), nn.MaxPool1d(3))
        self.conv3 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=self.ker_siz[2], padding=0), nn.ReLU())
        self.conv4 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=self.ker_siz[3], padding=0), nn.ReLU())
        self.conv5 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=self.ker_siz[4], padding=0), nn.ReLU())
        self.conv6 = nn.Sequential(
            nn.Conv1d(256, 256, kernel_size=self.ker_siz[5], padding=0), nn.ReLU(), nn.MaxPool1d(3))

        # Positions left after the stack: the three floor-dividing pools and the
        # unpadded convolutions collapse to (input_length - 96) // 27. Integer
        # floor division keeps this exact for lengths that are not 96 + 27*k
        # (true division followed by int() produced a mismatching width there).
        positions = (input_length - 96) // 27
        if positions < 1:
            raise ValueError(
                "input_length=%d is too short for this architecture (need >= 123)" % input_length)
        dimension = positions * 256

        self.fc1 = nn.Sequential(nn.Linear(dimension, 1024), nn.Dropout(0.5))
        self.fc2 = nn.Sequential(nn.Linear(1024, 1024), nn.Dropout(0.5))
        self.fc3 = nn.Linear(1024, n_classes)
        self._weights(mean=0.0, std=0.05)

    def _weights(self, mean=0, std=0.05):
        """Re-draw every Conv1d / Linear weight from N(mean, std); biases are left untouched."""
        for module in self.modules():
            if isinstance(module, (nn.Conv1d, nn.Linear)):
                module.weight.data.normal_(mean, std)

    def forward(self, inp):
        """Return class scores for ``inp``.

        Dimensions 1 and 2 of ``inp`` are swapped before the first convolution,
        so the input is expected as (batch, input_length, input_dim).
        """
        y = inp.transpose(1, 2)
        for stage in (self.conv1, self.conv2, self.conv3,
                      self.conv4, self.conv5, self.conv6):
            y = stage(y)
        y = y.view(y.size(0), -1)  # flatten channels x positions per sample
        y = self.fc1(y)
        y = self.fc2(y)
        return self.fc3(y)

In [None]:
# Build the 4-class network with the default input size and print its layer layout.
characterConvolutionNetwork = CharacterConvolutionNetwork(n_classes=4)
print(characterConvolutionNetwork)

In [None]:
# Read the training CSV and select the class / title / desc columns.
data = pd.read_csv(r'./dataset/train.csv')
train_df = pd.DataFrame(data, columns=["class", "title", "desc"])
print(train_df)

In [None]:
# Read the test CSV and select the class / title / desc columns.
data = pd.read_csv(r'./dataset/test.csv')
test_df = pd.DataFrame(data, columns=["class", "title", "desc"])
print(test_df)

In [None]:
class TextDataset(Dataset):
    """One-hot character encoding of the ``desc`` column of a DataFrame.

    Each item is ``(data, label)`` where ``data`` is a float32 array of shape
    ``(max_length, len(vocabulary))`` holding one one-hot row per kept
    character, and ``label`` is the row's ``class`` value minus one.

    Args:
        df: Frame with a ``class`` column (values convertible to int) and a
            ``desc`` column of text.
        max_length: Number of character rows per sample; longer texts are
            truncated and shorter ones are zero-padded at the end.
    """

    def __init__(self,df, max_length=1014):
        strr="""abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"""
        self.vocabulary = list(strr)
        self.identity_mat = np.identity(len(self.vocabulary))

        # Character -> column lookup replaces two linear list scans per
        # character; the first occurrence wins, as with list.index().
        self._char_to_index = {}
        for position, char in enumerate(self.vocabulary):
            self._char_to_index.setdefault(char, position)

        self.labels = [int(i)-1 for i in df['class'].values]
        self.num_classes = len(set(self.labels))
        self.length = len(self.labels)
        self.texts = df['desc'].values
        self.max_length = max_length

    def __getitem__(self, index):
        """Return ``(one_hot_matrix, label)`` for row ``index``."""
        raw_text = self.texts[index]
        # A missing description (NaN / None from pandas) is not iterable as
        # text; encode it as an all-zero sample instead of raising.
        if not isinstance(raw_text, str):
            raw_text = ""

        data = np.zeros((self.max_length, len(self.vocabulary)), dtype=np.float32)
        row = 0
        for char in raw_text:
            if row >= self.max_length:
                break  # truncate to max_length kept characters
            column = self._char_to_index.get(char)
            if column is None:
                continue  # not in the vocabulary (e.g. uppercase, whitespace)
            data[row, column] = 1.0
            row += 1

        label = self.labels[index]
        return data, label

    def __len__(self):
        """Number of rows in the source frame."""
        return self.length
        

In [None]:
# Wrap the training frame in the character-level dataset (default max_length).
trainset = TextDataset(df=train_df)

In [None]:
# 90 / 10 split sizes for a 120000-row training set.
train_siz = int(0.9 * 120000)
valid_siz = 120000 - train_siz

In [None]:
# Shuffled mini-batches of 128; drop_last discards a final partial batch.
trainloader = DataLoader(
    trainset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    drop_last=True,
)

In [None]:
# Cross-entropy objective and SGD with momentum over all network parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    characterConvolutionNetwork.parameters(),
    lr=0.01,
    momentum=0.9,
)

In [None]:
# Number of full passes over the training loader.
epochs = 8

In [None]:
class Config:
    """Run-time switches for the notebook."""

    def __init__(self):
        # Path used when saving the trained weights.
        self.model_dict_path = './model_dict.pth'
        # When False the training loop is skipped and saved plots are shown instead.
        self.train = False

In [None]:
# Single configuration instance consulted by the cells below.
cnf = Config()

In [None]:
if cnf.train:
    # Dropout layers only act in training mode.
    characterConvolutionNetwork.train()
    for i in range(epochs):
        running_loss = 0.0
        n_batches = 0
        for j, (x, y) in enumerate(trainloader):
            optimizer.zero_grad()
            out = characterConvolutionNetwork(x)
            # The network returns raw scores (no log-softmax), so the loss must
            # be cross-entropy; nll_loss expects log-probabilities as input.
            loss = criterion(out, y)
            loss.backward()
            optimizer.step()
            running_loss += loss.item()
            n_batches += 1
            if j % 100 == 0:
                print("epoch", i, "batch", j, "loss", loss.item())
        if n_batches:
            print("epoch", i, "mean loss", running_loss / n_batches)
else:
    # Training is disabled: show the two saved plot images side by side.
    fig = plt.figure(figsize=(20, 12))
    for position, path in enumerate(('./plot1.png', './plot2.png'), start=1):
        image = cv2.imread(path)
        if image is None:
            # cv2.imread signals an unreadable / missing file by returning None.
            raise FileNotFoundError("could not read image: " + path)
        fig.add_subplot(1, 2, position)
        # OpenCV loads channels as BGR; matplotlib expects RGB.
        plt.imshow(cv2.cvtColor(image, cv2.COLOR_BGR2RGB))
        plt.axis('off')

In [None]:
# Persist the trained weights only when this run actually trained the model.
if cnf.train:
    weights = characterConvolutionNetwork.state_dict()
    torch.save(weights, cnf.model_dict_path)

In [None]:
# Fresh 4-class network restored from the configured checkpoint path (the same
# path the save cell writes to, instead of a second hard-coded literal).
# map_location keeps the load working on a CPU-only machine.
model = CharacterConvolutionNetwork(4)
model.load_state_dict(torch.load(cnf.model_dict_path, map_location='cpu'))
model.eval()

In [None]:
# Wrap the AG News test frame in the character-level dataset.
testset = TextDataset(df=test_df)

In [None]:
# Batches of 128 over the test set. drop_last=True discards a final partial
# batch, so those rows are never scored by the evaluation loop.
testloader = DataLoader(
    testset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    drop_last=True,
)
# Predicted and actual labels collected by the evaluation loop.
pr = []
ac = []

In [None]:
# Score the AG News test loader with `model`.
# `pr` / `ac` are reset here so re-running the cell does not append duplicates.
pr = []
ac = []
match = 0
evaluated = 0
with torch.no_grad():  # inference only; no autograd graph needed
    for x, y in testloader:
        out = model(x)
        # Index of the largest score per row (first index on ties).
        lab = torch.argmax(out, dim=1)
        match += int((lab == y).sum())
        # Store plain ints so the metric helpers and sklearn get comparable values.
        pr.extend(lab.tolist())
        ac.extend(y.tolist())
        evaluated += y.size(0)
# Divide by the number of samples actually scored: the loader may drop a final
# partial batch, so len(testset) would understate the accuracy.
if evaluated:
    print("accuracy", str(match * 100 / evaluated), "%")
else:
    print("accuracy: no samples evaluated")
        

In [None]:
# Number of correctly classified test samples.
print(match)

In [None]:
def sm(pred,test,i,case):
    """Count positions matching one confusion-matrix rule for class ``i``.

    Rules, exactly as implemented:
        'tp': pred == i and test == i
        'tn': pred != i and test != i
        'fp': pred != i and test == i
        'fn': pred == i and test != i

    NOTE(review): the 'fp' / 'fn' rules are the conventional ones only if
    ``pred`` holds the ground truth and ``test`` the predictions; confirm
    against the callers before renaming anything.

    Any other ``case`` yields 0. Only the first ``len(pred)`` positions are read.
    """
    n_items = len(pred)
    if case == 'tp':
        hit = lambda k: pred[k] == i and test[k] == i
    elif case == 'tn':
        hit = lambda k: pred[k] != i and test[k] != i
    elif case == 'fp':
        hit = lambda k: pred[k] != i and test[k] == i
    elif case == 'fn':
        hit = lambda k: pred[k] == i and test[k] != i
    else:
        return 0
    return sum(1 for k in range(n_items) if hit(k))

In [None]:
def findMet(pred,test,nclass):
    """Collect per-class 'tp', 'tn', 'fp' and 'fn' counts for classes 0..nclass-1.

    Returns four lists ``(tp, tn, fp, fn)``, each of length ``nclass``, where
    entry ``k`` is ``sm(pred, test, k, <case>)``.
    """
    tp, tn, fp, fn = [], [], [], []
    for label in range(nclass):
        tp.append(sm(pred, test, label, 'tp'))
        tn.append(sm(pred, test, label, 'tn'))
        fp.append(sm(pred, test, label, 'fp'))
        fn.append(sm(pred, test, label, 'fn'))
    return tp,tn,fp,fn

In [None]:
# `ac` holds the dataset labels and `pr` the model's predictions; they are
# passed as `pred` / `test` respectively.
tp, tn, fp, fn = findMet(pred=ac, test=pr, nclass=4)
print(sum(tp))
# Share of (tp + tn) over all four counts, summed across the classes.
acc = sum(tp) + sum(tn)
acc = acc / (sum(tp) + sum(tn) + sum(fp) + sum(fn))
print(acc)

In [None]:
# Overall accuracy plus per-class precision / recall / F1
# (average=None keeps one value per class).
ac_sc = accuracy_score(ac, pr)
pr_sc, rc_sc, f1_sc = (
    metric(ac, pr, average=None).tolist()
    for metric in (precision_score, recall_score, f1_score)
)
print("accuracy", 100 * ac_sc, "%")

In [None]:
# Per-class precision bar chart (classes numbered 1-4).
fig = plt.figure(figsize=(10, 5))
ax = fig.add_axes([0, 0, 1, 1])
classes = list(range(1, 5))
ax.bar(classes, pr_sc, width=0.5)
ax.set_xlabel('CLASS')
ax.set_ylabel('Precision Score')
ax.set_title("Precision Score vs Class for AG's NEWS dataset")
plt.show()

In [None]:
# Per-class recall bar chart (classes numbered 1-4).
fig = plt.figure(figsize=(10, 5))
ax = fig.add_axes([0, 0, 1, 1])
classes = list(range(1, 5))
ax.bar(classes, rc_sc, width=0.5)
ax.set_xlabel('CLASS')
ax.set_ylabel('Recall Score')
ax.set_title("Recall vs Class for AG's NEWS dataset")
plt.show()

In [None]:
# Per-class F1 bar chart (classes numbered 1-4).
fig = plt.figure(figsize=(10, 5))
ax = fig.add_axes([0, 0, 1, 1])
classes = list(range(1, 5))
ax.bar(classes, f1_sc, width=0.5)
ax.set_xlabel('CLASS')
ax.set_ylabel('F1 Score')
ax.set_title("F1 Score vs Class for AG's NEWS dataset")
plt.show()

In [None]:
# Tabulate the per-class metrics. Rows are built as new lists rather than by
# inserting labels into pr_sc / rc_sc / f1_sc, so the metric lists stay purely
# numeric and this cell (and the plot cells) can be re-run safely.
heading = ['Type', 'Class 1', 'Class 2', 'Class 3', 'Class 4']
table = [
    heading,
    ['Precision', *pr_sc],
    ['Recall', *rc_sc],
    ['F1_score', *f1_sc],
]
print(tabulate.tabulate(table, headers='firstrow'))

In [None]:
# Read the DBPedia training CSV and select the class / title / desc columns.
# This rebinds `train_df`, replacing the frame loaded earlier.
data = pd.read_csv(r'./dbpedia/train.csv')
train_df = pd.DataFrame(data, columns=["class", "title", "desc"])
print(train_df)

In [None]:
# Read the DBPedia test CSV and select the class / title / desc columns.
data = pd.read_csv(r'./dbpedia/test.csv')
test_df1 = pd.DataFrame(data, columns=["class", "title", "desc"])
print(test_df1)

In [None]:
# Rebuild the evaluation set from the DBPedia test frame and restore the
# 14-class network from its saved weights.
testset = TextDataset(df=test_df1)
model1 = CharacterConvolutionNetwork(n_classes=14)
dbpedia_state = torch.load('./dbpedia_model_dict.pth')
model1.load_state_dict(dbpedia_state)
model1.eval()

In [None]:
# Batches of 128 over the DBPedia test set. drop_last=True discards a final
# partial batch, so those rows are never scored by the evaluation loop.
testloader = DataLoader(
    testset,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    drop_last=True,
)
# Predicted and actual labels collected by the evaluation loop.
pr1 = []
ac1 = []

In [None]:
# Score the DBPedia test loader with `model1`.
# `pr1` / `ac1` are reset here so re-running the cell does not append duplicates.
pr1 = []
ac1 = []
match = 0
evaluated = 0
with torch.no_grad():  # inference only; no autograd graph needed
    for x, y in testloader:
        out = model1(x)
        # Index of the largest score per row (first index on ties).
        lab = torch.argmax(out, dim=1)
        match += int((lab == y).sum())
        # Store plain ints so the metric helpers and sklearn get comparable values.
        pr1.extend(lab.tolist())
        ac1.extend(y.tolist())
        evaluated += y.size(0)
# Divide by the number of samples actually scored: the loader may drop a final
# partial batch, so len(testset) would understate the accuracy.
if evaluated:
    print("accuracy", str(match * 100 / evaluated), "%")
else:
    print("accuracy: no samples evaluated")
        

In [None]:
# `ac1` holds the dataset labels and `pr1` the model's predictions; they are
# passed as `pred` / `test` respectively.
tp, tn, fp, fn = findMet(pred=ac1, test=pr1, nclass=14)
# Share of (tp + tn) over all four counts, summed across the classes.
acc = sum(tp) + sum(tn)
acc = acc / (sum(tp) + sum(tn) + sum(fp) + sum(fn))
print(acc)

In [None]:
# Overall accuracy plus per-class precision / recall / F1 for DBPedia.
# accuracy_score yields a scalar, not an array: float() works whether the
# scalar is a NumPy float or a built-in float, whereas .tolist() only exists
# on the NumPy type.
ac_sc = float(accuracy_score(ac1, pr1))
pr_sc = precision_score(ac1, pr1, average=None).tolist()
rc_sc = recall_score(ac1, pr1, average=None).tolist()
f1_sc = f1_score(ac1, pr1, average=None).tolist()


In [None]:

# Per-class precision bar chart (classes numbered 1-14).
fig = plt.figure(figsize=(10, 5))
ax = fig.add_axes([0, 0, 1, 1])
classes = list(range(1, 15))
ax.bar(classes, pr_sc, width=0.5)
ax.set_xlabel('CLASS')
ax.set_ylabel('Precision Score')
ax.set_title('Precision Score vs Class for DBPedia ontology dataset')
plt.show()

In [None]:
# Per-class recall bar chart (classes numbered 1-14).
fig = plt.figure(figsize=(10, 5))
ax = fig.add_axes([0, 0, 1, 1])
classes = list(range(1, 15))
ax.bar(classes, rc_sc, width=0.4)
ax.set_xlabel('CLASS')
ax.set_ylabel('Recall')
ax.set_title('Recall vs Class for DBPedia ontology dataset')
plt.show()

In [None]:
# Per-class F1 bar chart (classes numbered 1-14).
fig = plt.figure(figsize=(10, 5))
ax = fig.add_axes([0, 0, 1, 1])
classes = list(range(1, 15))
ax.bar(classes, f1_sc, width=0.5)
ax.set_xlabel('CLASS')
ax.set_ylabel('F1 Score')
ax.set_title('F1 Score vs Class for DBPedia ontology dataset')
plt.show()

In [None]:
# Recompute the DBPedia metrics and print them as a table with one row per class.
# float() instead of .tolist(): accuracy_score yields a scalar, and .tolist()
# only exists when that scalar is a NumPy float.
ac_sc = float(accuracy_score(ac1, pr1))
pr_sc = precision_score(ac1, pr1, average=None).tolist()
rc_sc = recall_score(ac1, pr1, average=None).tolist()
f1_sc = f1_score(ac1, pr1, average=None).tolist()
print("accuracy", 100 * ac_sc, "%")

# Labelled rows are built as new lists so the metric lists stay purely numeric.
heading = ['Type'] + ['Class' + str(i) for i in range(1, 15)]
table = [
    heading,
    ['Precision', *pr_sc],
    ['Recall', *rc_sc],
    ['F1_score', *f1_sc],
]
# Transpose: the first output row is the header (Type, Precision, Recall,
# F1_score), followed by one row per class.
t1 = [list(column) for column in zip(*table)]
print(tabulate.tabulate(t1, headers='firstrow'))