![servicedesk](servicedesk.png)

CleverSupport is a company at the forefront of AI innovation, specializing in the development of AI-driven solutions to enhance customer support services. Their latest endeavor is to engineer a text classification system that can automatically categorize customer complaints. 

Your role as a data scientist involves the creation of a sophisticated machine learning model that can accurately assign complaints to specific categories, such as mortgage, credit card, money transfers, debt collection, etc.

In [133]:
from collections import Counter
import nltk, json
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import TensorDataset, DataLoader
from torchmetrics import Accuracy, Precision, Recall

In [134]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /home/repl/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [135]:
# Import data and labels
with open("words.json", 'r') as f1:
    words = json.load(f1)
with open("text.json", 'r') as f2:
    text = json.load(f2)
labels = np.load('labels.npy')

In [136]:
# Dictionaries to store the word to index mappings and vice versa
word2idx = {o:i for i,o in enumerate(words)}
idx2word = {i:o for i,o in enumerate(words)}

# Looking up the mapping dictionary and assigning the index to the respective words
for i, sentence in enumerate(text):
    text[i] = [word2idx[word] if word in word2idx else 0 for word in sentence]
    
# Defining a function that either shortens sentences or pads sentences with 0 to a fixed length
def pad_input(sentences, seq_len):
    features = np.zeros((len(sentences), seq_len),dtype=int)
    for ii, review in enumerate(sentences):
        if len(review) != 0:
            features[ii, -len(review):] = np.array(review)[:seq_len]
    return features

text = pad_input(text, 50)

In [137]:
# Splitting dataset
train_text, test_text, train_label, test_label = train_test_split(text, labels, test_size=0.2, random_state=42)

train_data = TensorDataset(torch.from_numpy(train_text), torch.from_numpy(train_label).long())
test_data = TensorDataset(torch.from_numpy(test_text), torch.from_numpy(test_label).long())

In [138]:
train_loader = DataLoader(train_data, batch_size=400, shuffle=True)
test_loader = DataLoader(test_data, batch_size=400, shuffle=False)

In [139]:
len(set(labels))

5

In [140]:
class TicketClassifier(nn.Module):
    def __init__(self, vocab_size, embed_dim, target_size):
        super(TicketClassifier, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv = nn.Conv1d(embed_dim, embed_dim, kernel_size=3, padding=1, stride=1)
        self.fc = nn.Linear(embed_dim, target_size)

    def forward(self, text):
        embed = self.embedding(text).permute(0,2,1)
        conved = F.relu(self.conv(embed))
        conved = conved.mean(dim=2)
        return self.fc(conved)

In [141]:
model = TicketClassifier(len(word2idx) + 1, 64, len(set(labels)))
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.05)

In [142]:
epochs = 3

model.train()
for epoch in range(epochs):
    running_loss, num_processed = 0, 0
    for text, label in train_loader:
        model.zero_grad()
        output = model(text)
        loss = criterion(output, label)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()
        num_processed += len(text)
    print(f"Epoch: {epoch+1}, Loss: {running_loss/num_processed}")

Epoch: 1, Loss: 0.003693235069513321
Epoch: 2, Loss: 0.0016779607981443406
Epoch: 3, Loss: 0.0007226967178285122


In [143]:
acc = Accuracy(task="multiclass", num_classes=5)
pre = Precision(task="multiclass", num_classes=5, average=None)
rec = Recall(task="multiclass", num_classes=5, average=None)

In [144]:
model.eval()
predicted= []

for _, (text, label) in enumerate(test_loader):
    output = model(text)
    cat = torch.argmax(output, dim=-1)
    predicted.extend(cat.tolist())
    acc(cat, label)
    pre(cat, label)
    rec(cat, label)

In [145]:
accuracy = acc.compute().item()
precision = pre.compute().tolist()
recall = rec.compute().tolist()

In [146]:
print("Accuracy(per class):", accuracy, "\nPrecision(per class):", precision, "\nRecall(per class):", recall)

Accuracy(per class): 0.7910000085830688 
Precision(per class): [0.65887850522995, 0.8157894611358643, 0.7599999904632568, 0.8787878751754761, 0.8721461296081543] 
Recall(per class): [0.734375, 0.6526315808296204, 0.8796296119689941, 0.7552083134651184, 0.9095237851142883]
