In [14]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import accuracy_score
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, TensorDataset

In [2]:
# Load the training split from a path relative to the notebook's working directory.
# Later cells read column 0 as the text to vectorise and column 1 as the label.
train_df = pd.read_json('Data/is_train.json')
train_df.shape  # displayed as (rows, columns)

(15000, 2)

In [3]:
# Load the held-out test split; it is indexed the same way as train_df below
# (column 0 = text, column 1 = label).
test_df = pd.read_json('Data/is_test.json')
test_df.shape  # displayed as (rows, columns)

(4500, 2)

In [4]:
# Labels to exclude: any row whose label (column 1) appears in this list is
# dropped from both the training and the test frame before modelling.
unwanted_labels = [
    'order_status', 'account_blocked', 'what_song',
    'international_fees', 'last_maintenance', 'taxes',
    'min_payment', 'pin_change', 'accept_reservations',
    'how_busy', 'bill_due', 'damaged_card',
    'do_you_have_pets', 'gas_type', 'plug_type',
    'tire_change', 'who_do_you_work_for', 'credit_limit',
    'international_visa', 'transfer', 'gas',
    'expiration_date', 'how_old_are_you', 'car_rental',
    'jump_start', 'redeem_rewards', 'pto_balance',
    'direct_deposit', 'credit_limit_change', 'bill_balance',
    'w2', 'where_are_you_from', 'what_can_i_ask_you',
    'maybe', 'oil_change_how', 'balance',
    'confirm_reservation', 'freeze_account', 'rollover_401k',
    'transactions', 'insurance_change', 'travel_alert',
    'pto_request', 'improve_credit_score', 'change_language',
    'payday', 'replacement_card_duration', 'application_status',
    'flight_status', 'rewards_balance', 'pay_bill',
    'spending_history', 'pto_request_status', 'carry_on',
    'pto_used', 'schedule_maintenance', 'travel_notification',
    'sync_device', 'report_lost_card', 'yes',
    'credit_score', 'new_card', 'lost_luggage',
    'mpg', 'oil_change_when', 'apr',
    'change_speed', 'tire_pressure', 'card_declined',
]

In [5]:
# Keep only the training rows whose label (column 1) is NOT in unwanted_labels;
# `~` negates the isin() mask.
train_filtred = train_df[~train_df[1].isin(unwanted_labels)]
train_filtred.shape  # displayed as (rows, columns) after filtering

(8100, 2)

In [6]:
# Apply the same label filter to the test split so both splits share one label set.
test_filtred = test_df[~test_df[1].isin(unwanted_labels)]
test_filtred.shape  # displayed as (rows, columns) after filtering

(2430, 2)

In [172]:
# Fit TF-IDF on the training texts only (column 0); the fitted `vectorizer`
# is reused later to transform the test texts and is saved to disk.
corpus = train_filtred[0]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()  # display the learned vocabulary terms

array(['00', '000', '005', ..., 'zombie', 'zone', 'zoo'], dtype=object)

In [173]:
# Map the label values in column 1 to integer class ids; the fitted encoder is
# reused for the test labels and saved to disk at the end of the notebook.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(train_filtred[1])

In [174]:
# Sanity check on the TF-IDF matrix: (training rows, vocabulary terms).
X.shape

(8100, 3809)

In [175]:
# Convert features and labels to tensors. `.toarray()` materialises the full
# matrix in memory before it is copied into a float32 tensor.
X_tensor = torch.tensor(X.toarray(), dtype=torch.float32)
# Labels are stored as int64 (torch.long) for use with the CrossEntropyLoss defined below.
y_tensor = torch.tensor(y_encoded, dtype=torch.long)

In [176]:
class TextClassificationModel(nn.Module):
    """Feed-forward classifier with two hidden ReLU layers and dropout.

    The network is Linear -> ReLU -> Dropout -> Linear -> ReLU -> Dropout -> Linear.
    ``forward`` returns the raw output of the last linear layer (no softmax),
    which is what ``nn.CrossEntropyLoss`` expects.

    The layer attribute names (``fc1``, ``fc2``, ``fc3``) determine the keys of
    ``state_dict()``; they are kept stable so previously saved weights still load.

    Args:
        input_dim: Size of each input feature vector.
        output_dim: Number of scores produced per example (one per class).
        hidden1_dim: Width of the first hidden layer. Defaults to 128.
        hidden2_dim: Width of the second hidden layer. Defaults to 64.
        dropout_p: Dropout probability applied after each hidden activation.
            Defaults to 0.3.
    """

    def __init__(self, input_dim, output_dim, hidden1_dim=128, hidden2_dim=64, dropout_p=0.3):
        super().__init__()
        self.fc1 = nn.Linear(input_dim, hidden1_dim)
        self.relu = nn.ReLU()
        # A single Dropout module is shared by both hidden layers; it holds no
        # parameters, so sharing it does not tie any weights.
        self.dropout = nn.Dropout(dropout_p)
        self.fc2 = nn.Linear(hidden1_dim, hidden2_dim)
        self.fc3 = nn.Linear(hidden2_dim, output_dim)

    def forward(self, x):
        """Return the output-layer scores for a batch ``x`` whose last dimension is ``input_dim``."""
        x = self.dropout(self.relu(self.fc1(x)))
        x = self.dropout(self.relu(self.fc2(x)))
        return self.fc3(x)

In [177]:
# Pair each feature row with its label. Note that `Dataset` here is a plain
# variable holding a TensorDataset instance, not a class.
Dataset = TensorDataset(X_tensor, y_tensor)
# Mini-batches of 32 examples; shuffle=True draws a new order on every pass.
train_loader = DataLoader(Dataset, batch_size=32, shuffle=True)

In [178]:
# Network sizes derived from the data: one input per feature column and
# one output per distinct label id present in the training labels.
input_dim = X_tensor.shape[1]
output_dim = len(torch.unique(y_tensor))

In [179]:
# Display the feature dimension that the model's first linear layer will receive.
input_dim

3809

In [180]:
# Instantiate the network, the loss and the optimiser used by the training loop.
# NOTE(review): no random seed is set anywhere visible in this notebook, so weight
# initialisation, dropout and batch order will presumably differ between runs — confirm
# whether reproducibility matters here.
model = TextClassificationModel(input_dim, output_dim)
# CrossEntropyLoss takes the raw scores returned by the model plus integer class ids.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001)

In [181]:
# Train for 100 full passes over train_loader, reporting the mean batch loss per pass.
for epoch in range(100):
    # Switch to training mode so the dropout layers are active.
    model.train()
    train_loss = 0
    for features, targets in train_loader:
        # Clear gradients left over from the previous step before the new backward pass.
        optimizer.zero_grad()
        batch_loss = criterion(model(features), targets)
        batch_loss.backward()
        optimizer.step()

        # .item() extracts a Python float so the running total holds no autograd graph.
        train_loss += batch_loss.item()

    print(f"Epoch {epoch+1}, Train Loss: {train_loss/len(train_loader):.4f}")

Epoch 1, Train Loss: 4.3948
Epoch 2, Train Loss: 4.3802
Epoch 3, Train Loss: 4.3484
Epoch 4, Train Loss: 4.2880
Epoch 5, Train Loss: 4.1793
Epoch 6, Train Loss: 4.0168
Epoch 7, Train Loss: 3.7883
Epoch 8, Train Loss: 3.5131
Epoch 9, Train Loss: 3.2152
Epoch 10, Train Loss: 2.9357
Epoch 11, Train Loss: 2.6701
Epoch 12, Train Loss: 2.4337
Epoch 13, Train Loss: 2.2162
Epoch 14, Train Loss: 2.0416
Epoch 15, Train Loss: 1.8948
Epoch 16, Train Loss: 1.7473
Epoch 17, Train Loss: 1.6341
Epoch 18, Train Loss: 1.5251
Epoch 19, Train Loss: 1.4150
Epoch 20, Train Loss: 1.3323
Epoch 21, Train Loss: 1.2517
Epoch 22, Train Loss: 1.1835
Epoch 23, Train Loss: 1.1103
Epoch 24, Train Loss: 1.0426
Epoch 25, Train Loss: 0.9970
Epoch 26, Train Loss: 0.9493
Epoch 27, Train Loss: 0.8916
Epoch 28, Train Loss: 0.8519
Epoch 29, Train Loss: 0.8109
Epoch 30, Train Loss: 0.7700
Epoch 31, Train Loss: 0.7343
Epoch 32, Train Loss: 0.7073
Epoch 33, Train Loss: 0.6773
Epoch 34, Train Loss: 0.6301
Epoch 35, Train Loss: 0

In [182]:
# Scratch check: transform the test texts with the already-fitted vectorizer and
# confirm the column count matches the training feature matrix.
idk = vectorizer.transform(test_filtred[0])
idk.shape

(2430, 3809)

In [183]:
# Build the evaluation tensors with the vectorizer and label encoder that were
# fitted on the training data (transform only — nothing is re-fitted on the test split).
X_test = torch.tensor(vectorizer.transform(test_filtred[0]).toarray(), dtype=torch.float32)
# Explicit int64 labels, consistent with how the training labels (y_tensor) are built,
# instead of inheriting whatever integer dtype the encoder returns.
y_test = torch.tensor(label_encoder.transform(test_filtred[1]), dtype=torch.long)

test_dataset = TensorDataset(X_test, y_test)
# Accuracy does not depend on batch order, so the evaluation loader is not shuffled;
# this keeps the evaluation pass deterministic.
test_loader = DataLoader(test_dataset, batch_size=32, shuffle=False)

In [184]:
# Measure accuracy of the trained model on test_loader.
correct, total = 0, 0
# eval() disables dropout; no_grad() skips gradient tracking during inference.
model.eval()
with torch.no_grad():
    for X_batch, y_batch in test_loader:
        outputs = model(X_batch)
        # Index of the highest score per row is the predicted class id.
        # argmax is used instead of `_, predicted = torch.max(outputs, 1)` because
        # assigning to `_` in a notebook shadows IPython's last-output variable.
        predicted = outputs.argmax(dim=1)
        total += y_batch.size(0)
        correct += (predicted == y_batch).sum().item()

print(f"Validation Accuracy: {100 * correct / total:.2f}%")

Validation Accuracy: 90.70%


In [41]:
# model = make_pipeline(TfidfVectorizer(), MultinomialNB())

In [42]:
# model.fit(train_filtred[0], train_filtred[1])

In [43]:
# predictions = model.predict(test_filtred[0])

In [44]:
# print(accuracy_score(test_filtred[1], predictions))

0.8950617283950617


In [45]:
# new_test = "Can you play some music on spotify"
# new_pred = model.predict([new_test])

In [47]:
# new_pred[0]

'play_music'

In [186]:
import joblib

In [51]:
# Save the Naive Bayes baseline under the file name that describes it.
# When the notebook is run top to bottom, `model` is the PyTorch network by this
# point, so dumping `model` here would pickle the neural net into the Naive Bayes
# file. Build and fit the TF-IDF + MultinomialNB pipeline under its own name instead.
nb_pipeline = make_pipeline(TfidfVectorizer(), MultinomialNB())
nb_pipeline.fit(train_filtred[0], train_filtred[1])
joblib.dump(nb_pipeline, 'naive_bayes_txt_class_pipeline.pkl')

['navie_bayes_txt_class_pipeline.pkl']

In [187]:
# Persist the fitted TF-IDF vectorizer; the network's input layer is sized to the
# vocabulary it learned, so the two must be loaded together at inference time.
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [188]:
# Save only the parameters (state_dict), not the module object: loading requires
# re-creating TextClassificationModel with matching layer sizes first.
torch.save(model.state_dict(), 'nn_text_class.pth')

In [189]:
# Persist the fitted label encoder so predicted class ids can be mapped back to
# the original label values.
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']