In [29]:
from fastkan import FastKAN as KAN

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
import torch.optim as optim
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from tqdm import tqdm
tqdm.pandas()

from sklearn.model_selection import train_test_split

import torchtext
torchtext.disable_torchtext_deprecation_warning()

from torchtext.datasets import IMDB
from transformers import GPT2Tokenizer

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import re

# Download necessary NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kaloq\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kaloq\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kaloq\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
data = pd.read_csv('./imdb.csv')
data.head()

Unnamed: 0,review,sentiment
0,One of the other reviewers has mentioned that ...,positive
1,A wonderful little production. <br /><br />The...,positive
2,I thought this was a wonderful way to spend ti...,positive
3,Basically there's a family where a little boy ...,negative
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive


In [4]:
def transform_label(label):
    return 1 if label == 'positive' else 0

data['label'] = data['sentiment'].progress_apply(transform_label)
data.head()

100%|██████████| 50000/50000 [00:00<00:00, 860592.48it/s]


Unnamed: 0,review,sentiment,label
0,One of the other reviewers has mentioned that ...,positive,1
1,A wonderful little production. <br /><br />The...,positive,1
2,I thought this was a wonderful way to spend ti...,positive,1
3,Basically there's a family where a little boy ...,negative,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1


In [5]:
data.sentiment.value_counts()

sentiment
positive    25000
negative    25000
Name: count, dtype: int64

In [6]:
data['token_length'] = data.review.progress_apply(lambda x: len(x.split()))
data.head()

100%|██████████| 50000/50000 [00:00<00:00, 105655.19it/s]


Unnamed: 0,review,sentiment,label,token_length
0,One of the other reviewers has mentioned that ...,positive,1,307
1,A wonderful little production. <br /><br />The...,positive,1,162
2,I thought this was a wonderful way to spend ti...,positive,1,166
3,Basically there's a family where a little boy ...,negative,0,138
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,230


In [7]:
data_pos = data[data['label'] == 1]
data_pos['token_length'].describe()

count    25000.000000
mean       232.849320
std        177.497046
min         10.000000
25%        125.000000
50%        172.000000
75%        284.000000
max       2470.000000
Name: token_length, dtype: float64

In [8]:
data_neg = data[data['label'] == 0]
data_neg['token_length'].describe()

count    25000.000000
mean       229.464560
std        164.947795
min          4.000000
25%        128.000000
50%        174.000000
75%        278.000000
max       1522.000000
Name: token_length, dtype: float64

In [9]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Remove HTML tags
    text = re.sub(r'<.*?>', '', text)
    
    # Remove punctuation and special characters
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)

    # 
    words = word_tokenize(text)
    sent = [word for word in words if word not in stop_words]
    sent = ' '.join(sent)
    
    return sent

In [10]:
# Apply the preprocessing to the 'review' column (assuming the column name is 'review')
data['clean'] = data.review.progress_apply(preprocess_text)

# Display the first few rows of the dataframe with the cleaned reviews
data.head()

100%|██████████| 50000/50000 [00:23<00:00, 2110.62it/s]


Unnamed: 0,review,sentiment,label,token_length,clean
0,One of the other reviewers has mentioned that ...,positive,1,307,one reviewers mentioned watching 1 oz episode ...
1,A wonderful little production. <br /><br />The...,positive,1,162,wonderful little production filming technique ...
2,I thought this was a wonderful way to spend ti...,positive,1,166,thought wonderful way spend time hot summer we...
3,Basically there's a family where a little boy ...,negative,0,138,basically theres family little boy jake thinks...
4,"Petter Mattei's ""Love in the Time of Money"" is...",positive,1,230,petter matteis love time money visually stunni...


In [11]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')

In [17]:
tokenizer.pad_token = tokenizer.eos_token
max_l = 250
data['tokenized'] = data.clean.progress_apply(lambda x: tokenizer.encode(text=x, add_special_tokens=False, truncation=True, add_prefix_space=True, padding='max_length', max_length=max_l))

100%|██████████| 50000/50000 [00:32<00:00, 1559.37it/s]


In [13]:
data.head

<bound method NDFrame.head of                                                   review sentiment  label  \
0      One of the other reviewers has mentioned that ...  positive      1   
1      A wonderful little production. <br /><br />The...  positive      1   
2      I thought this was a wonderful way to spend ti...  positive      1   
3      Basically there's a family where a little boy ...  negative      0   
4      Petter Mattei's "Love in the Time of Money" is...  positive      1   
...                                                  ...       ...    ...   
49995  I thought this movie did a down right good job...  positive      1   
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative      0   
49997  I am a Catholic taught in parochial elementary...  negative      0   
49998  I'm going to have to disagree with the previou...  negative      0   
49999  No one expects the Star Trek movies to be high...  negative      0   

       token_length                          

In [18]:
data['ct_length'] = data.tokenized.progress_apply(lambda x: len(x))
data_pos = data[data['label'] == 1]
data_pos['ct_length'].describe()

100%|██████████| 50000/50000 [00:00<00:00, 1348512.05it/s]


count    25000.0
mean       250.0
std          0.0
min        250.0
25%        250.0
50%        250.0
75%        250.0
max        250.0
Name: ct_length, dtype: float64

In [63]:
from torch.utils.data import DataLoader, TensorDataset

X = data.tokenized
y = data.label

# Use train_test_split to split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=39)

# Convert to PyTorch tensors
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train.tolist()).to(device)
X_test_tensor = torch.tensor(X_test.tolist()).to(device)
y_train_tensor = torch.tensor(y_train.tolist()).to(device)
y_test_tensor = torch.tensor(y_test.tolist()).to(device)

# Create TensorDatasets
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

In [64]:
# Create DataLoaders
batch_size = 64
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [87]:
import torch.nn as nn
import torch.nn.functional as F
from fastkan import FastKAN as KAN

class SentimentCNNModel(nn.Module):
    def __init__(self, vocab_size, embed_dim, num_class):
        super(SentimentCNNModel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.conv1 = nn.Conv2d(1, 100, (1, embed_dim), padding=(0, 0))  # 1-gram
        self.conv2 = nn.Conv2d(1, 100, (3, embed_dim), padding=(1, 0))  # 3-gram
        self.conv3 = nn.Conv2d(1, 100, (5, embed_dim), padding=(2, 0))  # 5-gram
        self.dropout = nn.Dropout(0.35)
        # Replace dense layers with FastKAN
        self.kan = KAN([37500, 512, 128, num_class])

    def forward(self, text):
        embedded = self.embedding(text).unsqueeze(1)  # Add channel dimension
        
        conv1_out = F.relu(self.conv1(embedded)).squeeze(3)
        conv2_out = F.relu(self.conv2(embedded)).squeeze(3)
        conv3_out = F.relu(self.conv3(embedded)).squeeze(3)
        
        pooled1 = F.max_pool1d(conv1_out, 2).squeeze(2)
        pooled2 = F.max_pool1d(conv2_out, 2).squeeze(2)
        pooled3 = F.max_pool1d(conv3_out, 2).squeeze(2)

        
        cat = torch.cat((pooled1, pooled2, pooled3), 1)
        cat = self.dropout(cat)
        
        cat = cat.view(cat.size(0), -1)
        # Use FastKAN for final classification
        out = self.kan(cat)
        return out

# Instantiate the model
vocab_size = len(tokenizer)  # Vocabulary size of GPT-2 tokenizer
embed_dim = 128  # Embedding dimension
num_class = 2  # Number of classes (negative, positive)

model = SentimentCNNModel(vocab_size, embed_dim, num_class)
model.to(device)

SentimentCNNModel(
  (embedding): Embedding(50257, 128)
  (conv1): Conv2d(1, 100, kernel_size=(1, 128), stride=(1, 1))
  (conv2): Conv2d(1, 100, kernel_size=(3, 128), stride=(1, 1), padding=(1, 0))
  (conv3): Conv2d(1, 100, kernel_size=(5, 128), stride=(1, 1), padding=(2, 0))
  (dropout): Dropout(p=0.35, inplace=False)
  (kan): FastKAN(
    (layers): ModuleList(
      (0): FastKANLayer(
        (layernorm): LayerNorm((37500,), eps=1e-05, elementwise_affine=True)
        (rbf): RadialBasisFunction()
        (spline_linear): SplineLinear(in_features=300000, out_features=512, bias=False)
        (base_linear): Linear(in_features=37500, out_features=512, bias=True)
      )
      (1): FastKANLayer(
        (layernorm): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
        (rbf): RadialBasisFunction()
        (spline_linear): SplineLinear(in_features=4096, out_features=128, bias=False)
        (base_linear): Linear(in_features=512, out_features=128, bias=True)
      )
      (2): Fast

In [88]:
# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
#optimizer = optim.Adam(model.parameters(), lr=0.001)
optimizer = optim.AdamW(model.parameters(), lr=1e-3, weight_decay=1e-4)
# Define learning rate scheduler
scheduler = optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.8)

# Define the accuracy calculation
def calculate_accuracy(preds, labels):
    _, predicted = torch.max(preds, 1)
    correct = (predicted == labels).sum().item()
    return correct

# Training loop with loss and accuracy display
num_epochs = 20

for epoch in range(num_epochs):
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_predictions = 0

    with tqdm(train_loader, desc=f"Epoch {epoch+1}/{num_epochs}") as pbar:
        for texts, labels in pbar:
            optimizer.zero_grad()
            outputs = model(texts)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()
            
            total_loss += loss.item()
            correct_predictions += calculate_accuracy(outputs, labels)
            total_predictions += labels.size(0)
            
            accuracy = correct_predictions / total_predictions
            
            pbar.set_postfix(loss=total_loss/total_predictions, accuracy=accuracy)
    scheduler.step()

    print(f'Epoch {epoch+1}, Loss: {total_loss/len(train_loader)}, Accuracy: {accuracy}')


Epoch 1/20: 100%|██████████| 625/625 [00:35<00:00, 17.48it/s, accuracy=0.537, loss=0.0181]


Epoch 1, Loss: 1.161401811313629, Accuracy: 0.537275


Epoch 2/20: 100%|██████████| 625/625 [00:36<00:00, 17.08it/s, accuracy=0.774, loss=0.00738]


Epoch 2, Loss: 0.47252855350971223, Accuracy: 0.774175


Epoch 3/20: 100%|██████████| 625/625 [00:38<00:00, 16.45it/s, accuracy=0.879, loss=0.00447]


Epoch 3, Loss: 0.28603353217840194, Accuracy: 0.8787


Epoch 4/20: 100%|██████████| 625/625 [00:37<00:00, 16.81it/s, accuracy=0.925, loss=0.00287]


Epoch 4, Loss: 0.18366781608462335, Accuracy: 0.924975


Epoch 5/20: 100%|██████████| 625/625 [00:35<00:00, 17.43it/s, accuracy=0.952, loss=0.00192]


Epoch 5, Loss: 0.12297664624005557, Accuracy: 0.9522


Epoch 6/20: 100%|██████████| 625/625 [00:35<00:00, 17.45it/s, accuracy=0.968, loss=0.0013] 


Epoch 6, Loss: 0.08310348702147603, Accuracy: 0.9675


Epoch 7/20: 100%|██████████| 625/625 [00:36<00:00, 17.36it/s, accuracy=0.98, loss=0.000817] 


Epoch 7, Loss: 0.05228792381752283, Accuracy: 0.9803


Epoch 8/20: 100%|██████████| 625/625 [00:35<00:00, 17.44it/s, accuracy=0.985, loss=0.000627]


Epoch 8, Loss: 0.04015079543367028, Accuracy: 0.9852


Epoch 9/20: 100%|██████████| 625/625 [00:35<00:00, 17.40it/s, accuracy=0.991, loss=0.000412]


Epoch 9, Loss: 0.02639798095920123, Accuracy: 0.990825


Epoch 10/20: 100%|██████████| 625/625 [00:35<00:00, 17.42it/s, accuracy=0.993, loss=0.00033] 


Epoch 10, Loss: 0.021127590995328502, Accuracy: 0.9928


Epoch 11/20: 100%|██████████| 625/625 [00:35<00:00, 17.42it/s, accuracy=0.995, loss=0.000242]


Epoch 11, Loss: 0.015494658929202706, Accuracy: 0.9946


Epoch 12/20: 100%|██████████| 625/625 [00:35<00:00, 17.42it/s, accuracy=0.995, loss=0.000208]


Epoch 12, Loss: 0.01330933862022357, Accuracy: 0.9952


Epoch 13/20: 100%|██████████| 625/625 [00:36<00:00, 17.31it/s, accuracy=0.997, loss=0.000167]


Epoch 13, Loss: 0.010704210755182431, Accuracy: 0.99655


Epoch 14/20: 100%|██████████| 625/625 [00:35<00:00, 17.43it/s, accuracy=0.997, loss=0.000132]


Epoch 14, Loss: 0.00843191039004596, Accuracy: 0.997025


Epoch 15/20: 100%|██████████| 625/625 [00:35<00:00, 17.41it/s, accuracy=0.998, loss=0.000104]


Epoch 15, Loss: 0.006642530341190286, Accuracy: 0.997775


Epoch 16/20: 100%|██████████| 625/625 [00:35<00:00, 17.41it/s, accuracy=0.998, loss=9.62e-5] 


Epoch 16, Loss: 0.0061551289284077935, Accuracy: 0.9977


Epoch 17/20: 100%|██████████| 625/625 [00:35<00:00, 17.41it/s, accuracy=0.998, loss=7.39e-5]


Epoch 17, Loss: 0.004730923702463042, Accuracy: 0.998375


Epoch 18/20: 100%|██████████| 625/625 [00:34<00:00, 18.00it/s, accuracy=0.998, loss=7.54e-5]


Epoch 18, Loss: 0.00482643639507587, Accuracy: 0.998325


Epoch 19/20: 100%|██████████| 625/625 [00:35<00:00, 17.58it/s, accuracy=0.999, loss=5.62e-5]


Epoch 19, Loss: 0.0035974783867248335, Accuracy: 0.99865


Epoch 20/20: 100%|██████████| 625/625 [00:35<00:00, 17.41it/s, accuracy=0.999, loss=5.77e-5]

Epoch 20, Loss: 0.0036914174732752144, Accuracy: 0.99875





In [89]:
# Evaluation function
def evaluate(model, test_loader):
    model.eval()
    correct = 0
    total = 0
    total_loss = 0
    with torch.no_grad():
        for texts, labels in test_loader:
            outputs = model(texts)
            loss = criterion(outputs, labels)
            total_loss += loss.item()
            correct += calculate_accuracy(outputs, labels)
            total += labels.size(0)
    accuracy = correct / total
    avg_loss = total_loss / total
    return accuracy, avg_loss

# Evaluate the model
accuracy, avg_loss = evaluate(model, test_loader)
print(f'Test Accuracy: {accuracy}, Test Loss: {avg_loss}')


Test Accuracy: 0.851, Test Loss: 0.014452516278624535
