In [None]:
import pandas as pd

def read_data(path, limit = None):
    """Load a fastText-style labelled review file into a DataFrame.

    Each non-blank line is expected to look like ``__label__N <text>``: the
    first space-delimited token is the label and the remainder is the review.

    Args:
        path: Location of the UTF-8 text file to read.
        limit: Maximum number of rows to return. ``None`` (or any other falsy
            value such as ``0``) reads the whole file.

    Returns:
        A ``pandas.DataFrame`` with columns ``review`` (the text after the
        label) and ``label`` (1 when the label token is ``__label__2``,
        otherwise 0).
    """
    data = []
    with open(path, 'r', encoding = 'utf-8') as f:
        for line in f:
            if limit and len(data) >= limit:
                break
            line = line.strip()
            if not line:
                # Blank lines (e.g. a stray newline at EOF) carry no sample;
                # unpacking them used to raise ValueError.
                continue
            # partition() never raises: a label with no text yields text == ''.
            label, _, text = line.partition(' ')
            sentiment = 1 if label == '__label__2' else 0
            data.append((text, sentiment))
    return pd.DataFrame(data, columns = ['review', 'label'])

# Load at most 30,000 training rows and 10,000 test rows.
train_df = read_data('data/train.ft.txt/train.ft.txt', 30000)
test_df = read_data('data/test.ft.txt/test.ft.txt', 10000)

# Only the last expression of a cell is rendered automatically, so a bare
# `train_df.head()` in the middle of the cell was silently discarded.
# display() renders both previews.
display(train_df.head(), test_df.head())

Unnamed: 0,review,label
0,Great CD: My lovely Pat has one of the GREAT v...,1
1,One of the best game music soundtracks - for a...,1
2,Batteries died within a year ...: I bought thi...,0
3,"works fine, but Maha Energy is better: Check o...",1
4,Great for the non-audiophile: Reviewed quite a...,1


In [None]:
import re
def clean_text(text):
    """Normalise a raw review into lowercase, letters-and-whitespace-only text.

    Steps, in order: lowercase the text, drop tokens starting with ``http``,
    delete every character that is not an ASCII letter or whitespace, then
    collapse each run of whitespace into a single space.

    Args:
        text: Raw review string.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space but not stripped.
    """
    text = text.lower()
    text = re.sub(r"http\S+", "", text)
    # The range must be a-z, not A-z: in ASCII, "A-z" also spans [ \ ] ^ _ `
    # so those symbols used to survive cleaning. The text is already
    # lowercased, so a-z covers every ASCII letter that can remain.
    text = re.sub(r"[^a-z\s]", "", text)
    text = re.sub(r"\s+", " ", text)
    return text

# Store a cleaned copy of each review next to the raw text.
train_df['cleaned_reviews'] = train_df['review'].apply(clean_text)
test_df['cleaned_reviews'] = test_df['review'].apply(clean_text)

# Cast the label columns to an integer dtype.
train_df['label'] = train_df['label'].astype(int)
test_df['label'] = test_df['label'].astype(int)

# Only the last expression of a cell is rendered automatically, so a bare
# `train_df.head()` in the middle of the cell was silently discarded.
# display() renders both previews.
display(train_df.head(), test_df.head())

Unnamed: 0,review,label,cleaned_reviews
0,Great CD: My lovely Pat has one of the GREAT v...,1,great cd my lovely pat has one of the great vo...
1,One of the best game music soundtracks - for a...,1,one of the best game music soundtracks for a g...
2,Batteries died within a year ...: I bought thi...,0,batteries died within a year i bought this cha...
3,"works fine, but Maha Energy is better: Check o...",1,works fine but maha energy is better check out...
4,Great for the non-audiophile: Reviewed quite a...,1,great for the nonaudiophile reviewed quite a b...


In [None]:
from transformers import DistilBertTokenizerFast

# Tokenizer matching the "distilbert-base-uncased" checkpoint.
tokenizer = DistilBertTokenizerFast.from_pretrained("distilbert-base-uncased")

# Both splits are tokenized with identical settings: truncation and padding
# enabled, with max_length=128.
train_encoding = tokenizer(
    train_df["cleaned_reviews"].tolist(), max_length=128, padding=True, truncation=True
)
test_encoding = tokenizer(
    test_df["cleaned_reviews"].tolist(), max_length=128, padding=True, truncation=True
)


In [None]:
import torch

class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing encoded features with sentiment labels.

    ``encodings`` is a mapping from feature name to a per-example indexable
    sequence; ``labels`` is an indexable sequence with one entry per example.
    """

    def __init__(self, encodings, labels):
        self.labels = labels
        self.encodings = encodings

    def __len__(self):
        # The example count is taken from the label sequence.
        return len(self.labels)

    def __getitem__(self, idx):
        # One example: every encoded feature at position ``idx`` as a tensor,
        # plus the matching label under the "labels" key.
        sample = {}
        for name, values in self.encodings.items():
            sample[name] = torch.tensor(values[idx])
        sample["labels"] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and its label list in a SentimentDataset.
train_dataset = SentimentDataset(
    encodings=train_encoding,
    labels=train_df["label"].tolist(),
)
test_dataset = SentimentDataset(
    encodings=test_encoding,
    labels=test_df["label"].tolist(),
)


In [6]:
import os
# Set TRANSFORMERS_NO_TF=1 in this process's environment.
# NOTE(review): `transformers` is already imported by the tokenizer cell earlier
# in the notebook, so on a top-to-bottom run this assignment happens after that
# first import. Presumably the variable must be set before the first import to
# have any effect - confirm, and consider moving it to the very first cell.
os.environ["TRANSFORMERS_NO_TF"] = "1"


In [7]:
import os
# Set TRANSFORMERS_NO_TF=1 again; this repeats the assignment made in the
# previous cell, so one of the two is redundant.
# NOTE(review): `transformers` was first imported in the tokenizer cell above,
# so this likely runs too late to influence that import - confirm.
os.environ["TRANSFORMERS_NO_TF"] = "1"

from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments
import numpy as np
from sklearn.metrics import accuracy_score, precision_recall_fscore_support


In [None]:
from transformers import DistilBertForSequenceClassification

# Pick the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Pretrained "distilbert-base-uncased" checkpoint with a 2-label
# sequence-classification head.
model = DistilBertForSequenceClassification.from_pretrained(
    "distilbert-base-uncased",
    num_labels=2,
)
model.to(device)


In [None]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    # Where checkpoints and logs are written.
    output_dir="./results",
    logging_dir="./logs",
    # Training length and batch sizes.
    num_train_epochs=2,
    per_device_train_batch_size=64,
    per_device_eval_batch_size=64,
    # Evaluate and save once per epoch.
    evaluation_strategy="epoch",
    save_strategy="epoch",
    # Reload the best checkpoint at the end, ranked by the "accuracy" metric
    # (the key returned by compute_metrics).
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
)


In [None]:
from transformers import Trainer

from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    """Compute accuracy for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``; the
            predicted class is the argmax of the logits over the last axis.

    Returns:
        A dict with a single key, ``"accuracy"``.
    """
    scores, gold = eval_pred
    predicted = scores.argmax(axis=-1)
    return {"accuracy": accuracy_score(gold, predicted)}


# NOTE(review): eval_dataset is the test split, and training_args sets
# load_best_model_at_end=True, so checkpoint selection is driven by test-set
# accuracy. Consider a separate validation split for model selection.
trainer = Trainer(
    model=model,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)


In [11]:
# Run fine-tuning with the Trainer built in the previous cell; the returned
# TrainOutput is rendered as the cell result.
trainer.train()

                                                 
 50%|█████     | 469/938 [03:49<03:10,  2.47it/s]

{'eval_loss': 0.17419375479221344, 'eval_accuracy': 0.9354, 'eval_runtime': 22.8688, 'eval_samples_per_second': 437.278, 'eval_steps_per_second': 6.865, 'epoch': 1.0}


 53%|█████▎    | 500/938 [04:03<03:15,  2.25it/s]

{'loss': 0.2195, 'grad_norm': 2.7345032691955566, 'learning_rate': 2.3347547974413646e-05, 'epoch': 1.07}


                                                 
100%|██████████| 938/938 [07:33<00:00,  2.61it/s]

{'eval_loss': 0.1856209635734558, 'eval_accuracy': 0.9366, 'eval_runtime': 20.5794, 'eval_samples_per_second': 485.922, 'eval_steps_per_second': 7.629, 'epoch': 2.0}


100%|██████████| 938/938 [07:34<00:00,  2.07it/s]

{'train_runtime': 454.1135, 'train_samples_per_second': 132.126, 'train_steps_per_second': 2.066, 'train_loss': 0.16616664308983126, 'epoch': 2.0}





TrainOutput(global_step=938, training_loss=0.16616664308983126, metrics={'train_runtime': 454.1135, 'train_samples_per_second': 132.126, 'train_steps_per_second': 2.066, 'total_flos': 1987010979840000.0, 'train_loss': 0.16616664308983126, 'epoch': 2.0})

In [12]:
# Evaluate on the Trainer's eval_dataset and show the resulting metrics dict.
trainer.evaluate()

100%|██████████| 157/157 [00:20<00:00,  7.63it/s]


{'eval_loss': 0.1856209635734558,
 'eval_accuracy': 0.9366,
 'eval_runtime': 20.939,
 'eval_samples_per_second': 477.578,
 'eval_steps_per_second': 7.498,
 'epoch': 2.0}

In [None]:
from transformers import pipeline

# NOTE(review): model_path is assigned but not used in the cells visible here;
# the pipeline below wraps the in-memory `model`, not a checkpoint on disk.
model_path = "./results"

# device: GPU index 0 when CUDA is available, otherwise -1.
sentiment_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device=-1 if not torch.cuda.is_available() else 0,
)

# Quick smoke test on a single sentence.
sentiment_pipeline("I'm very disappointed with the service.")


[{'label': 'LABEL_0', 'score': 0.9976430535316467}]

In [14]:
# Write the model and the tokenizer into the same "./sentiment-model" directory.
model.save_pretrained("./sentiment-model")
# Last expression of the cell: the value returned here (the tuple of written
# tokenizer file paths) is what gets rendered as the cell output.
tokenizer.save_pretrained("./sentiment-model")


('./sentiment-model\\tokenizer_config.json',
 './sentiment-model\\special_tokens_map.json',
 './sentiment-model\\vocab.txt',
 './sentiment-model\\added_tokens.json',
 './sentiment-model\\tokenizer.json')