In [1]:
!pip install datasets






In [5]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load the dataset
df = pd.read_csv("IMDB Dataset.csv")

# Split the data into training and testing sets
X = df['review']
y = df['sentiment'].map({'positive': 1, 'negative': 0})

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Check the shape
print(X_train.shape, X_test.shape)


(40000,) (10000,)


In [3]:
from nltk.sentiment.vader import SentimentIntensityAnalyzer
import nltk

nltk.download('vader_lexicon')

# Initialize VADER sentiment analyzer
sia = SentimentIntensityAnalyzer()

# Example of scoring sentiment using VADER
sample_review = X_train.iloc[0]
vader_score = sia.polarity_scores(sample_review)
print(f"VADER sentiment score: {vader_score}")


VADER sentiment score: {'neg': 0.103, 'neu': 0.773, 'pos': 0.123, 'compound': 0.881}


[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\nives\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [4]:
!pip install transformers[torch]





In [5]:
from transformers import RobertaTokenizer, RobertaForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset

# Load the tokenizer and the model
tokenizer = RobertaTokenizer.from_pretrained('roberta-base')
model = RobertaForSequenceClassification.from_pretrained('roberta-base', num_labels=2)

# Tokenize the data
def tokenize_function(examples):
    return tokenizer(examples['review'], padding="max_length", truncation=True)

# Convert to Hugging Face Dataset format
train_dataset = Dataset.from_pandas(pd.DataFrame({'review': X_train, 'label': y_train}))
test_dataset = Dataset.from_pandas(pd.DataFrame({'review': X_test, 'label': y_test}))

train_dataset = train_dataset.map(tokenize_function, batched=True)
test_dataset = test_dataset.map(tokenize_function, batched=True)

# Training arguments
training_args = TrainingArguments(
    output_dir='./results',          
    evaluation_strategy="epoch",     
    learning_rate=2e-5,             
    per_device_train_batch_size=16, 
    per_device_eval_batch_size=16,  
    num_train_epochs=3,            
    weight_decay=0.01,
)

# Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)

# Fine-tune the model
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained('./roberta_imdb_model')
tokenizer.save_pretrained('./roberta_imdb_model')


  from .autonotebook import tqdm as notebook_tqdm





Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at roberta-base and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
Map: 100%|██████████| 40000/40000 [01:04<00:00, 621.93 examples/s]
Map: 100%|██████████| 10000/10000 [00:16<00:00, 588.87 examples/s]
  7%|▋         | 500/7500 [09:51<2:23:49,  1.23s/it]

{'loss': 0.2929, 'grad_norm': 6.247313976287842, 'learning_rate': 1.866666666666667e-05, 'epoch': 0.2}


 13%|█▎        | 1000/7500 [19:41<2:06:49,  1.17s/it]

{'loss': 0.2214, 'grad_norm': 13.562657356262207, 'learning_rate': 1.7333333333333336e-05, 'epoch': 0.4}


 20%|██        | 1500/7500 [29:36<1:57:09,  1.17s/it]

{'loss': 0.2146, 'grad_norm': 12.5281982421875, 'learning_rate': 1.6000000000000003e-05, 'epoch': 0.6}


 27%|██▋       | 2000/7500 [42:36<2:21:19,  1.54s/it]

{'loss': 0.1872, 'grad_norm': 8.160083770751953, 'learning_rate': 1.4666666666666666e-05, 'epoch': 0.8}


 33%|███▎      | 2500/7500 [55:34<2:07:30,  1.53s/it]

{'loss': 0.1852, 'grad_norm': 11.460078239440918, 'learning_rate': 1.3333333333333333e-05, 'epoch': 1.0}


                                                     
 33%|███▎      | 2500/7500 [1:33:02<2:07:30,  1.53s/it]

{'eval_loss': 0.22068648040294647, 'eval_runtime': 2240.7513, 'eval_samples_per_second': 4.463, 'eval_steps_per_second': 0.279, 'epoch': 1.0}


 40%|████      | 3000/7500 [1:45:31<1:53:37,  1.52s/it]   

{'loss': 0.1258, 'grad_norm': 0.1093222126364708, 'learning_rate': 1.2e-05, 'epoch': 1.2}


 47%|████▋     | 3500/7500 [1:58:28<1:26:16,  1.29s/it]

{'loss': 0.1306, 'grad_norm': 0.33916592597961426, 'learning_rate': 1.0666666666666667e-05, 'epoch': 1.4}


 53%|█████▎    | 4000/7500 [2:10:51<1:11:38,  1.23s/it]

{'loss': 0.1231, 'grad_norm': 15.131962776184082, 'learning_rate': 9.333333333333334e-06, 'epoch': 1.6}


 60%|██████    | 4500/7500 [2:23:03<59:42,  1.19s/it]  

{'loss': 0.128, 'grad_norm': 9.881794929504395, 'learning_rate': 8.000000000000001e-06, 'epoch': 1.8}


 67%|██████▋   | 5000/7500 [2:36:39<1:10:23,  1.69s/it]

{'loss': 0.1231, 'grad_norm': 0.09675367176532745, 'learning_rate': 6.666666666666667e-06, 'epoch': 2.0}


                                                       
 67%|██████▋   | 5000/7500 [3:16:39<1:10:23,  1.69s/it]

{'eval_loss': 0.16848549246788025, 'eval_runtime': 2393.8569, 'eval_samples_per_second': 4.177, 'eval_steps_per_second': 0.261, 'epoch': 2.0}


 73%|███████▎  | 5500/7500 [3:33:40<52:13,  1.57s/it]     

{'loss': 0.0791, 'grad_norm': 39.59916687011719, 'learning_rate': 5.333333333333334e-06, 'epoch': 2.2}


 80%|████████  | 6000/7500 [3:47:12<39:48,  1.59s/it]  

{'loss': 0.079, 'grad_norm': 62.80134201049805, 'learning_rate': 4.000000000000001e-06, 'epoch': 2.4}


 87%|████████▋ | 6500/7500 [4:01:01<27:45,  1.67s/it]  

{'loss': 0.0801, 'grad_norm': 1.2025119066238403, 'learning_rate': 2.666666666666667e-06, 'epoch': 2.6}


 93%|█████████▎| 7000/7500 [4:14:57<13:41,  1.64s/it]

{'loss': 0.0758, 'grad_norm': 54.94712829589844, 'learning_rate': 1.3333333333333334e-06, 'epoch': 2.8}


100%|██████████| 7500/7500 [4:28:18<00:00,  1.59s/it]

{'loss': 0.0643, 'grad_norm': 0.06320647895336151, 'learning_rate': 0.0, 'epoch': 3.0}


                                                     
100%|██████████| 7500/7500 [5:13:07<00:00,  2.50s/it]


{'eval_loss': 0.21452347934246063, 'eval_runtime': 2683.4153, 'eval_samples_per_second': 3.727, 'eval_steps_per_second': 0.233, 'epoch': 3.0}
{'train_runtime': 18787.0436, 'train_samples_per_second': 6.387, 'train_steps_per_second': 0.399, 'train_loss': 0.14067669881184897, 'epoch': 3.0}


('./roberta_imdb_model\\tokenizer_config.json',
 './roberta_imdb_model\\special_tokens_map.json',
 './roberta_imdb_model\\vocab.json',
 './roberta_imdb_model\\merges.txt',
 './roberta_imdb_model\\added_tokens.json')

In [21]:
import numpy as np
from torch.utils.data import DataLoader
import torch

# Helper function to get VADER scores
def get_vader_sentiment(text):
    return sia.polarity_scores(text)['compound']

# Generate predictions using RoBERTa
def predict_with_lexicon(model, tokenizer, texts, threshold=0.5, device='cuda' if torch.cuda.is_available() else 'cpu'):
    model.to(device)
    model.eval()

    # Ensure texts is a list
    if isinstance(texts, str):
        texts = [texts]

    # Tokenize texts
    inputs = tokenizer(
        texts,
        padding=True,
        truncation=True,
        return_tensors="pt",
        max_length=512
    ).to(device)

    with torch.no_grad():
        outputs = model(**inputs)
        logits = outputs.logits
        probs = torch.nn.functional.softmax(logits, dim=-1)
    
    # Get predictions from RoBERTa
    roberta_preds = np.argmax(probs.cpu().numpy(), axis=-1)

    # Integrate with VADER scores (optional enhancement)
    vader_preds = np.array([1 if get_vader_sentiment(text) >= threshold else 0 for text in texts])

    # Context-aware fusion of RoBERTa and VADER predictions
    final_preds = np.where(vader_preds == 1, vader_preds, roberta_preds)
    
    return final_preds

# Example usage
texts = X_test[:5].tolist()  # Ensure X_test is a pandas Series or list
predictions = predict_with_lexicon(model, tokenizer, texts)
print(predictions)




[1 1 1 1 0]


In [6]:
from sklearn.metrics import classification_report

# Generate predictions on the test set
test_preds = predict_with_lexicon(model, tokenizer, X_test.tolist())

# Classification report
print(classification_report(y_test, test_preds))


RuntimeError: [enforce fail at ..\c10\core\impl\alloc_cpu.cpp:72] data. DefaultCPUAllocator: not enough memory: you tried to allocate 15728640000 bytes.

In [7]:
from transformers import RobertaForSequenceClassification, RobertaTokenizer
import torch
import numpy as np

# Load the model and tokenizer
model = RobertaForSequenceClassification.from_pretrained('./roberta_imdb_model')
tokenizer = RobertaTokenizer.from_pretrained('./roberta_imdb_model')

# Function to predict sentiment using the loaded model
def predict_with_lexicon(model, tokenizer, texts):
    inputs = tokenizer(texts, padding=True, truncation=True, return_tensors="pt", max_length=512)
    outputs = model(**inputs)
    logits = outputs.logits
    probs = torch.nn.functional.softmax(logits, dim=-1)
    roberta_preds = np.argmax(probs.detach().numpy(), axis=-1)
    return roberta_preds

# Example of making predictions
texts = ["I loved this movie!", "This movie was not okay."]
predictions = predict_with_lexicon(model, tokenizer, texts)
print(predictions)  # Should print [1, 0] or [positive, negative]


[1 0]
