In [2]:
# imports useful packages, e.g. pandas, numpy, random.
import pandas as pd
# imports Feature Engineered/Supervised models from scikit learn.
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
# Function to preprocess the data
def extract_data():
    # Extracts data from a CSV file
    data = pd.read_csv('/Users/charan/Desktop/Lab 3/IMDB dataset.csv')
    return data['text'], data['label'] 

In [4]:
# Function for feature extraction
def get_tfidf_vectors(sentence):
    """
    :description: gets the TF-IDF values for the words in each sentence.
    :parameters: sentences as strings saved in list.
    :return: a list with TF-IDF values for each sentence (will be used as features).
    """
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(sentence)
    print("n_samples: %d, n_features: %d" % vectors.shape)
    tf_idf = pd.DataFrame(vectors.todense()).iloc[:len(sentence)]
    tf_idf.columns = vectorizer.get_feature_names_out()
    tfidf_matrix = tf_idf.T
    features = []
    for i in tfidf_matrix:
       features.append(list(tfidf_matrix[i]))
    return features

In [6]:
# Function for training the SVM model
def svm_classifier(train_features, train_labels, test_features):
    clf = SVC(kernel='sigmoid') # Selects and defines the SVM model
    clf.fit(train_features, train_labels) # Trains the model
    pred = clf.predict(test_features) # Predicts labels of test features
    return pred #returns predictions

In [8]:
#Exercise-3

In [10]:
##Step 1: Import, Load and preprocess the dataset.

In [12]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
# Step 1: Data Preprocessing

In [16]:
def extract_data():
    data = pd.read_csv('/Users/charan/Desktop/Lab 3/IMDB dataset.csv')
    return data['text'], data['label']

In [18]:
# Step 2: Feature Extraction

In [20]:
def get_tfidf_vectors(sentences):
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(sentences)
    return vectors
# Custom function to calculate evaluation metrics without using Scikit-Learn's confusion matrix 
def calculate_metrics(true_labels, predictions):
    TP = sum((true_labels == 1) & (predictions == 1))
    TN = sum((true_labels == 0) & (predictions == 0))
    FP = sum((true_labels == 0) & (predictions == 1))
    FN = sum((true_labels == 1) & (predictions == 0))

    precision = TP / (TP + FP) if (TP + FP) > 0 else 0
    recall = TP / (TP + FN) if (TP + FN) > 0 else 0
    f1_score = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0
    accuracy = (TP + TN) / (TP + TN + FP + FN)

    return precision, recall, f1_score, accuracy

In [22]:
# Step 3: Train-Test Split

In [24]:
sentence, labels = extract_data()
sentence_features = get_tfidf_vectors(sentence)
train_features, test_features, train_labels, test_labels = train_test_split(
    sentence_features, labels, test_size=0.2, stratify=labels
)

In [25]:
# Step 4: Model Training and Prediction

In [26]:
clf = RandomForestClassifier()
clf.fit(train_features, train_labels)
predictions = clf.predict(test_features)

In [29]:
# Step 5: Calculate Evaluation Metrics with custom function

In [30]:
precision, recall, f1, accuracy = calculate_metrics(test_labels.to_numpy(), predictions)

# Output the calculated metrics
print("Precision =", precision)
print("Recall =", recall)
print("F1-score =", f1)
print("Accuracy =", accuracy)


Precision = 0.8115079365079365
Recall = 0.8131212723658051
F1-score = 0.8123138033763654
Accuracy = 0.811


In [17]:
#Exercise 4

In [None]:
#BERT-Based Sentiment Analysis Using Hugging Face
#Step 1: Install Required Libraries

In [34]:
pip install transformers


Note: you may need to restart the kernel to use updated packages.


In [None]:
#Step 2: Import Libraries
#Step 3: Load and Inspect the Dataset

In [35]:
import pandas as pd

data = pd.read_csv('/Users/charan/Desktop/Lab 3/IMDB dataset.csv')
print(data.head())


                                                text  label
0  It's been about 14 years since Sharon Stone aw...      0
1  someone needed to make a car payment... this i...      0
2  The Guidelines state that a comment must conta...      0
3  This movie is a muddled mish-mash of clichés f...      0
4  Before Stan Laurel became the smaller half of ...      0


In [None]:
#Step 4: Initialize the BERT Tokenizer

In [38]:
from transformers import BertTokenizer

# Initialize the tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenize the text column from the dataset
inputs = tokenizer(list(data['text']), return_tensors="pt", padding=True, truncation=True, max_length=512)
inputs



{'input_ids': tensor([[  101,  2009,  1005,  ...,  1998,  2585,   102],
        [  101,  2619,  2734,  ...,     0,     0,     0],
        [  101,  1996, 11594,  ...,     0,     0,     0],
        ...,
        [  101,  2017,  2064,  ...,     0,     0,     0],
        [  101, 11865,  5753,  ...,     0,     0,     0],
        [  101,  1000,  1996,  ...,     0,     0,     0]]), 'token_type_ids': tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]]), 'attention_mask': tensor([[1, 1, 1,  ..., 1, 1, 1],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        ...,
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0],
        [1, 1, 1,  ..., 0, 0, 0]])}

In [40]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments

model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
training_args = TrainingArguments(
    output_dir='./results', 
    evaluation_strategy="epoch", 
    per_device_train_batch_size=8, 
    per_device_eval_batch_size=8, 
    num_train_epochs=3,
    weight_decay=0.01,
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [56]:
#Step 5: Prepare Custom Dataset Class for PyTorch
#Step 6: Split the Dataset into Training and Validation Sets


In [42]:
import torch
from torch.utils.data import Dataset, DataLoader, random_split

# Define a custom dataset class for PyTorch
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# Encode the labels and split data
labels = list(data['label'])
dataset = SentimentDataset(inputs, labels)

# Split into training and validation datasets
train_size = int(0.8 * len(dataset))
val_size = len(dataset) - train_size
train_dataset, eval_dataset = random_split(dataset, [train_size, val_size])



In [58]:
#Step 7: Set Up Training Arguments
#Step 8: Initialize the Trainer

In [44]:
from transformers import Trainer

trainer = Trainer(
    model=model, 
    args=training_args, 
    train_dataset=train_dataset, 
    eval_dataset=eval_dataset,
)


In [48]:
class SentimentDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: val[idx].clone().detach() for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

# For predictions and evaluation:
# Make predictions on validation set
predictions = trainer.predict(eval_dataset)
preds = np.argmax(predictions.predictions, axis=1)
labels = predictions.label_ids



  item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}


In [50]:
# Calculate metrics
precision = precision_score(labels, preds, average='weighted')
recall = recall_score(labels, preds, average='weighted')
f1 = f1_score(labels, preds, average='weighted')
accuracy = accuracy_score(labels, preds)


In [52]:
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")
print(f"Accuracy: {accuracy}")

Precision: 0.5232979502766737
Recall: 0.509
F1 Score: 0.3547981323884897
Accuracy: 0.509
