<a href="https://colab.research.google.com/github/RaveeMishra/Sentiment_analysis/blob/main/INSTA_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:

%%writefile requirements.txt

numpy==1.26.4
scikit-learn==1.4.2
torch==2.2.2
transformers==4.41.2
datasets==2.18.0
tqdm
pandas


Writing requirements.txt


In [2]:
pip install -r requirements.txt

Collecting numpy==1.26.4 (from -r requirements.txt (line 2))
  Downloading numpy-1.26.4-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (61 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/61.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━[0m [32m51.2/61.0 kB[0m [31m5.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m61.0/61.0 kB[0m [31m953.5 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting scikit-learn==1.4.2 (from -r requirements.txt (line 3))
  Downloading scikit_learn-1.4.2-cp312-cp312-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (11 kB)
Collecting torch==2.2.2 (from -r requirements.txt (line 4))
  Downloading torch-2.2.2-cp312-cp312-manylinux1_x86_64.whl.metadata (25 kB)
Collecting transformers==4.41.2 (from -r requirements.txt (line 5))
  Downloading transformers-4.41.2-py3-none-any.whl.metadata (43 kB)
[2K    

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Read the raw reviews, keep only the review text and the star rating,
# and discard every row in which either of the two is missing
df = (
    pd.read_csv("INSTAGRAM_REVIEWS.csv")
      .loc[:, ["review_text", "review_rating"]]
      .dropna()
)

# Create sentiment label
def create_label(rating):
    """Map a star rating to a binary sentiment label.

    Returns 1 (positive) for ratings of 4 or more, 0 (negative) for ratings
    of 2 or less, and None for anything else so the caller can drop the row.
    """
    if rating >= 4:
        return 1
    if rating <= 2:
        return 0
    # Neither clearly positive nor clearly negative
    return None

# Derive the sentiment label from the star rating
df["label"] = df["review_rating"].apply(create_label)

# Neutral reviews carry no label: drop them, rename review_text to the column
# name the training code reads, and store the label as an integer
df = (
    df.dropna(subset=["label"])
      .rename(columns={"review_text": "text"})
      .astype({"label": int})
)

# Stratified 80% / 20% split over the two columns used for training
train_df, test_df = train_test_split(
    df[["text", "label"]],
    stratify=df["label"],
    test_size=0.2,
    random_state=42,
)

# Persist both splits for the training cell
train_df.to_csv("instagram_train.csv", index=False)
test_df.to_csv("instagram_test.csv", index=False)

# Short report: split sizes and the class balance of the training split
print("Dataset prepared & split successfully ✔")
print("Train size:", len(train_df))
print("Test size:", len(test_df))
print("\nLabel distribution (Train):", train_df["label"].value_counts(), sep="\n")


Dataset prepared & split successfully ✔
Train size: 2339688
Test size: 584923

Label distribution (Train):
label
1    1767852
0     571836
Name: count, dtype: int64


In [5]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import pandas as pd
from transformers import AutoTokenizer

from sklearn.metrics import precision_score, recall_score, f1_score, roc_auc_score , accuracy_score
from sklearn.cluster import KMeans
from sklearn.feature_extraction.text import TfidfVectorizer

from tqdm import tqdm
import numpy as np


############################################
# CONFIG
############################################

MODEL_NAME = "distilbert-base-uncased"  # checkpoint name handed to AutoTokenizer
BATCH_SIZE = 32   # examples per DataLoader batch
EPOCHS = 3        # passes over the training split
LR = 2e-4         # learning rate given to AdamW
MAX_LEN = 128     # reviews are truncated / padded to this many tokens
SEED = 42         # seed shared by the random number generators below

DEVICE = "cuda" if torch.cuda.is_available() else "cpu"

# Seed numpy and torch once, up front. Nothing else in this notebook seeds
# them, so weight initialisation, dropout and DataLoader shuffling would
# otherwise differ on every run.
np.random.seed(SEED)
torch.manual_seed(SEED)

tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)

############################################
# DATASET
############################################

class InstagramSentimentDataset(Dataset):
    """Dataset over a CSV file that has a ``text`` and a ``label`` column.

    Items are tokenised on access with the module-level ``tokenizer`` and
    padded / truncated to ``MAX_LEN`` tokens.
    """

    def __init__(self, csv_path):
        """Read ``csv_path`` and keep its texts and labels as Python lists.

        Args:
            csv_path: path of a CSV file with ``text`` and ``label`` columns.
        """

        df = pd.read_csv(csv_path)

        # read_csv parses empty fields and markers such as "NA" as float NaN
        # (and an all-numeric column as numbers). The tokenizer only accepts
        # strings, so normalise every entry to str here instead of failing
        # in the middle of an epoch.
        self.texts = df["text"].fillna("").astype(str).tolist()
        self.labels = df["label"].tolist()

    def __len__(self):
        """Number of rows read from the CSV."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Tokenise row ``idx``.

        Returns:
            dict with ``input_ids`` (the token ids with the batch dimension
            added by ``return_tensors="pt"`` squeezed away) and ``label``
            (a scalar ``torch.long`` tensor).
        """

        enc = tokenizer(
            self.texts[idx],
            truncation=True,
            padding="max_length",
            max_length=MAX_LEN,
            return_tensors="pt"
        )

        return {
            "input_ids": enc["input_ids"].squeeze(0),
            "label": torch.tensor(self.labels[idx], dtype=torch.long)
        }


def get_loader(csv_path, shuffle=True):
    """Build a DataLoader over the sentiment CSV at ``csv_path``.

    Args:
        csv_path: CSV file passed on to InstagramSentimentDataset.
        shuffle: whether the loader reshuffles the rows every epoch.

    Returns:
        A DataLoader yielding batches of ``BATCH_SIZE`` items, loaded by two
        worker processes.
    """

    return DataLoader(
        InstagramSentimentDataset(csv_path),
        batch_size=BATCH_SIZE,
        shuffle=shuffle,
        num_workers=2,
        # Pinned host memory only helps host-to-GPU copies; requesting it on a
        # CPU-only machine is useless, so enable it only when CUDA is present.
        pin_memory=torch.cuda.is_available()
    )


############################################
# MODELS (SAME AS YOUR CODE)
############################################

class CNNClassifier(nn.Module):
    """Convolutional text classifier over token ids.

    Token ids are embedded, passed through one width-5 convolution with a
    ReLU, max-pooled over the sequence axis and projected to two logits.
    """

    def __init__(self, vocab=30522, embed=128):
        """
        Args:
            vocab: number of rows in the embedding table.
            embed: size of each token embedding.
        """
        super().__init__()

        n_filters = 256

        self.embedding = nn.Embedding(num_embeddings=vocab, embedding_dim=embed)

        self.conv = nn.Conv1d(in_channels=embed, out_channels=n_filters, kernel_size=5)
        self.pool = nn.AdaptiveMaxPool1d(output_size=1)

        self.dropout = nn.Dropout(p=0.3)
        self.fc = nn.Linear(in_features=n_filters, out_features=2)

    def forward(self, ids):
        """Return two logits per row of ``ids`` (a batch of token-id sequences)."""

        embedded = self.embedding(ids)

        # Conv1d convolves over the last axis, so put the embedding axis
        # second and the sequence axis last
        channels_first = embedded.transpose(1, 2)

        activated = self.conv(channels_first).relu()

        # Strongest response of each filter anywhere in the sequence
        pooled = self.pool(activated).squeeze(-1)

        return self.fc(self.dropout(pooled))


class RNNClassifier(nn.Module):
    """Bidirectional LSTM text classifier over token ids.

    Token ids are embedded and run through a bidirectional LSTM; the last two
    entries of its final hidden state are concatenated and projected to two
    logits.
    """

    def __init__(self, vocab=30522, embed=128, hidden=256):
        """
        Args:
            vocab: number of rows in the embedding table.
            embed: size of each token embedding.
            hidden: hidden size of the LSTM, per direction.
        """
        super().__init__()

        self.embedding = nn.Embedding(num_embeddings=vocab, embedding_dim=embed)

        self.lstm = nn.LSTM(
            input_size=embed,
            hidden_size=hidden,
            bidirectional=True,
            batch_first=True,
        )

        self.dropout = nn.Dropout(p=0.3)
        # Twice the hidden size: one final state per direction
        self.fc = nn.Linear(in_features=2 * hidden, out_features=2)

    def forward(self, ids):
        """Return two logits per row of ``ids`` (a batch of token-id sequences)."""

        _, (h_n, _) = self.lstm(self.embedding(ids))

        # The last two slices of h_n are the final states of the two
        # directions; join them per example along the feature axis
        second_last, last = h_n[-2], h_n[-1]
        features = torch.cat([second_last, last], dim=1)

        return self.fc(self.dropout(features))


############################################
# METRICS (UNCHANGED)
############################################

def compute_metrics(logits, labels):
    """Compute binary classification metrics from raw logits.

    Args:
        logits: tensor with one row per example and one column per class;
            column 1 is treated as the positive class.
        labels: tensor of 0/1 targets, one per example.

    Returns:
        dict with the keys "Precision", "Recall", "F1", "ROC-AUC" and
        "Accuracy". "ROC-AUC" is NaN when ``labels`` holds a single class.
    """

    # detach() keeps this usable on logits that still carry autograd history
    logits = logits.detach()

    probs = torch.softmax(logits, dim=1)[:, 1].cpu().numpy()
    preds = torch.argmax(logits, dim=1).cpu().numpy()
    labels = labels.cpu().numpy()

    # ROC-AUC is undefined when only one class is present; the scikit-learn
    # release pinned in requirements.txt raises ValueError in that case, which
    # would abort the evaluation of a small or one-sided split. Report NaN
    # for that metric and still compute the others.
    if np.unique(labels).size < 2:
        roc_auc = float("nan")
    else:
        roc_auc = roc_auc_score(labels, probs)

    return {
        "Precision": precision_score(labels, preds),
        "Recall": recall_score(labels, preds),
        "F1": f1_score(labels, preds),
        "ROC-AUC": roc_auc,
        "Accuracy": accuracy_score(labels, preds)
    }


############################################
# TRAINING
############################################

def evaluate(model, loader):
    """Score ``model`` on every batch of ``loader``, print and return the metrics.

    Args:
        model: module called as ``model(input_ids)`` and returning logits.
        loader: iterable of batches, each a dict with ``input_ids`` and
            ``label`` tensors.

    Returns:
        The dict produced by ``compute_metrics`` (previously this function
        only printed it and returned None).
    """

    model.eval()

    all_logits = []
    all_labels = []

    with torch.no_grad():

        for batch in loader:

            ids = batch["input_ids"].to(DEVICE)

            logits = model(ids)

            # Park the outputs on the CPU so the accumulated results do not
            # occupy device memory; labels are only needed for the metrics,
            # so they never have to be moved to the device at all.
            all_logits.append(logits.cpu())
            all_labels.append(batch["label"])

    metrics = compute_metrics(
        torch.cat(all_logits),
        torch.cat(all_labels)
    )

    print("\nEvaluation Metrics:")
    for k, v in metrics.items():
        print(f"{k}: {v:.4f}")

    return metrics


def train(model_type="cnn"):
    """Train a classifier on instagram_train.csv and save its weights.

    The model is evaluated on instagram_test.csv after every epoch and its
    state dict is written to ``{model_type}_instagram_sentiment.pt`` at the end.

    Args:
        model_type: "cnn" selects CNNClassifier; any other value selects
            RNNClassifier. Also used as the prefix of the saved file name.

    Returns:
        The trained model, so callers can reuse it without reloading the
        checkpoint (callers that ignored the previous None are unaffected).
    """

    train_loader = get_loader("instagram_train.csv", shuffle=True)
    test_loader = get_loader("instagram_test.csv", shuffle=False)

    if model_type == "cnn":
        model = CNNClassifier().to(DEVICE)
    else:
        model = RNNClassifier().to(DEVICE)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):

        # evaluate() leaves the model in eval mode, so switch back each epoch
        model.train()

        # The description is constant within an epoch: set it once here
        # instead of on every batch
        loop = tqdm(train_loader, desc=f"Epoch {epoch+1}")

        for batch in loop:

            ids = batch["input_ids"].to(DEVICE)
            labels = batch["label"].to(DEVICE)

            logits = model(ids)
            loss = criterion(logits, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loop.set_postfix(loss=loss.item())

        evaluate(model, test_loader)

    torch.save(model.state_dict(), f"{model_type}_instagram_sentiment.pt")

    print("\nModel saved ✔")

    return model


############################################
# KMEANS (UNSUPERVISED)
############################################

def run_kmeans(csv_path="instagram_train.csv", n_samples=5000, n_clusters=2):
    """Cluster review texts with TF-IDF features and KMeans, then print cluster sizes.

    Args:
        csv_path: CSV file with a ``text`` column.
        n_samples: number of leading rows to cluster; None uses every row.
        n_clusters: number of KMeans clusters.

    The defaults reproduce the previously hard-coded behaviour.
    """

    print("\nRunning KMeans clustering...")

    df = pd.read_csv(csv_path)

    # TfidfVectorizer rejects NaN documents, and read_csv yields NaN for
    # empty / "NA" fields, so drop those and make sure every entry is a str
    texts = df["text"].dropna().astype(str)[:n_samples]

    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words="english"
    )

    X = vectorizer.fit_transform(texts)

    kmeans = KMeans(n_clusters=n_clusters, random_state=42)
    kmeans.fit(X)

    # Size of each cluster, keyed by cluster id
    unique, counts = np.unique(kmeans.labels_, return_counts=True)

    print("Cluster Distribution:")
    print(dict(zip(unique, counts)))


############################################
# MAIN
############################################

if __name__ == "__main__":

    print("Device:", DEVICE)

    # Train both architectures back to back, ruling off the log after each
    for architecture in ("cnn", "rnn"):
        train(model_type=architecture)
        print('*' * 100)

    # Finish with the unsupervised clustering pass
    run_kmeans()


Device: cuda


Epoch 1: 100%|██████████| 73116/73116 [15:19<00:00, 79.51it/s, loss=0.248]



Evaluation Metrics:
Precision: 0.8939
Recall: 0.9574
F1: 0.9246
ROC-AUC: 0.8878
Accuracy: 0.8819


Epoch 2: 100%|██████████| 73116/73116 [14:52<00:00, 81.88it/s, loss=1.07]



Evaluation Metrics:
Precision: 0.8977
Recall: 0.9555
F1: 0.9257
ROC-AUC: 0.8899
Accuracy: 0.8841


Epoch 3: 100%|██████████| 73116/73116 [14:59<00:00, 81.30it/s, loss=0.0837]



Evaluation Metrics:
Precision: 0.9010
Recall: 0.9524
F1: 0.9260
ROC-AUC: 0.8913
Accuracy: 0.8850

Model saved ✔
****************************************************************************************************


Epoch 1: 100%|██████████| 73116/73116 [22:57<00:00, 53.07it/s, loss=0.915]



Evaluation Metrics:
Precision: 0.8974
Recall: 0.9560
F1: 0.9258
ROC-AUC: 0.8914
Accuracy: 0.8842


Epoch 2: 100%|██████████| 73116/73116 [22:42<00:00, 53.68it/s, loss=0.0798]



Evaluation Metrics:
Precision: 0.8977
Recall: 0.9587
F1: 0.9272
ROC-AUC: 0.8942
Accuracy: 0.8862


Epoch 3: 100%|██████████| 73116/73116 [22:39<00:00, 53.80it/s, loss=0.972]



Evaluation Metrics:
Precision: 0.8991
Recall: 0.9577
F1: 0.9274
ROC-AUC: 0.8944
Accuracy: 0.8868

Model saved ✔
****************************************************************************************************

Running KMeans clustering...
Cluster Distribution:
{0: 3988, 1: 1012}
