<a href="https://colab.research.google.com/github/RaveeMishra/Sentiment_analysis/blob/main/K_means_insta_sentiment.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

# Load dataset
df = pd.read_csv("INSTAGRAM_REVIEWS.csv")

# Keep only needed columns
df = df[["review_text", "review_rating"]]

# Remove missing values
df = df.dropna()

# Create sentiment label
def create_label(rating):
    if rating >= 4:
        return 1  # Positive
    elif rating <= 2:
        return 0  # Negative
    else:
        return None  # Neutral (we remove)

df["label"] = df["review_rating"].apply(create_label)

# Remove neutral reviews
df = df.dropna(subset=["label"])

# Rename review_text to text (to match your training code)
df = df.rename(columns={"review_text": "text"})

# Convert label to integer
df["label"] = df["label"].astype(int)

# Split (80% train / 20% test)
train_df, test_df = train_test_split(
    df[["text", "label"]],
    test_size=0.2,
    random_state=42,
    stratify=df["label"]
)

# Save files
train_df.to_csv("instagram_train.csv", index=False)
test_df.to_csv("instagram_test.csv", index=False)

print("Dataset prepared & split successfully ✔")
print("Train size:", len(train_df))
print("Test size:", len(test_df))
print("\nLabel distribution (Train):")
print(train_df["label"].value_counts())


Dataset prepared & split successfully ✔
Train size: 20104
Test size: 5027

Label distribution (Train):
label
1    13678
0     6426
Name: count, dtype: int64


In [4]:
from sklearn.cluster import MiniBatchKMeans
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score
import numpy as np
import pandas as pd

def run_kmeans():

    print("\nRunning KMeans clustering (MiniBatch version)...")

    df = pd.read_csv("instagram_train.csv")

    texts = df["text"][:5000]
    labels = df["label"][:5000].values  # true labels

    vectorizer = TfidfVectorizer(
        max_features=5000,
        stop_words="english"
    )

    X = vectorizer.fit_transform(texts)

    # Use MiniBatchKMeans to avoid version issues
    kmeans = MiniBatchKMeans(n_clusters=2, random_state=42, batch_size=256)
    kmeans.fit(X)

    preds = kmeans.labels_

    # Map clusters to actual labels
    cluster_to_label = {}
    for cluster in [0, 1]:
        indices = np.where(preds == cluster)[0]
        majority_label = np.bincount(labels[indices]).argmax()
        cluster_to_label[cluster] = majority_label

    mapped_preds = np.array([cluster_to_label[c] for c in preds])

    # Compute metrics directly
    precision = precision_score(labels, mapped_preds)
    recall = recall_score(labels, mapped_preds)
    f1 = f1_score(labels, mapped_preds)
    accuracy = accuracy_score(labels, mapped_preds)
    roc_auc = roc_auc_score(labels, mapped_preds)

    print("\nKMeans Metrics:")
    print(f"Precision : {precision:.4f}")
    print(f"Recall    : {recall:.4f}")
    print(f"F1        : {f1:.4f}")
    print(f"Accuracy  : {accuracy:.4f}")
    print(f"ROC-AUC   : {roc_auc:.4f}")

    # Cluster distribution
    unique, counts = np.unique(preds, return_counts=True)
    print("\nCluster Distribution:")
    print(dict(zip(unique, counts)))


In [5]:
run_kmeans()


Running KMeans clustering (MiniBatch version)...

KMeans Metrics:
Precision : 0.6844
Recall    : 1.0000
F1        : 0.8126
Accuracy  : 0.6844
ROC-AUC   : 0.5000

Cluster Distribution:
{np.int32(0): np.int64(4388), np.int32(1): np.int64(612)}
