# In [None]:
# TASK 4 – ML + VISUALISATIONS

import pandas as pd
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay

# 1. LOAD DATASET
# Read the raw review export with pandas' pure-Python parser.
# quoting=3 is the numeric value of csv.QUOTE_NONE, i.e. quote characters
# are treated as ordinary text.
# NOTE(review): with QUOTE_NONE a comma inside a quoted review is still a
# field separator - confirm this matches how the CSV was written.
df = pd.read_csv("/content/Amazon_Reviews.csv", quoting=3, engine="python")

# Quick sanity check of how many rows/columns were parsed.
print("Loaded shape:", df.shape)

# 2. CLEAN DATA
# Keep only the two columns used downstream. .copy() makes an independent
# frame, so the column assignments below do not write into a slice of the
# raw data (avoids pandas' SettingWithCopyWarning).
df = df[['Rating', 'Review Text']].copy()

# Ratings that cannot be parsed as numbers become NaN and are dropped below.
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Drop incomplete rows BEFORE casting the text to str: astype(str) turns a
# missing value into the literal string "nan", which dropna() can no longer
# detect, so empty reviews would otherwise be kept as the word "nan".
df.dropna(inplace=True)
df['Review Text'] = df['Review Text'].astype(str)

# sentiment label: 1 = positive (rating >= 4), 0 = negative (rating < 4)
df['sentiment'] = (df['Rating'] >= 4).astype(int)

print("Sentiment counts:\n", df['sentiment'].value_counts())

# 3. TF-IDF FEATURES
y = df['sentiment']

# Split the raw text first so the vectoriser is fitted on training reviews
# only; fitting on the whole corpus would leak test-set vocabulary and IDF
# statistics into the features and inflate the reported accuracy.
# random_state pins the shuffle so the accuracy and plots are reproducible.
text_train, text_test, y_train, y_test = train_test_split(
    df['Review Text'], y, test_size=0.2, stratify=y, random_state=42
)

tfidf = TfidfVectorizer(max_features=2000)
X_train = tfidf.fit_transform(text_train)  # learn vocabulary + IDF on train
X_test = tfidf.transform(text_test)        # reuse them unchanged on test

# Full-corpus feature matrix, kept under its original name for any later
# code that refers to X. It uses the train-fitted vectoriser (transform only).
X = tfidf.transform(df['Review Text'])

# 4. TRAIN MODEL
# max_iter is raised from the default so the solver can converge on the
# 2000-dimensional sparse features.
model = LogisticRegression(max_iter=1000)
model.fit(X_train, y_train)

# Evaluate on the held-out 20% split.
pred = model.predict(X_test)
acc = accuracy_score(y_test, pred)

print("Accuracy:", acc)

# 5. VISUALISATIONS
# One 2x2 figure; each panel draws on its own explicit Axes rather than on
# pyplot's implicit "current axes".
fig, axes = plt.subplots(2, 2, figsize=(12, 10))
ax_rating, ax_sentiment, ax_acc, ax_cm = axes.ravel()

#  Plot 1: Rating distribution (bars ordered by rating value)
df['Rating'].value_counts().sort_index().plot(kind='bar', ax=ax_rating)
ax_rating.set_title("Distribution of Customer Ratings")
ax_rating.set_xlabel("Rating")
ax_rating.set_ylabel("Count")

#  Plot 2: Sentiment distribution
# value_counts() orders by frequency, so the first bar is whichever class is
# more common. Force the order 0, 1 (and a zero bar for an absent class)
# before attaching the labels, otherwise the bars can be mislabelled.
sentiment_counts = df['sentiment'].value_counts().reindex([0, 1], fill_value=0)
sentiment_counts.index = ['Negative', 'Positive']
sentiment_counts.plot(kind='bar', ax=ax_sentiment)
ax_sentiment.set_title("Sentiment Distribution")
ax_sentiment.set_xlabel("Sentiment")
ax_sentiment.set_ylabel("Count")

#  Plot 3: Accuracy (y-axis fixed to the full 0-1 range)
ax_acc.bar(['Accuracy'], [acc])
ax_acc.set_ylim(0, 1)
ax_acc.set_title("Model Accuracy")

#  Plot 4: Confusion matrix
# labels=[0, 1] keeps the matrix 2x2 and in the same order as display_labels
# even if one class happens to be missing from y_test or pred.
cm = confusion_matrix(y_test, pred, labels=[0, 1])
disp = ConfusionMatrixDisplay(cm, display_labels=['Negative', 'Positive'])
disp.plot(ax=ax_cm)

fig.tight_layout()
plt.show()