<a href="https://colab.research.google.com/github/OmChandraSharma/Sentiment_Analysis/blob/main/linear_regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import os

import joblib
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Mount Google Drive at /content/drive; every path configured below lives under it.
from google.colab import drive
drive.mount('/content/drive')

# === CONFIG ===
# Single definition of the pipeline root on Drive. WORKING_DIR and BASE_DIR are
# both kept as names, but they now share one literal so they cannot drift apart.
WORKING_DIR = "/content/drive/MyDrive/sentiment_regression_pipeline"
BASE_DIR = WORKING_DIR

DATA_DIR = os.path.join(BASE_DIR, "data")
MODEL_DIR = os.path.join(BASE_DIR, "artifacts/models")
VECTORIZER_DIR = os.path.join(BASE_DIR, "artifacts/vectorizers")
ENCODER_DIR = os.path.join(BASE_DIR, "artifacts/encoders")
PLOTS_DIR = os.path.join(BASE_DIR, "artifacts/plots")

# Create the root and each output directory up front; exist_ok makes re-runs a no-op.
for directory in (WORKING_DIR, DATA_DIR, MODEL_DIR, VECTORIZER_DIR, ENCODER_DIR, PLOTS_DIR):
    os.makedirs(directory, exist_ok=True)

# === STEP 1: Load Data ===
!pip install -q gdown
import gdown

# Download using the file ID
file_id = "133jd-yMyIpnVnHPYjiopu0XlpDk6I_H3"
gdown.download(f"https://drive.google.com/uc?id={file_id}", "clean_data.csv", quiet=False)

import pandas as pd

df = pd.read_csv("clean_data.csv")
print(df.head())
df = df.dropna(subset=["clean_text", "sentiment"])
df.to_csv(os.path.join(DATA_DIR, "clean_data.csv"), index=False)
print(f"📁 Loaded data with shape: {df.shape}")

# === STEP 2: TF-IDF Vectorization ===
# A default-configured vectorizer is fitted on the cleaned text column and, in the
# same call, produces the feature matrix X used for training.
vectorizer_path = os.path.join(VECTORIZER_DIR, "tfidf_vectorizer.pkl")

tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(df["clean_text"])

# Persist the fitted vectorizer so the same transformation can be reloaded later.
joblib.dump(tfidf_vectorizer, vectorizer_path)

# === STEP 3: Label Encoding ===
# Map the string sentiment labels to integer codes; y is the regression target.
# NOTE(review): LabelEncoder numbers classes in sorted order and the regression
# below treats those codes as an ordinal scale - confirm the sorted order of the
# sentiment labels matches the intended ordering.
encoder_path = os.path.join(ENCODER_DIR, "label_encoder.pkl")

label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df["sentiment"])

# Persist the fitted encoder so codes can be mapped back to labels later.
joblib.dump(label_encoder, encoder_path)

# === STEP 4: Train Model ===
# Final model: fitted on every row and persisted. `model` and the saved file are
# what the rest of the notebook (and any later loader) uses.
model = LinearRegression()
model.fit(X, y)

joblib.dump(model, os.path.join(MODEL_DIR, "linear_regression_model.pkl"))

# === STEP 5: Evaluation ===
# In-sample metrics: computed on the very rows the model was fitted on, so they
# describe goodness of fit and are optimistic as an estimate of generalisation.
y_pred = model.predict(X)
mse = mean_squared_error(y, y_pred)
r2 = r2_score(y, y_pred)

# Held-out metrics: fit a second, throwaway model on a seeded split and score it
# on rows it never saw. This costs one extra fit but is the only number here that
# can expose overfitting. Caveat: the TF-IDF vocabulary/IDF weights in X were
# fitted on all rows, so even this estimate is slightly optimistic.
HOLDOUT_FRACTION = 0.2
HOLDOUT_SEED = 42
X_fit, X_holdout, y_fit, y_holdout = train_test_split(
    X, y, test_size=HOLDOUT_FRACTION, random_state=HOLDOUT_SEED
)
holdout_model = LinearRegression()
holdout_model.fit(X_fit, y_fit)
y_holdout_pred = holdout_model.predict(X_holdout)
holdout_mse = mean_squared_error(y_holdout, y_holdout_pred)
holdout_r2 = r2_score(y_holdout, y_holdout_pred)

print("✅ Model trained and saved.")
print(f"📉 Mean Squared Error: {mse:.4f}")
print(f"📈 R² Score: {r2:.4f}")
print(f"🧪 Held-out ({HOLDOUT_FRACTION:.0%}) Mean Squared Error: {holdout_mse:.4f}")
print(f"🧪 Held-out ({HOLDOUT_FRACTION:.0%}) R² Score: {holdout_r2:.4f}")

# === STEP 6: Save Evaluation Plots ===

# Actual (encoded) sentiment on x against the model's continuous prediction on y.
scatter_path = os.path.join(PLOTS_DIR, "actual_vs_predicted.png")
fig, ax = plt.subplots(figsize=(8, 6))
ax.scatter(y, y_pred, alpha=0.4, color="blue", edgecolor="k")
ax.set_xlabel("Actual Sentiment (Encoded)")
ax.set_ylabel("Predicted Sentiment")
ax.set_title("Actual vs Predicted Sentiment (Linear Regression)")
ax.grid(True)
fig.savefig(scatter_path)
# Written to disk only; close the figure so it is not kept open in memory.
plt.close(fig)

# Distribution of the prediction errors (actual minus predicted), in 30 bins.
residuals = y - y_pred
histogram_path = os.path.join(PLOTS_DIR, "residuals_histogram.png")

fig, ax = plt.subplots(figsize=(8, 6))
ax.hist(residuals, bins=30, color='orange', edgecolor='black')
ax.set_title("Residuals Histogram")
ax.set_xlabel("Residual (Actual - Predicted)")
ax.set_ylabel("Frequency")
ax.grid(True)
fig.savefig(histogram_path)
plt.close(fig)

# Residual for each sample, drawn in row order as a single red line.
lineplot_path = os.path.join(PLOTS_DIR, "residuals_lineplot.png")

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(residuals, linestyle='-', color='red')
ax.set_title("Residuals Across Samples")
ax.set_xlabel("Sample Index")
ax.set_ylabel("Residual")
ax.grid(True)
fig.savefig(lineplot_path)
plt.close(fig)

print("📊 Graphs saved under:", PLOTS_DIR)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Downloading...
From (original): https://drive.google.com/uc?id=133jd-yMyIpnVnHPYjiopu0XlpDk6I_H3
From (redirected): https://drive.google.com/uc?id=133jd-yMyIpnVnHPYjiopu0XlpDk6I_H3&confirm=t&uuid=a26ecaf4-c9c0-4880-9744-cdce9cd04b59
To: /content/clean_data.csv
100%|██████████| 212M/212M [00:03<00:00, 56.8MB/s]


   polarity  id                          date    query      user  \
0         4   3  Mon May 11 03:17:40 UTC 2009  kindle2    tpryan   
1         4   4  Mon May 11 03:18:03 UTC 2009  kindle2    vcu451   
2         4   5  Mon May 11 03:18:54 UTC 2009  kindle2    chadfu   
3         4   6  Mon May 11 03:19:04 UTC 2009  kindle2     SIX15   
4         4   7  Mon May 11 03:21:41 UTC 2009  kindle2  yamarama   

                                                text       source sentiment  \
0  @stellargirl I loooooooovvvvvveee my Kindle2. ...  manual_test  positive   
1  Reading my kindle2...  Love it... Lee childs i...  manual_test  positive   
2  Ok, first assesment of the #kindle2 ...it fuck...  manual_test  positive   
3  @kenburbary You'll love your Kindle2. I've had...  manual_test  positive   
4  @mikefish  Fair enough. But i have the Kindle2...  manual_test  positive   

                                          clean_text  
0  loooooooovvvvvveee kindle2 dx cool 2 fantastic...  
1     

In [4]:
from sklearn.metrics import accuracy_score

# Score the regression as if it were a classifier: snap each continuous prediction
# to the nearest integer class id and compare against the encoded labels.
# y is the same label vector the model was fitted on in the previous cell.
highest_class_id = len(label_encoder.classes_) - 1
nearest_class_ids = y_pred.round().astype(int)

# Predictions that rounded outside [0, highest_class_id] are pulled back to the
# closest valid class id.
y_pred_rounded = nearest_class_ids.clip(min=0, max=highest_class_id)

# Fraction of samples whose rounded prediction equals the encoded label.
accuracy = accuracy_score(y, y_pred_rounded)

print(f"🎯 Approximate Classification Accuracy: {accuracy * 100:.2f}%")


🎯 Approximate Classification Accuracy: 68.42%
