# Sentiment Analysis on Amazon Fine Food Reviews

This notebook performs sentiment analysis using machine learning techniques on the Amazon Fine Food Reviews dataset.

**Key Steps:**
- Load and clean the dataset
- Perform text preprocessing
- Convert text to TF-IDF features
- Split the data into training and test sets
- Train a LightGBM classifier
- Evaluate model performance
- Visualize results


In [None]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from lightgbm import LGBMClassifier
import joblib

# Download necessary NLTK resources
# 'stopwords' backs the stopwords corpus imported above; 'wordnet' and
# 'omw-1.4' are fetched for the WordNet lemmatizer (presumably both are
# needed by the installed NLTK version -- verify).
# nltk.download is called once per resource, in this order.
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')


In [None]:
# Step 2: Load and clean the dataset
file_path = 'Reviews.csv'  # Update this if the file is in a different location
df_raw = pd.read_csv(file_path)

# Keep only the rating and the review body; drop rows where either is missing.
reviews = df_raw[['Score', 'Text']].dropna()

# Convert score to binary sentiment: 1 for positive (4, 5), 0 for negative (1, 2).
# Neutral 3-star reviews are discarded.
#
# The whole transformation is a single chain that builds a new frame from
# `reviews`. No column is ever assigned onto a filtered frame (the pattern that
# can raise pandas' SettingWithCopyWarning), the label is computed with a
# vectorized comparison instead of a per-row Python lambda, and re-running the
# cell always starts from the untouched `df_raw`.
df = (
    reviews.loc[reviews['Score'] != 3]
    .assign(label=lambda frame: (frame['Score'] > 3).astype('int64'))
    [['Text', 'label']]
)


In [None]:
# Step 3: Text Preprocessing
# Shared, module-level resources that clean_text consults for every review:
# a lemmatizer instance and the English stop-word list held as a set so that
# membership checks are hash lookups.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

# Built once at definition time so the per-review work in clean_text stays cheap.
_HTML_TAG_RE = re.compile(r'<.*?>')
_DIGITS_RE = re.compile(r'\d+')
_STRIP_PUNCTUATION = str.maketrans('', '', string.punctuation)


def clean_text(text, stop_word_set=None, word_lemmatizer=None):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps, in order: lowercase, replace HTML tags with a space, delete ASCII
    punctuation, delete digit runs, split on whitespace, drop stop words and
    lemmatize the remaining tokens.

    Parameters
    ----------
    text : str
        Raw review text. Any non-string value (e.g. None or NaN) yields ''.
    stop_word_set : collection of str, optional
        Tokens to discard. Defaults to the module-level ``stop_words``.
    word_lemmatizer : object with ``lemmatize(word)``, optional
        Lemmatizer applied to each kept token. Defaults to the module-level
        ``lemmatizer``.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    # Missing or non-text values would otherwise fail on .lower().
    if not isinstance(text, str):
        return ''

    # Fall back to the notebook-wide resources when nothing is injected.
    if stop_word_set is None:
        stop_word_set = stop_words
    if word_lemmatizer is None:
        word_lemmatizer = lemmatizer

    text = text.lower()
    # Tags become a space, not '': markup such as "<br />" usually sits between
    # sentences, and deleting it outright glues the neighbouring words together
    # ("taste.<br />i" -> "tastei").
    text = _HTML_TAG_RE.sub(' ', text)
    text = text.translate(_STRIP_PUNCTUATION)
    text = _DIGITS_RE.sub('', text)

    return ' '.join(
        word_lemmatizer.lemmatize(word)
        for word in text.split()
        if word not in stop_word_set
    )

# Run clean_text over every review body and keep the result in a new column
# next to the original text.
df['cleaned_text'] = df['Text'].map(clean_text)


In [None]:
# Step 4: TF-IDF Vectorization
cleaned_reviews = df['cleaned_text'].fillna('')
y = df['label']

# Learn the vocabulary and IDF weights from the training rows only, so that
# nothing about the held-out reviews leaks into the features. Row positions are
# partitioned here with the same arguments Step 5 passes to train_test_split
# (test_size=0.2, stratify=y, random_state=42), which reproduces the same
# partition. Keep the two calls in sync if either is changed.
train_positions, _ = train_test_split(
    np.arange(len(df)), test_size=0.2, stratify=y, random_state=42
)
vectorizer = TfidfVectorizer(max_features=5000)
vectorizer.fit(cleaned_reviews.iloc[train_positions])

# Transform every review with the train-fitted vectorizer; Step 5 then slices
# X and y into the train and test partitions.
X = vectorizer.transform(cleaned_reviews)

# Save vectorizer
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')


In [None]:
# Step 5: Train-Test Split
# Hold out 20% of the rows for evaluation, stratified on y so both partitions
# keep the same class proportions; the fixed seed makes the split repeatable.
split_options = dict(test_size=0.2, stratify=y, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [None]:
# Step 6: Model Training with LightGBM
# Build the classifier with library-default hyperparameters and train it on
# the training partition; fit() hands back the fitted estimator, so
# construction and training are chained.
model = LGBMClassifier().fit(X_train, y_train)

# Save model
model_path = 'lgbm_model.pkl'
joblib.dump(model, model_path)


In [None]:
# Step 7: Evaluation
# Predict on the held-out partition, then print each metric under its heading.
y_pred = model.predict(X_test)

report_sections = (
    ("Confusion Matrix:", confusion_matrix(y_test, y_pred)),
    ("\nClassification Report:", classification_report(y_test, y_pred)),
)
for heading, body in report_sections:
    print(heading)
    print(body)

print("\nAccuracy Score:", accuracy_score(y_test, y_pred))


In [None]:
# Step 8: Visualization
# Draw the test-set confusion matrix as an annotated heatmap on an explicit
# figure/axes pair.
cm = confusion_matrix(y_test, y_pred)
class_names = ['Negative', 'Positive']

fig, ax = plt.subplots(figsize=(6, 4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='plasma',
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_title('Confusion Matrix')
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
plt.show()
