In [3]:
# Step 5: The Predictive Enchantment – Vectorizing and Training the Classifier

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

In [4]:
def categorize_sentiment(rating):
    if rating >= 4:
        return 'positive'
    elif rating <= 2:
        return 'negative'
    else:
        return 'neutral'

df = pd.read_csv("Amazon_Reviews.csv", on_bad_lines='skip', engine='python')
df.rename(columns={'Review Text': 'review_text', 'Rating': 'rating', 'Date of Experience': 'date_of_experience'}, inplace=True)
df.dropna(subset=['review_text', 'rating', 'date_of_experience'], inplace=True)
df.drop_duplicates(inplace=True)
df['rating'] = df['rating'].str.extract(r'(\d+)')
df['rating'] = df['rating'].astype(int)
df['date_of_experience'] = pd.to_datetime(df['date_of_experience'], errors='coerce')
df.dropna(subset=['date_of_experience'], inplace=True)
df['text_length'] = df['review_text'].str.len()
df['exclamation_count'] = df['review_text'].str.count('!')
current_date = pd.Timestamp.now()
df['review_age_days'] = (current_date - df['date_of_experience']).dt.days
df['sentiment'] = df['rating'].apply(categorize_sentiment)

In [None]:
# 2. Separate our features (X) and our target (y)
X = df['review_text']
y = df['sentiment']

In [None]:
# 3. Perform a train/test split, ensuring our sentiment classes are balanced.
# We'll set aside 20% of our data for testing.

X_train, X_test, y_train, y_test = \
train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
print("--- Data Split Report ---")
print(f"Training set size: {len(X_train)}")
print(f"Testing set size: {len(X_test)}")

print("\nTrain sentiment distribution:\n", 
      y_train.value_counts(normalize=True))

print("\nTest sentiment distribution:\n", 
      y_test.value_counts(normalize=True))


--- Data Split Report ---
Training set size: 16757
Testing set size: 4190

Train sentiment distribution:
 sentiment
negative    0.684908
positive    0.273378
neutral     0.041714
Name: proportion, dtype: float64

Test sentiment distribution:
 sentiment
negative    0.684964
positive    0.273270
neutral     0.041766
Name: proportion, dtype: float64


In [None]:
# 4. Build our powerful Machine Learning Pipeline.
# This chains the Vectorizer and the Classifier together in one object.

sentiment_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(max_iter=1000, solver='liblinear'))
])
print("\n--- The Predictive Pipeline is Assembled ---")
print(sentiment_pipeline)



--- The Predictive Pipeline is Assembled ---
Pipeline(steps=[('vectorizer', TfidfVectorizer()),
                ('classifier',
                 LogisticRegression(max_iter=1000, solver='liblinear'))])


In [None]:
# 5. Train the model by fitting the pipeline to our training data.
print("\n--- The Ritual Begins: Training the Model ---")
sentiment_pipeline.fit(X_train, y_train)
print("Training complete! The model has learned to predict sentiment.")



--- The Ritual Begins: Training the Model ---
Training complete! The model has learned to predict sentiment.


In [None]:
# 6. Make predictions on our unseen test data.

y_pred = sentiment_pipeline.predict(X_test)
print("\n--- The First Predictions ---")
print("First 5 actual labels:", list(y_test.iloc[:5]))
print("First 5 predicted labels:", list(y_pred[:5]))



--- The First Predictions ---
First 5 actual labels: ['negative', 'positive', 'negative', 'negative', 'negative']
First 5 predicted labels: ['negative', 'positive', 'negative', 'negative', 'negative']
