In [2]:
# Step 4: The Sentiment Sorcery – Making Labels from Stars

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [3]:
# Reload and re-process the data from the previous steps so this notebook can
# be run from a fresh kernel. Rows the parser cannot read are skipped.
df = (
    pd.read_csv("Amazon_Reviews.csv", on_bad_lines='skip', engine='python')
    .rename(columns={
        'Review Text': 'review_text',
        'Rating': 'rating',
        'Date of Experience': 'date_of_experience',
        'Country': 'country',
    })
    # Keep only rows that have every field the analysis relies on ...
    .dropna(subset=['review_text', 'rating', 'date_of_experience'])
    # ... and remove exact duplicate rows.
    .drop_duplicates()
)

# Take the first run of digits in each rating and make it numeric.
# NOTE(review): the raw column presumably holds text containing the star
# count — confirm against the CSV. astype(str) keeps the .str accessor usable
# even if pandas inferred a numeric dtype for the column.
df['rating'] = pd.to_numeric(
    df['rating'].astype(str).str.extract(r'(\d+)', expand=False),
    errors='coerce',
)
df['date_of_experience'] = pd.to_datetime(df['date_of_experience'], errors='coerce')

# Ratings without digits and unparseable dates were coerced to NaN/NaT above;
# drop those rows so the integer cast below cannot raise on missing values.
df = df.dropna(subset=['rating', 'date_of_experience'])
df['rating'] = df['rating'].astype(int)

# Simple text features derived from the review body.
df['text_length'] = df['review_text'].str.len()
df['exclamation_count'] = df['review_text'].str.count('!')

# Age of each review in whole days, measured from the moment this cell runs
# (so the values change from one run to the next).
current_date = pd.Timestamp.now()
df['review_age_days'] = (current_date - df['date_of_experience']).dt.days

In [None]:
# --- Sentiment Sorcery: Mapping Ratings to Labels ---
# We define a function to categorize our star ratings into sentiment labels.
def categorize_sentiment(rating, positive_min=4, negative_max=2):
    """Map a numeric star rating to a sentiment label.

    Args:
        rating: Numeric star rating to classify.
        positive_min: Lowest rating that counts as 'positive' (default 4).
        negative_max: Highest rating that counts as 'negative' (default 2).

    Returns:
        'positive' when rating >= positive_min, 'negative' when
        rating <= negative_max, and 'neutral' otherwise.
    """
    # The positive check runs first, so it wins if the two thresholds overlap.
    if rating >= positive_min:
        return 'positive'
    if rating <= negative_max:
        return 'negative'
    # Anything that satisfies neither comparison lands here; that includes
    # NaN, because every comparison against NaN is False.
    return 'neutral'

In [None]:
# Label every review: run categorize_sentiment over each value of the
# 'rating' column with .apply() (introduced in Chapter 6), then store the
# resulting labels as a new 'sentiment' column.
sentiment_labels = df['rating'].apply(categorize_sentiment)
df['sentiment'] = sentiment_labels

In [None]:
# Show each rating next to the sentiment label derived from it, together with
# the review text, to sanity-check the mapping.
preview_columns = ['rating', 'sentiment', 'review_text']
print("--- The Sentiment Labels are Forged ---")
print(df[preview_columns])

--- The Sentiment Labels are Forged ---
       rating sentiment                                        review_text
0           1  negative  I registered on the website, tried to order a ...
1           1  negative  Had multiple orders one turned up and driver h...
2           1  negative  I informed these reprobates that I WOULD NOT B...
3           1  negative  I have bought from Amazon before and no proble...
4           1  negative  If I could give a lower rate I would! I cancel...
...       ...       ...                                                ...
21209       5  positive  I have had perfect order fulfillment, and fast...
21210       5  positive  I have had perfect order fulfillment, and fast...
21211       3   neutral  I always find myself going back to amazon beco...
21212       5  positive  I have placed an abundance of orders with Amaz...
21213       4  positive  those goods i've ordered by Amazon.com, have b...

[20947 rows x 3 columns]


In [None]:
# Count how many reviews fall under each sentiment label.
sentiment_counts = df['sentiment'].value_counts()

print("\n--- Distribution of New Sentiment Labels ---")
print(sentiment_counts)



--- Distribution of New Sentiment Labels ---
sentiment
negative    14347
positive     5726
neutral       874
Name: count, dtype: int64
