In [None]:
# Step 2: Forging New Compounds – Data Transformation & Feature Engineering
import pandas as pd
import numpy as np


In [None]:
# Reload the raw reviews so this step starts from a clean slate, then clean them
# in a single chain (no in-place mutation, so the cell is safe to re-run):
#   1. rename the columns used below to snake_case,
#   2. drop rows missing the review text, rating or experience date,
#   3. drop exact duplicate rows,
#   4. parse the rating (first run of digits) and the experience date,
#   5. drop rows where either value could not be parsed.
# Note: on_bad_lines='skip' silently discards malformed CSV lines.
# NOTE(review): assumes 'Rating' is read as text containing the score as its
# first number — confirm against the raw file.
df = (
    pd.read_csv("Amazon_Reviews.csv", on_bad_lines='skip', engine='python')
    .rename(columns={'Review Text': 'review_text', 'Rating': 'rating',
                     'Date of Experience': 'date_of_experience'})
    .dropna(subset=['review_text', 'rating', 'date_of_experience'])
    .drop_duplicates()
    .assign(
        # expand=False yields a Series (the default is a one-column DataFrame).
        # Ratings with no digits become NaN here instead of crashing the int
        # cast further down, mirroring how unparseable dates are handled.
        rating=lambda d: pd.to_numeric(
            d['rating'].str.extract(r'(\d+)', expand=False), errors='coerce'),
        date_of_experience=lambda d: pd.to_datetime(
            d['date_of_experience'], errors='coerce'),
    )
    .dropna(subset=['rating', 'date_of_experience'])
    .astype({'rating': int})
)

In [None]:
# Feature 1: 'text_length' — the number of characters in each review.
# The vectorized `.str` accessor computes this for the whole column at once,
# with no Python-level loop.
df = df.assign(text_length=lambda d: d['review_text'].str.len())

In [6]:
# Feature 2: 'exclamation_count' — how many '!' characters each review contains,
# used here as a simple proxy for emotional intensity.
# `.str.count` treats its argument as a regular expression; '!' has no special
# meaning in a regex, so it is matched literally.
df = df.assign(exclamation_count=lambda d: d['review_text'].str.count('!'))


In [None]:
# Feature 3: 'review_age_days' — whole days between the experience date and a
# reference date.
# By default the reference is the moment this cell runs, so the values grow on
# every re-run. Set REFERENCE_DATE (e.g. "2025-01-01") to pin it and make the
# feature reproducible.
REFERENCE_DATE = None
current_date = (pd.Timestamp.now() if REFERENCE_DATE is None
                else pd.Timestamp(REFERENCE_DATE))
# Timestamp minus a datetime column yields timedeltas; `.dt.days` keeps only
# the whole-day component.
df['review_age_days'] = (current_date - df['date_of_experience']).dt.days


In [None]:
# Preview the first rows of the engineered features next to the columns they
# were derived from.
print("--- The Newly Forged Compounds ---")
preview_columns = ['review_text', 'rating', 'text_length',
                   'exclamation_count', 'review_age_days']
print(df.loc[:, preview_columns].head())

--- The Newly Forged Compounds ---
                                         review_text  rating  text_length  \
0  I registered on the website, tried to order a ...       1          590   
1  Had multiple orders one turned up and driver h...       1          293   
2  I informed these reprobates that I WOULD NOT B...       1          611   
3  I have bought from Amazon before and no proble...       1          450   
4  If I could give a lower rate I would! I cancel...       1          538   

   exclamation_count  review_age_days  
0                  0              328  
1                  0              328  
2                  3              328  
3                  0              327  
4                  5              328  


In [None]:
# Structural report of the frame: columns, non-null counts, dtypes and memory.
print("\n--- Final Data Report with New Features ---")
# DataFrame.info() writes the report to stdout itself and returns None, so it
# must not be wrapped in print() — that would append a stray "None" line.
df.info()


--- Final Data Report with New Features ---
<class 'pandas.core.frame.DataFrame'>
Index: 20947 entries, 0 to 21213
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   Reviewer Name       20947 non-null  object        
 1   Profile Link        20947 non-null  object        
 2   Country             20946 non-null  object        
 3   Review Count        20947 non-null  object        
 4   Review Date         20947 non-null  object        
 5   rating              20947 non-null  int64         
 6   Review Title        20947 non-null  object        
 7   review_text         20947 non-null  object        
 8   date_of_experience  20947 non-null  datetime64[ns]
 9   text_length         20947 non-null  int64         
 10  exclamation_count   20947 non-null  int64         
 11  review_age_days     20947 non-null  int64         
dtypes: datetime64[ns](1), int64(4), object(7)
memory usage: 2.1+ M

In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for the three
# engineered features.
print("\n--- Descriptive Statistics of New Features ---")
new_feature_columns = ['text_length', 'exclamation_count', 'review_age_days']
feature_summary = df[new_feature_columns].describe()
print(feature_summary)



--- Descriptive Statistics of New Features ---
        text_length  exclamation_count  review_age_days
count  20947.000000       20947.000000     20947.000000
mean     462.690409           0.849095      1711.958228
std      517.244005           2.444339      1190.704878
min       10.000000           0.000000       327.000000
25%      148.000000           0.000000       821.000000
50%      317.000000           0.000000      1471.000000
75%      585.500000           1.000000      2144.500000
max     9951.000000          52.000000      6558.000000
