In [17]:
# Step 1: Polishing the Gems – Data Cleaning & Preprocessing
import pandas as pd

# Read the raw CSV artifact into the working DataFrame.
# The Python parsing engine is used, and on_bad_lines='skip' makes the reader
# skip malformed lines instead of raising on them.
df = pd.read_csv(
    "Amazon_Reviews.csv",
    engine='python',
    on_bad_lines='skip',
)
print(
    "--- Raw Data Loaded ---",
    f"Initial shape of the DataFrame: {df.shape}",
    sep="\n",
)

--- Raw Data Loaded ---
Initial shape of the DataFrame: (21214, 9)


In [18]:
# Give the two columns we work with most short, snake_case names.
# Consistent naming is a good practice, as we learned in Chapter 5.
df = df.rename(
    columns={
        'Review Text': 'review_text',
        'Rating': 'rating',
    }
)

In [19]:
# 1. Handling Missing Values (The Restoration Technique)
# Keep only rows that have both a review text and a rating;
# a review missing either one is of no use for the analysis.
df = df.dropna(subset=['review_text', 'rating'])
print(
    "\n--- After dropping rows with missing text or rating ---",
    f"Shape of the DataFrame: {df.shape}",
    sep="\n",
)


--- After dropping rows with missing text or rating ---
Shape of the DataFrame: (21055, 9)


In [20]:
# 2. Eliminating Duplicates (The Uniqueness Filter)
# Remove rows that are exact copies of an earlier row (all columns equal).
# A review counted twice would skew both the analysis and any training.
df = df.drop_duplicates()
print(
    "\n--- After removing duplicate rows ---",
    f"Shape of the DataFrame: {df.shape}",
    sep="\n",
)


--- After removing duplicate rows ---
Shape of the DataFrame: (21055, 9)


In [None]:
# 3. Data Type Conversion (The Elemental Transformation)
# Extract the first run of digits from each rating and store it as an integer.
#
# - Casting to str first keeps this cell re-runnable: once 'rating' has already
#   been converted to integers, the .str accessor would raise AttributeError.
# - expand=False makes str.extract() return a Series instead of a one-column
#   DataFrame, which is what a single-column assignment expects.
rating_digits = df['rating'].astype(str).str.extract(r'(\d+)', expand=False)

# A rating with no digits yields NaN, which astype(int) rejects with an opaque
# cast error. Fail early with a message that points at the offending data.
unparsed = rating_digits.isna()
if unparsed.any():
    raise ValueError(
        f"{int(unparsed.sum())} rating value(s) contain no digits, "
        f"e.g. {df.loc[unparsed, 'rating'].iloc[0]!r}"
    )

df['rating'] = rating_digits.astype(int)
print("\n--- After cleaning and converting 'rating' column ---")
print(df['rating'])


--- After cleaning and converting 'rating' column ---
0        1
1        1
2        1
3        1
4        1
        ..
21209    5
21210    5
21211    3
21212    5
21213    4
Name: rating, Length: 21055, dtype: int64


In [None]:
# Parse 'Date of Experience' into a new datetime column, 'date_of_experience'.
# Proper datetimes are crucial for the time-based analysis in later steps (Chapter 6 & 8).
# With errors='coerce', any value that cannot be parsed becomes NaT (Not a Time);
# the rows holding those NaT values are then dropped.
df = df.assign(
    date_of_experience=pd.to_datetime(df['Date of Experience'], errors='coerce')
).dropna(subset=['date_of_experience'])

print(
    "\n--- After converting date and cleaning ---",
    f"Shape of the DataFrame: {df.shape}",
    sep="\n",
)


--- After converting date and cleaning ---
Shape of the DataFrame: (20947, 10)


In [23]:
# Preview the first rows of the cleaned DataFrame.
print("\n--- The Polished Artifacts ---")
print(df.head())
print("\n--- Final Data Report ---")
# The .info() report now shows us the converted data types.
# DataFrame.info() writes the report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()


--- The Polished Artifacts ---
      Reviewer Name                     Profile Link Country Review Count  \
0        Eugene ath  /users/66e8185ff1598352d6b3701a      US     1 review   
1  Daniel ohalloran  /users/5d75e460200c1f6a6373648c      GB    9 reviews   
2          p fisher  /users/546cfcf1000064000197b88f      GB   90 reviews   
3         Greg Dunn  /users/62c35cdbacc0ea0012ccaffa      AU    5 reviews   
4     Sheila Hannah  /users/5ddbe429478d88251550610e      GB    8 reviews   

                Review Date  rating  \
0  2024-09-16T13:44:26.000Z       1   
1  2024-09-16T18:26:46.000Z       1   
2  2024-09-16T21:47:39.000Z       1   
3  2024-09-17T07:15:49.000Z       1   
4  2024-09-16T18:37:17.000Z       1   

                                      Review Title  \
0       A Store That Doesn't Want to Sell Anything   
1           Had multiple orders one turned up and…   
2                      I informed these reprobates   
3  Advertise one price then increase it on website   
