In [1]:
import pandas as pd


# 1. Load dataset
df = pd.read_csv("Sephora Reviews - 2021 and 2022.csv")

  df = pd.read_csv("Sephora Reviews - 2021 and 2022.csv")


In [2]:
# 2. Standardize column names
df = df.rename(columns={
    'review_text': 'text',
    'rating': 'stars',
    'product_id': 'pid',
    'author_id': 'uid',
    'review_title': 'title'
})

In [3]:
# 3. Handle missing values
df = df.dropna(subset=['text', 'stars', 'pid', 'uid', 'title', 'product_name', 'brand_name'])
df = df[df['text'].str.strip() != ""]
df = df[df['stars'].between(1, 5)]
print(df.head(5))

   Unnamed: 0          uid  stars  is_recommended  helpfulness  \
0         450   1796832802      3               0     0.714286   
1         451  10255886039      2               0     0.500000   
2         452   5049431408      5               1     1.000000   
3         453   7052423388      4               1     0.333333   
4         454  23120633832      5               1     0.500000   

   total_feedback_count  total_neg_feedback_count  total_pos_feedback_count  \
0                     7                         2                         5   
1                     2                         1                         1   
2                     4                         0                         4   
3                     3                         2                         1   
4                     2                         1                         1   

       submission_time                                               text  \
0  2022-12-31 00:00:00  I decided to try this as I 

In [4]:
import re

# 4. Text cleaning
def clean_text(text):
    text = text.lower()                                 # lowercase
    text = re.sub(r"http\S+|www\S+", " ", text)         # remove urls
    text = re.sub(r"[^a-z\s]", " ", text)               # keep letters only
    text = re.sub(r"\s+", " ", text).strip()            # normalize whitespace
    return text

df['clean_text'] = df['text'].astype(str).apply(clean_text)


In [6]:
df = df[['stars', 'text', 'pid', 'uid', 'product_name', 'brand_name']]
print(df.head(5))

   stars                                               text      pid  \
0      3  I decided to try this as I had never used a li...  P420652   
1      2  Might as well be putting on Vaseline because I...  P420652   
2      5  Love this lip mask nothing else will do Aquaph...  P420652   
3      4  I really enjoy this product! Amazing smell and...  P420652   
4      5  The best. I have nothing else to really say ot...  P420652   

           uid                                       product_name brand_name  
0   1796832802  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE  
1  10255886039  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE  
2   5049431408  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE  
3   7052423388  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE  
4  23120633832  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE  


In [7]:
#Map stars to sentiment
def star_to_sentiment(s):
    if s >= 4:
        return "positive"
    elif s <= 2:
        return "negative"
    else:
        return "neutral"

df['sentiment'] = df['stars'].apply(star_to_sentiment)

In [8]:
#sentiment
label_map = {"negative": 0, "positive": 1, "neutral": -1}
df['label'] = df['sentiment'].map(label_map)

In [9]:
#Reset index
df = df.reset_index(drop=True)

In [11]:
#clean dataset
print(df.head())
print(f"clean shape: {df.shape}")

   stars                                               text      pid  \
0      3  I decided to try this as I had never used a li...  P420652   
1      2  Might as well be putting on Vaseline because I...  P420652   
2      5  Love this lip mask nothing else will do Aquaph...  P420652   
3      4  I really enjoy this product! Amazing smell and...  P420652   
4      5  The best. I have nothing else to really say ot...  P420652   

           uid                                       product_name brand_name  \
0   1796832802  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE   
1  10255886039  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE   
2   5049431408  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE   
3   7052423388  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE   
4  23120633832  Lip Sleeping Mask Intense Hydration with Vitam...    LANEIGE   

  sentiment  label  
0   neutral     -1  
1  negative      0  
2  positive      1  
3 

In [12]:
# Save cleaned file
df.to_csv("sephora_reviews_clean_siti.csv", index=False)