## 📦 Dataset

This project uses and transforms the Kaggle dataset:

**Source**: [Sephora Products and Skincare Reviews by nadyinky](https://www.kaggle.com/datasets/nadyinky/sephora-products-and-skincare-reviews)  
**License**: [Creative Commons Attribution 4.0 (CC BY 4.0)](https://creativecommons.org/licenses/by/4.0/)  



In [1]:
import pandas as pd

In [2]:
reviews_0_250 = pd.read_csv(r"F:\My_CSV_\final_test_1\reviews_0-250.csv", low_memory=False)
reviews_250_500 = pd.read_csv(r"F:\My_CSV_\final_test_1\reviews_250-500.csv", low_memory=False)
reviews_500_750 = pd.read_csv(r"F:\My_CSV_\final_test_1\reviews_500-750.csv", low_memory=False)
reviews_750_1250 = pd.read_csv(r"F:\My_CSV_\final_test_1\reviews_750-1250.csv", low_memory=False)
reviews_1250_end = pd.read_csv(r"F:\My_CSV_\final_test_1\reviews_1250-end.csv", low_memory=False)

In [3]:
reviews_0_250_c = reviews_0_250.copy()
reviews_250_500_c =reviews_250_500.copy()
reviews_500_750_c =reviews_500_750.copy()
reviews_750_1250_c = reviews_750_1250.copy()
reviews_1250_end_C =reviews_1250_end.copy()

## Data Cleaning: Dropping Unnecessary Columns, Creating full_review, and Removing Duplicates
In this section, I dropped some columns, created a new column full_review by combining review_text and review_title, then cleaned the data and removed duplicate entries.

In [4]:
def clean_and_preprocess_reviews(df):
    import re

    # Step 1: Drop irrelevant columns
    columns_to_drop_initial = [
        'Unnamed: 0', 'submission_time', 'total_feedback_count',
        'total_pos_feedback_count', 'total_neg_feedback_count',
        'product_name', 'author_id', 'helpfulness', 'is_recommended',
        'brand_name', 'price_usd', 'product_id',
        # Drop weak or noisy features
        'eye_color', 'hair_color'
    ]
    df = df.drop(columns=columns_to_drop_initial, errors='ignore')

    # Step 2: Fill missing review_title and review_text
    df['review_title'] = df['review_title'].fillna('')
    df['review_text'] = df['review_text'].fillna('')

    # Step 3: Combine into full_review
    df['full_review'] = df['review_title'] + ' ' + df['review_text']

    # Step 4: Remove empty full_review rows
    df = df[df['full_review'].str.strip() != '']

    # Step 5: Drop original title/text
    df = df.drop(columns=['review_title', 'review_text'], errors='ignore')

    # Step 6: Clean full_review text
    def clean_text_bert(text):
        text = re.sub(r'\s+', ' ', text)
        return text.strip()

    df['full_review'] = df['full_review'].apply(clean_text_bert)

    # 🔥 Step 7: Drop duplicate full_review texts
    df = df.drop_duplicates(subset='full_review')

    return df


reviews_0_250_c = clean_and_preprocess_reviews(reviews_0_250.copy())
reviews_250_500_c = clean_and_preprocess_reviews(reviews_250_500.copy())
reviews_500_750_c = clean_and_preprocess_reviews(reviews_500_750.copy())
reviews_750_1250_c = clean_and_preprocess_reviews(reviews_750_1250.copy())
reviews_1250_end_C = clean_and_preprocess_reviews(reviews_1250_end.copy())


In [81]:
reviews_1250_end_C.columns

Index(['rating', 'skin_tone', 'skin_type', 'full_review'], dtype='object')

In [5]:
reviews_0_250_clean = reviews_0_250_c.copy()
reviews_250_500_clean =reviews_250_500_c.copy()
reviews_500_750_clean =reviews_500_750_c.copy()
reviews_750_1250_clean = reviews_750_1250_c.copy()
reviews_1250_end_Clean =reviews_1250_end_C.copy()

## Sampling and Concatenating Data
Since we had a large dataset, I decided to randomly select 2000 samples and concatenate them together for further analysis.

In [6]:
import pandas as pd
import warnings

# Suppress the specific DeprecationWarning about groupby.apply
warnings.filterwarnings(
    "ignore",
    category=DeprecationWarning,
    message="DataFrameGroupBy.apply operated on the grouping columns"
)

# Sample 2000 reviews from each DataFrame (with random_state for reproducibility)
sample_0_250 = reviews_0_250_clean.sample(n=2000, random_state=42)
sample_250_500 = reviews_250_500_clean.sample(n=2000, random_state=42)
sample_500_750 = reviews_500_750_clean.sample(n=2000, random_state=42)
sample_750_1250 = reviews_750_1250_clean.sample(n=2000, random_state=42)
sample_1250_end = reviews_1250_end_Clean.sample(n=2000, random_state=42)

# Combine all cleaned datasets
all_reviews = pd.concat([
    reviews_0_250_clean,
    reviews_250_500_clean,
    reviews_500_750_clean,
    reviews_750_1250_clean,
    reviews_1250_end_Clean
], ignore_index=True)

def sample_with_rating(df):
    sampled = df.sample(n=min(len(df), 2000), random_state=42)
    sampled['rating'] = df.name  # df.name is the group key (rating)
    return sampled

stratified_sample = (
    all_reviews
    .groupby('rating', group_keys=False)
    .apply(sample_with_rating)  # warning suppressed above, no changes here
    .reset_index(drop=True)
)

# Verify results
print("Final balanced dataset shape:", stratified_sample.shape)
print("\nPerfect class distribution:")
print(stratified_sample['rating'].value_counts(normalize=True))


Final balanced dataset shape: (10000, 4)

Perfect class distribution:
rating
1    0.2
2    0.2
3    0.2
4    0.2
5    0.2
Name: proportion, dtype: float64


In [7]:
X = stratified_sample


In [8]:
y = X.pop('rating')
#y.unique()

In [9]:
y = y -1

In [10]:
y.unique()

array([0, 1, 2, 3, 4])

In [11]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1234)

## Preprocessing Pipelines Setup
I created separate pipelines for text and categorical data: the text pipeline uses TfidfVectorizer with unigram and bigram features, while the categorical pipeline handles missing values and applies one-hot encoding. These pipelines are then combined using make_column_transformer for streamlined preprocessing.

In [12]:
from sklearn.pipeline import make_pipeline
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Correct text pipeline - remove 'sparse' parameter
text_pipe = make_pipeline(
    TfidfVectorizer(
        max_features=5000,       # Increased from 100
        stop_words='english',
        ngram_range=(1, 2),      # Using both unigrams and bigrams
        min_df=5,                # Ignore terms appearing in <5 docs
        max_df=0.7               # Ignore terms in >70% of docs
    )
    # No sparse parameter needed - it's always sparse by default
)

# Categorical pipeline
cat_pipe = make_pipeline(
    SimpleImputer(strategy='constant', fill_value='N_A'),
    OneHotEncoder(handle_unknown='ignore', sparse_output=True)  # Correct parameter name
)

# Combine pipelines
preprocessor = make_column_transformer(
    (cat_pipe, ['skin_type', 'skin_tone']),  # Example categorical columns
    (text_pipe, "full_review"),             # Text column
    remainder='drop'
)


## Model Training with XGBoost and Feature Selection
For the regression task, I set up a pipeline combining preprocessing, Recursive Feature Elimination (RFE), and an XGBoost regressor. Hyperparameters were tuned using GridSearchCV with a custom RMSE scorer to optimize model performance.

In [13]:
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.feature_selection import RFE
from xgboost import XGBRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer



# Custom RMSE scorer for GridSearchCV (note: GridSearchCV expects higher score = better, so we negate RMSE)
rmse_scorer = make_scorer(lambda y_true, y_pred: -np.sqrt(mean_squared_error(y_true, y_pred)))



xgb_param_grid = {
    "rfe__estimator__n_estimators": [197],
    "rfe__estimator__max_depth": [4],
    "rfe__estimator__learning_rate": [0.059362936390648706],
    "rfe__estimator__reg_alpha": [0.3243563176277785],
    "rfe__estimator__reg_lambda": [5.0229339349076065],
    "rfe__n_features_to_select": [49]
}


pipe = make_pipeline(
    preprocessor,
    RFE(estimator=XGBRegressor(random_state=42, verbosity=0), step=0.2)
)


xgb_search = GridSearchCV(
    estimator=pipe,
    param_grid=xgb_param_grid,
    scoring=rmse_scorer,
    refit=True,
    cv=2,
    n_jobs=-1,
    verbose=1
)


xgb_search.fit(X_train, y_train)



print("Best parameters:", xgb_search.best_params_)
print("Best RMSE (negative):", xgb_search.best_score_)


Fitting 2 folds for each of 1 candidates, totalling 2 fits
Best parameters: {'rfe__estimator__learning_rate': 0.059362936390648706, 'rfe__estimator__max_depth': 4, 'rfe__estimator__n_estimators': 197, 'rfe__estimator__reg_alpha': 0.3243563176277785, 'rfe__estimator__reg_lambda': 5.0229339349076065, 'rfe__n_features_to_select': 49}
Best RMSE (negative): -1.0877509121322375


In [14]:
from sklearn.metrics import mean_squared_error
import numpy as np

# ----------------------------
# 1. Train RMSE
# ----------------------------
train_preds = xgb_search.predict(X_train)
rmse_train = np.sqrt(mean_squared_error(y_train, train_preds))
print(f"Train RMSE: {rmse_train:.4f}")

# ----------------------------
# 2. Validation RMSE (from CV)
# ----------------------------
# Since we used negative RMSE as the scoring function
rmse_val = -xgb_search.best_score_
print(f"Validation RMSE (CV): {rmse_val:.4f}")

# ----------------------------
# 3. Test RMSE (on unseen data)
# ----------------------------
test_preds = xgb_search.predict(X_test)
rmse_test = np.sqrt(mean_squared_error(y_test, test_preds))
print(f"Test RMSE: {rmse_test:.4f}")


Train RMSE: 1.0090
Validation RMSE (CV): 1.0878
Test RMSE: 1.0586


## Comparing Classification Models
After experimenting with RandomForestClassifier, XGBClassifier, and LogisticRegression for our classification task (predicting ratings from 1 to 5), we evaluated their performance to determine the best model for this problem.

In [127]:
from xgboost import XGBClassifier  # Use classifier
from sklearn.metrics import make_scorer, f1_score

# Define your preprocessor (missing in your snippet)
# Example: preprocessor = ColumnTransformer([...])

# Define a proper scorer
f1_macro_scorer = make_scorer(f1_score, average='macro')

pipe = make_pipeline(
    preprocessor,
    RFE(estimator=XGBClassifier(random_state=42, use_label_encoder=False, eval_metric='logloss'), step=0.2)
)

xgb_param_grid = {
    "rfe__estimator__n_estimators": [197],
    "rfe__estimator__max_depth": [4],
    "rfe__estimator__learning_rate": [0.059362936390648706],
    "rfe__estimator__reg_alpha": [0.3243563176277785],
    "rfe__estimator__reg_lambda": [5.0229339349076065],
    "rfe__n_features_to_select": [49]
}

xgb_search = GridSearchCV(
    estimator=pipe,
    param_grid=xgb_param_grid,
    scoring=f1_macro_scorer,
    refit=True,
    cv=3,
    n_jobs=-1,
    verbose=1
)

xgb_search.fit(X_train, y_train)
y_pred = xgb_search.predict(X_test)

print("Best parameters:", xgb_search.best_params_)
print("Best macro F1 score:", xgb_search.best_score_)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, digits=3))


Fitting 3 folds for each of 1 candidates, totalling 3 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Best parameters: {'rfe__estimator__learning_rate': 0.059362936390648706, 'rfe__estimator__max_depth': 4, 'rfe__estimator__n_estimators': 197, 'rfe__estimator__reg_alpha': 0.3243563176277785, 'rfe__estimator__reg_lambda': 5.0229339349076065, 'rfe__n_features_to_select': 49}
Best macro F1 score: 0.4145899583107882

Classification Report:

              precision    recall  f1-score   support

           0      0.441     0.624     0.517       426
           1      0.384     0.264     0.313       394
           2      0.431     0.291     0.347       409
           3      0.400     0.415     0.408       390
           4      0.551     0.643     0.593       381

    accuracy                          0.448      2000
   macro avg      0.441     0.448     0.436      2000
weighted avg      0.441     0.448     0.435      2000



In [128]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import make_scorer, f1_score
from sklearn.pipeline import make_pipeline
from sklearn.feature_selection import RFE
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# Define your preprocessor (make sure this is defined somewhere in your code)
# e.g., preprocessor = ColumnTransformer([...])

# Define the scoring metric
f1_macro_scorer = make_scorer(f1_score, average='macro')

pipe = make_pipeline(
    preprocessor,
    RFE(estimator=RandomForestClassifier(random_state=42), step=0.2)
)

# Parameter grid for RandomForest inside RFE
rf_param_grid = {
    "rfe__estimator__n_estimators": [197],
    "rfe__estimator__max_depth": [4],
    # RandomForest doesn't have learning_rate, reg_alpha, reg_lambda
    "rfe__n_features_to_select": [49]
}

rf_search = GridSearchCV(
    estimator=pipe,
    param_grid=rf_param_grid,
    scoring=f1_macro_scorer,
    refit=True,
    cv=3,
    n_jobs=-1,
    verbose=1
)

rf_search.fit(X_train, y_train)
y_pred = rf_search.predict(X_test)

print("Best parameters:", rf_search.best_params_)
print("Best macro F1 score:", rf_search.best_score_)
print("\nClassification Report:\n")
print(classification_report(y_test, y_pred, digits=3))


Fitting 3 folds for each of 1 candidates, totalling 3 fits
Best parameters: {'rfe__estimator__max_depth': 4, 'rfe__estimator__n_estimators': 197, 'rfe__n_features_to_select': 49}
Best macro F1 score: 0.36130793661590616

Classification Report:

              precision    recall  f1-score   support

           0      0.368     0.646     0.468       426
           1      0.363     0.277     0.314       394
           2      0.429     0.264     0.327       409
           3      0.398     0.231     0.292       390
           4      0.489     0.609     0.543       381

    accuracy                          0.407      2000
   macro avg      0.409     0.405     0.389      2000
weighted avg      0.408     0.407     0.389      2000



In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, f1_score, accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import SelectKBest, f_classif

# Assume `preprocessor`, `X_train`, `y_train`, `X_test`, `y_test` are already defined

# Define scoring metric
f1_macro_scorer = make_scorer(f1_score, average='macro')

# Define pipeline with preprocessing and univariate feature selection
pipe = make_pipeline(
    preprocessor,
    SelectKBest(score_func=f_classif, k=70),  # Pre-select top features
    LogisticRegression(
        max_iter=1000,
        random_state=42,
        class_weight='balanced'
    )
)

# Hyperparameter grid for logistic regression
logreg_param_grid = {
    "logisticregression__C": [0.01, 0.1, 1.0],  # Regularization strength
    "logisticregression__solver": ['liblinear', 'saga'],
    "logisticregression__penalty": ['l2', 'l1']  # Note: 'l1' only works with 'liblinear' or 'saga'
}

# Grid search
logreg_search = GridSearchCV(
    estimator=pipe,
    param_grid=logreg_param_grid,
    scoring=f1_macro_scorer,
    refit=True,
    cv=3,
    n_jobs=-1,
    verbose=2
)

# Fit model
logreg_search.fit(X_train, y_train)

# Predict and evaluate
y_pred = logreg_search.predict(X_test)

print("Best parameters:", logreg_search.best_params_)
print("Best macro F1 score (CV):", logreg_search.best_score_)
print("Test Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report on Test Set:\n")
print(classification_report(y_test, y_pred, digits=3))


Fitting 3 folds for each of 12 candidates, totalling 36 fits
Best parameters: {'logisticregression__C': 1.0, 'logisticregression__penalty': 'l1', 'logisticregression__solver': 'saga'}
Best macro F1 score (CV): 0.4408638795394544
Test Accuracy: 0.473

Classification Report on Test Set:

              precision    recall  f1-score   support

           0      0.489     0.617     0.546       426
           1      0.422     0.327     0.369       394
           2      0.433     0.330     0.374       409
           3      0.446     0.426     0.436       390
           4      0.536     0.664     0.593       381

    accuracy                          0.473      2000
   macro avg      0.465     0.473     0.464      2000
weighted avg      0.465     0.473     0.463      2000

