# **1.**

Feature engineering: creating new numerical features from the raw data so the models have more information to learn from.

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve, auc, roc_curve, roc_auc_score, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from imblearn.over_sampling import SMOTE

In [5]:
# Read the cleaned CSV into the working DataFrame and preview its first rows.
DATA_PATH = "cscpopendata_clean.csv"
df = pd.read_csv(DATA_PATH)
print(df.head())

   CDPHId                                       ProductName  CSFId  CSF  \
0       2  ULTRA COLOR RICH EXTRA PLUMP LIPSTICK-ALL SHADES    NaN  NaN   
1       3                        Glover's Medicated Shampoo    NaN  NaN   
2       3                        Glover's Medicated Shampoo    NaN  NaN   
3       4          PRECISION GLIMMER EYE LINER-ALL SHADES �    NaN  NaN   
4       5       AVON BRILLIANT SHINE LIP GLOSS-ALL SHADES �    NaN  NaN   

   CompanyId          CompanyName BrandName  PrimaryCategoryId  \
0          4         New Avon LLC      AVON                 44   
1        338  J. Strickland & Co.  Glover's                 18   
2        338  J. Strickland & Co.  Glover's                 18   
3          4         New Avon LLC      AVON                 44   
4          4         New Avon LLC      AVON                 44   

                     PrimaryCategory  SubCategoryId  ... ChemicalCreatedAt  \
0    Makeup Products (non-permanent)             53  ...        07/09/2009

In [7]:
# Lookup table: exact (case-sensitive) chemical name -> hazard category.
chemical_hazard_map = {
    'Formaldehyde': 'Confirmed Carcinogen',
    'Titanium dioxide': 'Suspected Carcinogen',
    'Distillates (coal tar)': 'Confirmed Carcinogen',
    'Fragrance': 'Allergen/Irritant',
    'Linalool': 'Allergen/Irritant',
    'Glycerin': 'Not Hazardous',
    'Ethanol': 'Not Hazardous',
    'Sodium Lauryl Sulfate': 'Allergen/Irritant'
}

# Strip surrounding whitespace from the chemical names so they can match the
# lookup keys, then derive the 'HazardType' column; names without an entry in
# the table are labelled 'Not Classified'.
chemical_names = df['ChemicalName_Clean'].str.strip()
df['ChemicalName_Clean'] = chemical_names
df['HazardType'] = chemical_names.map(chemical_hazard_map).fillna('Not Classified')

# Report how many rows fall into each hazard category.
hazard_distribution = df['HazardType'].value_counts()
print("Distribution of Chemical Hazard Types:", hazard_distribution, "\n", sep="\n")

Distribution of Chemical Hazard Types:
HazardType
Suspected Carcinogen    93480
Not Classified          21152
Confirmed Carcinogen        3
Name: count, dtype: int64




In [10]:
# Binary target: 1 when the row has a ChemicalDateRemoved value, 0 when it is missing.
has_removal_date = df['ChemicalDateRemoved'].notna()
df['Reformulated'] = has_removal_date.astype(int)

In [11]:
# Parse the date columns as datetimes; values that cannot be parsed become NaT
# (errors='coerce') instead of raising.
for date_col in ('InitialDateReported', 'MostRecentDateReported', 'DiscontinuedDate'):
    df[date_col] = pd.to_datetime(df[date_col], errors='coerce')

# Time-based features, both measured in whole days from the initial report date.
first_reported = df['InitialDateReported']
df['ChemicalAge'] = (df['MostRecentDateReported'] - first_reported).dt.days
df['ProductLifespan'] = (df['DiscontinuedDate'] - first_reported).dt.days

# hazard score based on a custom function and the HazardType column.
def create_hazard_score(row):
    """Return an integer hazard score derived from ``row['HazardType']``.

    The hazard type is converted to a lower-case string and checked by
    substring, in this order: 10 if it contains 'carcinogen', 8 if it
    contains 'reproductive', otherwise 5.
    """
    hazard_label = str(row['HazardType']).lower()
    # Ordered (keyword, score) pairs: the first keyword found wins.
    for keyword, score in (('carcinogen', 10), ('reproductive', 8)):
        if keyword in hazard_label:
            return score
    return 5

# Score every row with create_hazard_score (row-wise apply, because the scorer
# receives a whole row and reads its 'HazardType' field).
df['HazardScore'] = df.apply(create_hazard_score, axis=1)

# Preview the engineered feature columns.
print("Features after engineering:")
print(df[['ChemicalAge', 'ProductLifespan', 'HazardScore']].head())
print("\nDataset Info after Feature Engineering:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly: wrapping it in print() would append a stray "None" line.
df.info()


Features after engineering:
   ChemicalAge  ProductLifespan  HazardScore
0         1533            594.0           10
1            0              NaN           10
2            0              NaN            5
3         1511              NaN           10
4         1511            572.0           10

Dataset Info after Feature Engineering:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 114635 entries, 0 to 114634
Data columns (total 33 columns):
 #   Column                  Non-Null Count   Dtype         
---  ------                  --------------   -----         
 0   CDPHId                  114635 non-null  int64         
 1   ProductName             114635 non-null  object        
 2   CSFId                   80662 non-null   float64       
 3   CSF                     80237 non-null   object        
 4   CompanyId               114635 non-null  int64         
 5   CompanyName             114635 non-null  object        
 6   BrandName               114408 non-null  object        
 

In [12]:
# Columns used as model inputs and as the prediction target.
feature_cols = ['ChemicalAge', 'ProductLifespan', 'HazardScore']
target_col = 'Reformulated'

X = df[feature_cols]
y = df[target_col]

# Keep only rows where every selected feature and the target are present.
# NOTE(review): ProductLifespan is NaN wherever DiscontinuedDate or
# InitialDateReported is missing, so this dropna() also restricts the
# modelling set to rows that have both dates -- confirm that is intended.
data_clean = pd.concat([X, y], axis=1).dropna()
X_numeric = data_clean[feature_cols]
y_numeric = data_clean[target_col]

# Stratified 80/20 split so both partitions keep the target's class ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X_numeric,
    y_numeric,
    test_size=0.2,
    random_state=42,
    stratify=y_numeric,
)

# Oversample the training partition only; the test partition is left as-is.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

print(f"\nTraining set shape after SMOTE: {X_train_smote.shape}")
print(f"Distribution of the target variable in the resampled training set:\n{y_train_smote.value_counts()}")



Training set shape after SMOTE: (20000, 3)
Distribution of the target variable in the resampled training set:
Reformulated
0    10000
1    10000
Name: count, dtype: int64


In [14]:
# Linear baseline: logistic regression fitted on the SMOTE-resampled training
# data and evaluated on the untouched test split.
log_reg = LogisticRegression(solver='liblinear', random_state=42).fit(X_train_smote, y_train_smote)
y_pred_log_reg = log_reg.predict(X_test)
log_reg_report = classification_report(y_test, y_pred_log_reg)
print("\nLogistic Regression Classification Report:")
print(log_reg_report)

# Tree-based model: random forest with default hyper-parameters, trained and
# evaluated on the same data as the baseline.
rf_model = RandomForestClassifier(random_state=42).fit(X_train_smote, y_train_smote)
y_pred_rf = rf_model.predict(X_test)
rf_report = classification_report(y_test, y_pred_rf)
print("\nRandom Forest Classification Report:")
print(rf_report)


Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.79      0.88      2500
           1       0.11      0.76      0.19        84

    accuracy                           0.79      2584
   macro avg       0.55      0.77      0.53      2584
weighted avg       0.96      0.79      0.85      2584


Random Forest Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.98      0.98      2500
           1       0.52      0.79      0.63        84

    accuracy                           0.97      2584
   macro avg       0.76      0.88      0.81      2584
weighted avg       0.98      0.97      0.97      2584



Precision: Of the products the model predicted as "reformulated", how many actually were reformulated.

Recall: How many of the actual "reformulated" products the model successfully found.

F1-Score: A balance between precision and recall, which is crucial for imbalanced data like this (only 84 reformulated products versus 2,500 non-reformulated ones in the test set).

# **Logistic Regression (LR):**
This model had a high recall (0.76) but low precision (0.11). This means it was great at catching most of the reformulated products but also flagged many products that **weren't actually reformulated** (lots of false alarms). It acted like a wide fishing net.


# **Random Forest (RF):**
The Random Forest model was the clear winner. It maintained high recall (0.79) while significantly improving precision (0.52, versus 0.11 for logistic regression). This means it was not only good at finding reformulated products but also more accurate in its predictions. Its higher F1-score (0.63, versus 0.19) shows it is a much more reliable and balanced model.