<a href="https://colab.research.google.com/github/ShabnaIlmi/Data-Science-Group-Project/blob/recipe-risk-analyzer/DSGP_startover.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

**Load and Inspect the Data**

In [39]:
import pandas as pd

# Load datasets
recipes_path = "/content/recipes_nodup.csv"
chem_path = "/content/chem.csv"

df_recipes = pd.read_csv(recipes_path)
df_chem = pd.read_csv(chem_path)

# Display first few rows of each dataset
print("📌 recipes_nodup.csv:")
print(df_recipes.head(), "\n\n")

print("📌 chem.csv:")
print(df_chem.head(), "\n\n")

# Check dataset shapes
print(f"🔍 recipes_nodup.csv Shape: {df_recipes.shape}")
print(f"🔍 chem.csv Shape: {df_chem.shape}")

# Show column names
print("🛠 recipes_nodup Columns:", df_recipes.columns)
print("🛠 chem Columns:", df_chem.columns)

# Check missing values
print("⚠️ Missing values in recipes_nodup:\n", df_recipes.isnull().sum(), "\n")
print("⚠️ Missing values in chem:\n", df_chem.isnull().sum(), "\n")


📌 recipes_nodup.csv:
   Recipe ID                                    Chemical Names  \
0          1               Ephedrine + Red Phosphorus + Iodine   
1          2             Toluene + Nitric Acid + Sulfuric Acid   
2          3       Hydrogen Peroxide + Acetone + Sulfuric Acid   
3          4  Ephedrine + Potassium Permanganate + Acetic Acid   
4          5             Potassium Nitrate + Charcoal + Sulfur   

                     Formulas   Quantities (g/mL)  \
0           C10H15NO + P + I2     30g + 15g + 10g   
1         C7H8 + HNO3 + H2SO4  50mL + 30mL + 40mL   
2        H2O2 + C3H6O + H2SO4   20mL + 30mL + 5mL   
3  C10H15NO + KMnO4 + CH3COOH    25g + 10g + 50mL   
4                KNO3 + C + S     75g + 15g + 10g   

                         CAS Numbers    Solvent Used  \
0   299-42-3 + 7723-14-0 + 7553-56-2  Acetone, Ether   
1   108-88-3 + 7697-37-2 + 7664-93-9             NaN   
2    7722-84-1 + 67-64-1 + 7664-93-9             NaN   
3     299-42-3 + 7722-64-7 + 64-19-7   

In [40]:
# Check data types of columns
print("🔍 Data types in recipes_nodup.csv:\n", df_recipes.dtypes, "\n")
print("🔍 Data types in chem.csv:\n", df_chem.dtypes, "\n")


🔍 Data types in recipes_nodup.csv:
 Recipe ID                                       int64
Chemical Names                                 object
Formulas                                       object
Quantities (g/mL)                              object
CAS Numbers                                    object
Solvent Used                                   object
Reaction Conditions                            object
Toxicity Level                                 object
Flammability (Yes/No)                          object
Reactivity (Stable/Unstable)                   object
Explosiveness (1-10)                            int64
Health Risk Score (0-100)                       int64
Environmental Hazard (Yes/No)                  object
Dual Use Potential (Yes/No)                    object
Intended Use                                   object
Export Restriction (Yes/No)                    object
Controlled Substance (Yes/No)                  object
Risk Assessment Score (0-100)                 

**Check Missing Values and Remove Duplicates**

In [41]:
print("📌 Duplicate rows in recipes_nodup:", df_recipes.duplicated().sum())
print("📌 Duplicate rows in chem:", df_chem.duplicated().sum())

print("⚠️ Missing values in recipes_nodup:\n", df_recipes.isnull().sum(), "\n")
print("⚠️ Missing values in chem:\n", df_chem.isnull().sum(), "\n")



📌 Duplicate rows in recipes_nodup: 0
📌 Duplicate rows in chem: 0
⚠️ Missing values in recipes_nodup:
 Recipe ID                                       0
Chemical Names                                  0
Formulas                                        0
Quantities (g/mL)                               0
CAS Numbers                                     0
Solvent Used                                   54
Reaction Conditions                             0
Toxicity Level                                  0
Flammability (Yes/No)                           0
Reactivity (Stable/Unstable)                    0
Explosiveness (1-10)                            0
Health Risk Score (0-100)                       0
Environmental Hazard (Yes/No)                   0
Dual Use Potential (Yes/No)                     0
Intended Use                                    0
Export Restriction (Yes/No)                     0
Controlled Substance (Yes/No)                   0
Risk Assessment Score (0-100)                   

In [42]:
print("📌 Duplicate rows in recipes_nodup:", df_recipes.duplicated().sum())
print("📌 Duplicate rows in chem:", df_chem.duplicated().sum())

print("⚠️ Missing values in recipes_nodup:\n", df_recipes.isnull().sum(), "\n")
print("⚠️ Missing values in chem:\n", df_chem.isnull().sum(), "\n")

📌 Duplicate rows in recipes_nodup: 0
📌 Duplicate rows in chem: 0
⚠️ Missing values in recipes_nodup:
 Recipe ID                                       0
Chemical Names                                  0
Formulas                                        0
Quantities (g/mL)                               0
CAS Numbers                                     0
Solvent Used                                   54
Reaction Conditions                             0
Toxicity Level                                  0
Flammability (Yes/No)                           0
Reactivity (Stable/Unstable)                    0
Explosiveness (1-10)                            0
Health Risk Score (0-100)                       0
Environmental Hazard (Yes/No)                   0
Dual Use Potential (Yes/No)                     0
Intended Use                                    0
Export Restriction (Yes/No)                     0
Controlled Substance (Yes/No)                   0
Risk Assessment Score (0-100)                   

In [43]:
# Display unique values for categorical columns
print("Unique values in key columns (recipes_nodup):\n")
for col in df_recipes.columns:
    print(f"{col}: {df_recipes[col].nunique()} unique values")

print("\n🛠 Unique values in key columns (chem):\n")
for col in df_chem.columns:
    print(f"{col}: {df_chem[col].nunique()} unique values")


Unique values in key columns (recipes_nodup):

Recipe ID: 76 unique values
Chemical Names: 76 unique values
Formulas: 75 unique values
Quantities (g/mL): 57 unique values
CAS Numbers: 76 unique values
Solvent Used: 3 unique values
Reaction Conditions: 37 unique values
Toxicity Level: 3 unique values
Flammability (Yes/No): 2 unique values
Reactivity (Stable/Unstable): 3 unique values
Explosiveness (1-10): 8 unique values
Health Risk Score (0-100): 14 unique values
Environmental Hazard (Yes/No): 1 unique values
Dual Use Potential (Yes/No): 1 unique values
Intended Use: 46 unique values
Export Restriction (Yes/No): 2 unique values
Controlled Substance (Yes/No): 2 unique values
Risk Assessment Score (0-100): 13 unique values
Regulatory Body: 5 unique values
Compliance Status (Compliant/Non-compliant): 2 unique values
Risk Category: 3 unique values
Risk Score (0-100): 10 unique values

🛠 Unique values in key columns (chem):

ID: 401 unique values
Chemical name: 393 unique values
molarcular 

**Preprocessing recipes_nodup.csv**

In [44]:
# Fill missing values with "Unknown"
df_recipes["Solvent Used"].fillna("Unknown", inplace=True)


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df_recipes["Solvent Used"].fillna("Unknown", inplace=True)


In [45]:
import pandas as pd

# Example: "30g + 15g + 10g" → {"Chem1": 30, "Chem2": 15, "Chem3": 10}
def extract_chemical_quantities(row):
    components = row.split(" + ")  # Split chemicals by '+'
    quantities = {}

    for comp in components:
        parts = comp.split()  # ["30g", "ChemicalName"]
        if len(parts) == 2:
            qty, chem = parts
            qty = int(qty.replace("g", "").replace("mL", ""))  # Remove unit
            quantities[chem] = qty

    return quantities

# Apply function to extract quantities
quantities_df = df_recipes["Quantities (g/mL)"].apply(extract_chemical_quantities)

# Convert dictionary column to DataFrame
df_quantities = pd.DataFrame(quantities_df.tolist()).fillna(0)

# Merge with main dataset
df_recipes = pd.concat([df_recipes, df_quantities], axis=1)

# Drop original column
df_recipes.drop(columns=["Quantities (g/mL)"], inplace=True)

print("✅ Successfully extracted chemical quantities!")


✅ Successfully extracted chemical quantities!


In [49]:
import json

# Store chemical names and quantities as structured JSON
df_recipes["Quantities (structured)"] = df_recipes["Quantities (g/mL)"].apply(lambda x: json.dumps(extract_chemical_quantities(x)))

# Drop original column
df_recipes.drop(columns=["Quantities (g/mL)"], inplace=True)

print("✅ Stored quantities in structured format!")


KeyError: 'Quantities (g/mL)'

Encode Categorical Features

In [46]:
from sklearn.preprocessing import LabelEncoder

binary_cols = [
    "Flammability (Yes/No)", "Reactivity (Stable/Unstable)",
    "Environmental Hazard (Yes/No)", "Dual Use Potential (Yes/No)",
    "Export Restriction (Yes/No)", "Controlled Substance (Yes/No)",
    "Compliance Status (Compliant/Non-compliant)"
]

# Convert Yes/No to 0/1
for col in binary_cols:
    df_recipes[col] = df_recipes[col].apply(lambda x: 1 if x in ["Yes", "Compliant", "Stable"] else 0)

# Encode Risk Category
df_recipes["Risk Category Encoded"] = LabelEncoder().fit_transform(df_recipes["Risk Category"])


Normalize Numerical Features

In [48]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()
# Remove "Total Quantity (g/mL)" from num_cols as it is no longer present in df_recipes
num_cols = ["Risk Score (0-100)", "Health Risk Score (0-100)", "Risk Assessment Score (0-100)"]

df_recipes[num_cols] = scaler.fit_transform(df_recipes[num_cols])

**Preprocessing chem.csv**

In [None]:
# Fill missing values in chem.csv
df_chem["CAS number"].fillna("Unknown", inplace=True)
df_chem["UN number"].fillna("Unknown", inplace=True)
df_chem["synonyms"].fillna("Unknown", inplace=True)

# Standardize column names
df_chem.columns = df_chem.columns.str.strip().str.lower().str.replace(" ", "_")

# Save preprocessed chem.csv
df_chem.to_csv("/content/preprocessed_chem.csv", index=False)
print("Preprocessed chem.csv saved!")
