In [2]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

# Path to dataset
data_path = r"C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis\data\raw\risk_factors_cervical_cancer.csv"

# Load dataset
df = pd.read_csv(data_path)

# 1️⃣ Handle missing values
for col in df.columns:
    if df[col].dtype in ['int64', 'float64']:
        df[col].fillna(df[col].median(), inplace=True)
    else:
        df[col].fillna(df[col].mode()[0], inplace=True)

# 2️⃣ Encode categorical columns (if any)
df = pd.get_dummies(df, drop_first=True)

# 3️⃣ Feature scaling (except target variable)
scaler = StandardScaler()
features = df.drop(columns=['Biopsy'])  # 'Biopsy' is our main target
scaled_features = scaler.fit_transform(features)

# Convert back to DataFrame
df_scaled = pd.DataFrame(scaled_features, columns=features.columns)

# Add target column back
df_scaled['Biopsy'] = df['Biopsy']

# 4️⃣ Save cleaned dataset
processed_path = r"C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis/data/processed/risk_factors_cervical_cancer_clean.csv"
df_scaled.to_csv(processed_path, index=False)

print(f" Data preprocessing complete. Clean dataset saved at:\n{processed_path}")


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].mode()[0], inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting value

 Data preprocessing complete. Clean dataset saved at:
C:/Users/sanja/3. Cervical_Cancer_Risk_Analysis/Cervical_Cancer_Risk_Analysis/data/processed/risk_factors_cervical_cancer_clean.csv
