In [10]:
from pprint import pprint

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score, roc_auc_score
from ydata_profiling import ProfileReport

# Notebook display option: None removes the column-width limit, so long cell
# values (e.g. file paths, long category labels) are shown in full, not cut off with "...".
pd.set_option('display.max_colwidth', None)

In [11]:
# Directory that holds every input CSV. This is the single place to edit when
# the data lives somewhere else (another machine, another user profile, ...).
DATA_DIR = "C:\\Users\\arpitha_work\\Downloads\\TRU MSCDS\\Sem2\\DS Seminar\\Project 1\\Datasets"

# Registry of the datasets used in this notebook. Each entry provides:
#   name                 - label used in log messages and report file names
#   path                 - location of the CSV file (DATA_DIR + file name)
#   features_numerical   - columns listed as numeric inputs
#   features_categorical - columns listed as categorical inputs
#   target               - name of the label column
datasets = [
    {
        "name": "wisconsin data",
        "path": DATA_DIR + "\\wisconsin data.csv",
        "features_numerical": [
            # "*_mean" columns
            "radius_mean", "texture_mean", "perimeter_mean", "area_mean", "smoothness_mean",
            "compactness_mean", "concavity_mean", "concave points_mean", "symmetry_mean",
            "fractal_dimension_mean",
            # "*_se" columns
            "radius_se", "texture_se", "perimeter_se", "area_se",
            "smoothness_se", "compactness_se", "concavity_se", "concave points_se", "symmetry_se",
            "fractal_dimension_se",
            # "*_worst" columns
            "radius_worst", "texture_worst", "perimeter_worst",
            "area_worst", "smoothness_worst", "compactness_worst", "concavity_worst",
            "concave points_worst", "symmetry_worst", "fractal_dimension_worst"
        ],
        "features_categorical": [],
        "target": "diagnosis"
    },

    {
        "name": "breast-cancer-dataset",
        "path": DATA_DIR + "\\breast-cancer-dataset.csv",
        "features_numerical": ["Year", "Age", "Tumor Size (cm)", "Inv-Nodes"],
        "features_categorical": ["Menopause", "Breast", "Metastasis", "Breast Quadrant", "History"],
        "target": "Diagnosis Result"
    },

    {
        "name": "BRCA",
        "path": DATA_DIR + "\\BRCA.csv",
        "features_numerical": ["Age", "Protein1", "Protein2", "Protein3", "Protein4"],
        "features_categorical": ["Gender", "Tumour_Stage", "Histology", "ER status", "PR status", "HER2 status", "Surgery_type"],
        "target": "Patient_Status"
    },

    {
        "name": "german bs data",
        "path": DATA_DIR + "\\german bs data.csv",
        "features_numerical": ["age", "size", "grade", "nodes", "pgr", "er", "rfstime"],
        "features_categorical": ["meno", "hormon"],
        "target": "status"
    },

    {
        "name": "seer data",
        "path": DATA_DIR + "\\seer data.csv",
        "features_numerical": ["Age", "Survival Months", "Regional Node Examined"],
        # "Reginol Node Positive" is spelled exactly as in the CSV header - do not "fix" it.
        # NOTE(review): "Tumor Size" and "Reginol Node Positive" are listed as categorical;
        # they may actually be numeric columns - confirm against the data before modelling.
        "features_categorical": ["Race", "Marital Status", "T Stage", "N Stage", "6th Stage", "differentiate", "Grade", "A Stage", "Tumor Size", "Estrogen Status", "Progesterone Status", "Reginol Node Positive"],
        "target": "Status"
    }
]


In [12]:
# Quick look at the raw (uncleaned) contents of the first configured dataset.
first_dataset = datasets[0]
dataset_path = first_dataset["path"]
df = pd.read_csv(dataset_path)
df

Unnamed: 0,id,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,...,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,Unnamed: 32
0,842302,M,17.99,10.38,122.80,1001.0,0.11840,0.27760,0.30010,0.14710,...,17.33,184.60,2019.0,0.16220,0.66560,0.7119,0.2654,0.4601,0.11890,
1,842517,M,20.57,17.77,132.90,1326.0,0.08474,0.07864,0.08690,0.07017,...,23.41,158.80,1956.0,0.12380,0.18660,0.2416,0.1860,0.2750,0.08902,
2,84300903,M,19.69,21.25,130.00,1203.0,0.10960,0.15990,0.19740,0.12790,...,25.53,152.50,1709.0,0.14440,0.42450,0.4504,0.2430,0.3613,0.08758,
3,84348301,M,11.42,20.38,77.58,386.1,0.14250,0.28390,0.24140,0.10520,...,26.50,98.87,567.7,0.20980,0.86630,0.6869,0.2575,0.6638,0.17300,
4,84358402,M,20.29,14.34,135.10,1297.0,0.10030,0.13280,0.19800,0.10430,...,16.67,152.20,1575.0,0.13740,0.20500,0.4000,0.1625,0.2364,0.07678,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
564,926424,M,21.56,22.39,142.00,1479.0,0.11100,0.11590,0.24390,0.13890,...,26.40,166.10,2027.0,0.14100,0.21130,0.4107,0.2216,0.2060,0.07115,
565,926682,M,20.13,28.25,131.20,1261.0,0.09780,0.10340,0.14400,0.09791,...,38.25,155.00,1731.0,0.11660,0.19220,0.3215,0.1628,0.2572,0.06637,
566,926954,M,16.60,28.08,108.30,858.1,0.08455,0.10230,0.09251,0.05302,...,34.12,126.70,1124.0,0.11390,0.30940,0.3403,0.1418,0.2218,0.07820,
567,927241,M,20.60,29.33,140.10,1265.0,0.11780,0.27700,0.35140,0.15200,...,39.42,184.60,1821.0,0.16500,0.86810,0.9387,0.2650,0.4087,0.12400,


In [17]:
# Function to load and clean the dataset
def load_and_clean_data(dataset_path):
    """Load a CSV file and return it with empty columns, incomplete rows and duplicates removed.

    Steps, in order:
      1. Print the number of missing values per column, as loaded.
      2. Drop the 'Unnamed: 32' column if present, plus any other column that
         contains no values at all.
      3. Drop every row that still has at least one missing value.
      4. Print the number of fully duplicated rows, then drop them.

    Args:
        dataset_path: Location of the CSV file; passed straight to ``pd.read_csv``.

    Returns:
        pandas.DataFrame: The cleaned frame. Surviving rows keep their original
        index labels (the index is not reset).

    Raises:
        Any exception raised by ``pd.read_csv`` (e.g. ``FileNotFoundError``)
        propagates unchanged.
    """
    # Load the dataset
    df = pd.read_csv(dataset_path)

    # Data cleaning steps
    print(f"\nCleaning data for {dataset_path}:")

    # 1. Handle missing values
    print("Missing values in each column:")
    print(df.isnull().sum())

    # Remove the 'Unnamed: 32' column if it exists
    if 'Unnamed: 32' in df.columns:
        df = df.drop(columns=['Unnamed: 32'])

    # Remove every other column that holds no values at all. Left in place, a
    # single all-empty column would make the row-wise dropna() below delete
    # every row. Skipped for a frame without rows, where each column would
    # trivially count as "all missing".
    if not df.empty:
        df = df.dropna(axis=1, how='all')

    # Remove rows with missing values
    df = df.dropna()

    # 2. Handle duplicates
    print("\nNumber of duplicate rows before dropping:")
    print(df.duplicated().sum())
    df = df.drop_duplicates()  # Remove duplicate rows

    # Return cleaned dataframe
    return df

In [18]:
# Build one ydata-profiling EDA report (HTML) for every configured dataset
for dataset in datasets:
    print(f"\nProcessing dataset: {dataset['name']}")

    # Read the CSV and run the shared cleaning steps
    dataset_path = dataset["path"]
    df = load_and_clean_data(dataset_path)

    # Nothing survived cleaning: say so and move on to the next dataset
    if df.empty:
        print(f"The dataframe for {dataset['name']} is empty after cleaning. Please check the dataset.")
        continue

    # Profile the cleaned frame
    profile = ProfileReport(
        df,
        title=f"EDA Report - {dataset['name']}",
        explorative=True,
    )

    # Write the report to an HTML file named after the dataset
    report_path = f"eda_report_{dataset['name']}.html"
    profile.to_file(report_path)
    print(f"\nEDA report for {dataset['name']} has been generated and saved as '{report_path}'.")


Processing dataset: wisconsin data

Cleaning data for C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem2\DS Seminar\Project 1\Datasets\wisconsin data.csv:
Missing values in each column:
id                           0
diagnosis                    0
radius_mean                  0
texture_mean                 0
perimeter_mean               0
area_mean                    0
smoothness_mean              0
compactness_mean             0
concavity_mean               0
concave points_mean          0
symmetry_mean                0
fractal_dimension_mean       0
radius_se                    0
texture_se                   0
perimeter_se                 0
area_se                      0
smoothness_se                0
compactness_se               0
concavity_se                 0
concave points_se            0
symmetry_se                  0
fractal_dimension_se         0
radius_worst                 0
texture_worst                0
perimeter_worst              0
area_worst                   0
smoothness_

Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


EDA report for wisconsin data has been generated and saved as 'eda_report_wisconsin data.html'.

Processing dataset: breast-cancer-dataset

Cleaning data for C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem2\DS Seminar\Project 1\Datasets\breast-cancer-dataset.csv:
Missing values in each column:
S/N                 0
Year                0
Age                 0
Menopause           0
Tumor Size (cm)     0
Inv-Nodes           0
Breast              0
Metastasis          0
Breast Quadrant     0
History             0
Diagnosis Result    0
dtype: int64

Number of duplicate rows before dropping:
0


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


EDA report for breast-cancer-dataset has been generated and saved as 'eda_report_breast-cancer-dataset.html'.

Processing dataset: BRCA

Cleaning data for C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem2\DS Seminar\Project 1\Datasets\BRCA.csv:
Missing values in each column:
Patient_ID             7
Age                    7
Gender                 7
Protein1               7
Protein2               7
Protein3               7
Protein4               7
Tumour_Stage           7
Histology              7
ER status              7
PR status              7
HER2 status            7
Surgery_type           7
Date_of_Surgery        7
Date_of_Last_Visit    24
Patient_Status        20
dtype: int64

Number of duplicate rows before dropping:
0


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


EDA report for BRCA has been generated and saved as 'eda_report_BRCA.html'.

Processing dataset: german bs data

Cleaning data for C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem2\DS Seminar\Project 1\Datasets\german bs data.csv:
Missing values in each column:
Unnamed: 0    0
pid           0
age           0
meno          0
size          0
grade         0
nodes         0
pgr           0
er            0
hormon        0
rfstime       0
status        0
dtype: int64

Number of duplicate rows before dropping:
0


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


EDA report for german bs data has been generated and saved as 'eda_report_german bs data.html'.

Processing dataset: seer data

Cleaning data for C:\Users\arpitha_work\Downloads\TRU MSCDS\Sem2\DS Seminar\Project 1\Datasets\seer data.csv:
Missing values in each column:
Age                       0
Race                      0
Marital Status            0
T Stage                   0
N Stage                   0
6th Stage                 0
differentiate             0
Grade                     0
A Stage                   0
Tumor Size                0
Estrogen Status           0
Progesterone Status       0
Regional Node Examined    0
Reginol Node Positive     0
Survival Months           0
Status                    0
dtype: int64

Number of duplicate rows before dropping:
1


Summarize dataset:   0%|          | 0/5 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]


EDA report for seer data has been generated and saved as 'eda_report_seer data.html'.
