In [None]:
import pandas as pd
import numpy as np

# 1. Load the dataset
df = pd.read_csv('diabetes.csv')

# 2. Identify columns where '0' is technically impossible (biological stats)
#    Pregnancies can legitimately be 0, so we exclude it from this list.
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# 3. Replace 0 with NaN so pandas recognizes them as missing
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)

# 4. Impute missing values with the Median
#    (Median is safer than Mean for biological data which often has outliers)
for col in cols_with_zeros:
    df[col] = df[col].fillna(df[col].median())

# 5. Save specifically for Classification
#    We KEEP the 'Outcome' column because that is what we want to predict.
df.to_csv('diabetes_classification.csv', index=False)

print("Cleaned dataset saved as 'diabetes_classification.csv'")
print(df.head())

In [None]:
import pandas as pd

# 1. Load Data
df = pd.read_csv('WineQT.csv')

# 2. General Cleaning
if 'Id' in df.columns:
    df = df.drop(columns=['Id'])

# 3. Create Regression Version (Target = 'quality')
df.to_csv('wine_regression.csv', index=False)
print("Saved wine_regression.csv")

# 4. Create Classification Version (Target = 'quality_label')
# Rule: Quality >= 6 is Good (1), else Bad (0)
df_class = df.copy()
df_class['quality_label'] = (df_class['quality'] >= 6).astype(int)
df_class = df_class.drop(columns=['quality']) 
df_class.to_csv('wine_classification.csv', index=False)
print("Saved wine_classification.csv")

In [3]:
import pandas as pd

# 1. Load Data
df = pd.read_csv('housing.csv')

# 2. General Cleaning
# (Dataset was found to be clean, but we drop duplicates just in case)
df = df.drop_duplicates()

# 3. Save for Regression
# Target is 'MEDV' (House Price)
df.to_csv('housing_regression.csv', index=False)
print("Saved housing_regression.csv")

Saved housing_regression.csv


In [4]:
import pandas as pd
import numpy as np

# 1. Load Data
df = pd.read_csv('adult.csv')

# 2. Clean Strings (Strip Whitespace)
# This fixes issues where ' <=50K' != '<=50K'
cat_cols = df.select_dtypes(include=['object']).columns
for col in cat_cols:
    df[col] = df[col].str.strip()

# 3. Handle Missing Values
# Replace '?' with NaN and drop those rows
df.replace('?', np.nan, inplace=True)
df.dropna(inplace=True)

# 4. Remove Duplicates
df.drop_duplicates(inplace=True)

# 5. Encode Target
# Map income to binary 0/1
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1})

# 6. Save
df.to_csv('adult_classification.csv', index=False)
print("Saved adult_classification.csv")

Saved adult_classification.csv


In [5]:
import pandas as pd

# 1. Load Data
df = pd.read_csv('train.csv')

# 2. Impute Missing Values
# Age -> Median
df['Age'] = df['Age'].fillna(df['Age'].median())
# Embarked -> Mode
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])

# 3. Drop Irrelevant/High-Missing Columns
cols_to_drop = ['Cabin', 'Name', 'Ticket', 'PassengerId']
df.drop(columns=cols_to_drop, inplace=True)

# 4. Encode Sex (0=male, 1=female)
df['Sex'] = df['Sex'].map({'male': 0, 'female': 1})

# 5. Save
df.to_csv('titanic_classification.csv', index=False)
print("Saved titanic_classification.csv")

Saved titanic_classification.csv


In [6]:
import pandas as pd

# 1. Load Data
df = pd.read_csv('WineQT.csv')

# 2. Drop Target and ID
# Clustering is unsupervised, so we remove the answer ('quality')
cols_to_drop = ['quality', 'Id']
df.drop(columns=[c for c in cols_to_drop if c in df.columns], inplace=True)

# 3. Save
df.to_csv('wine_clustering.csv', index=False)
print("Saved wine_clustering.csv")

Saved wine_clustering.csv


In [7]:
import pandas as pd
import numpy as np

# 1. Load Data
df = pd.read_csv('diabetes.csv')

# 2. Impute Invalid Zeros
cols_with_zeros = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']
df[cols_with_zeros] = df[cols_with_zeros].replace(0, np.nan)
for col in cols_with_zeros:
    df[col] = df[col].fillna(df[col].median())

# 3. Drop Target
df.drop(columns=['Outcome'], inplace=True)

# 4. Save
df.to_csv('diabetes_clustering.csv', index=False)
print("Saved diabetes_clustering.csv")

Saved diabetes_clustering.csv


In [None]:
import pandas as pd

# 1. Load Data
df = pd.read_csv('housing.csv')

# 2. Drop Target ('MEDV')
# We only want features for clustering
if 'MEDV' in df.columns:
    df = df.drop(columns=['MEDV'])

# 3. Save
df.to_csv('boston_clustering.csv', index=False)
print("Saved boston_clustering.csv")

In [1]:
import pandas as pd
import numpy as np

# 1. Load Data
df = pd.read_csv('heart.csv')

# 2. Remove Duplicates
# The dataset has many duplicates (approx 700+). We drop them to return to the unique ~300 patients.
df_clean = df.drop_duplicates()

# 3. Handle Invalid Values
# 'ca' (major vessels) should be 0-3. Value '4' is treated as missing.
df_clean['ca'] = df_clean['ca'].replace(4, np.nan)
df_clean['ca'] = df_clean['ca'].fillna(df_clean['ca'].mode()[0])

# 'thal' should be 1-3. Value '0' is treated as missing.
df_clean['thal'] = df_clean['thal'].replace(0, np.nan)
df_clean['thal'] = df_clean['thal'].fillna(df_clean['thal'].mode()[0])

# 4. Save for Classification (Keeps 'target')
df_clean.to_csv('heart_classification.csv', index=False)
print("Saved heart_classification.csv")

# 5. Save for Clustering (Drops 'target')
df_cluster = df_clean.drop(columns=['target'])
df_cluster.to_csv('heart_clustering.csv', index=False)
print("Saved heart_clustering.csv")

Saved heart_classification.csv
Saved heart_clustering.csv


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['ca'] = df_clean['ca'].replace(4, np.nan)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['ca'] = df_clean['ca'].fillna(df_clean['ca'].mode()[0])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_clean['thal'] = df_clean['thal'].replace(0, np.nan)
A value is trying to be set on a c