In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.decomposition import PCA
from sklearn.preprocessing import KBinsDiscretizer

In [None]:
# Load the primary dataset from a CSV file in the current working directory.
df = pd.read_csv(filepath_or_buffer='dataset.csv')

In [None]:
# First look at the raw data: dimensions, leading rows, then dtypes / non-null counts.
print("Original Data Shape:", df.shape)
print(df.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df.info()

In [None]:
# Data cleaning: fill missing values.
# Numeric columns get the column mean; all other columns get their most frequent value.
# NOTE(review): every numeric column is mean-imputed, including any identifier or
# label column that may be present — confirm this is intended.
num_cols = df.select_dtypes(include=np.number).columns
cat_cols = df.select_dtypes(exclude=np.number).columns

imputer_num = SimpleImputer(strategy='mean')
imputer_cat = SimpleImputer(strategy='most_frequent')

# SimpleImputer cannot be fitted on a zero-column selection, so a dataset that is
# entirely numeric (or entirely non-numeric) must skip the empty group.
if len(num_cols) > 0:
    df[num_cols] = imputer_num.fit_transform(df[num_cols])
if len(cat_cols) > 0:
    df[cat_cols] = imputer_cat.fit_transform(df[cat_cols])

In [None]:
# Drop rows that exactly repeat another row across all columns.
df = df.drop_duplicates()

In [None]:
# Outlier handling: cap each numeric column at the 1.5 * IQR fences.
# No rows are removed; values beyond a fence are pulled back onto that fence.
for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # Q1 == Q3 means both fences collapse onto a single value (this happens
        # for sparse 0/1 flags where most rows share one value). Capping would
        # overwrite every other value and turn the column into a constant, so
        # such columns are left untouched.
        continue
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df[col] = df[col].clip(lower=lower, upper=upper)

In [None]:
# Report the (rows, columns) of `df` at this point in the cleaning pipeline.
print("After Cleaning:", df.shape)

In [None]:
# Print column dtypes and non-null counts for the cleaned `df`.
df.info()

In [None]:
# Load the second (synthetic) dataset from a CSV file in the current working directory.
df2 = pd.read_csv(filepath_or_buffer='synthetic_data.csv')

In [None]:
# First look at the synthetic data: dimensions, leading rows, then dtypes / non-null counts.
print("Original Data Shape:", df2.shape)
print(df2.head())
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would add a stray "None" line.
df2.info()

In [None]:
# Set the `stroke` column to 1 for every row of the synthetic dataset
# (creates the column if it does not exist, overwrites it otherwise).
df2['stroke']=1

In [None]:
# Display the `stroke` column of `df2` as a visual check.
df2['stroke']

In [None]:
# Data cleaning: fill missing values in the synthetic dataset.
# Numeric columns get the column mean; all other columns get their most frequent value.
# Note: `num_cols` / `cat_cols` are rebound here and now describe `df2`, not `df`.
num_cols = df2.select_dtypes(include=np.number).columns
cat_cols = df2.select_dtypes(exclude=np.number).columns

imputer_num = SimpleImputer(strategy='mean')
imputer_cat = SimpleImputer(strategy='most_frequent')

# SimpleImputer cannot be fitted on a zero-column selection, so a dataset that is
# entirely numeric (or entirely non-numeric) must skip the empty group.
if len(num_cols) > 0:
    df2[num_cols] = imputer_num.fit_transform(df2[num_cols])
if len(cat_cols) > 0:
    df2[cat_cols] = imputer_cat.fit_transform(df2[cat_cols])

In [None]:
# Remove duplicate rows from the synthetic dataset.
# 'Unnamed: 0' (when present) is still in the frame at this point and is discarded
# as a non-data column later on. It is presumably a saved row counter — if so it is
# unique per row and would stop any two rows from ever comparing equal, so it is
# left out of the duplicate comparison.
dedup_cols = [name for name in df2.columns if name != 'Unnamed: 0']
df2 = df2.drop_duplicates(subset=dedup_cols)

In [None]:
# Outlier handling for the synthetic dataset: cap each numeric column at the
# 1.5 * IQR fences. No rows are removed; values beyond a fence are pulled back onto it.

# 'Unnamed: 0' is not a feature, so discard it (no-op when the column is absent)
# and refresh `num_cols` so it only lists columns that still exist in `df2`.
df2 = df2.drop(columns='Unnamed: 0', errors='ignore')
num_cols = df2.select_dtypes(include=np.number).columns

for col in num_cols:
    Q1 = df2[col].quantile(0.25)
    Q3 = df2[col].quantile(0.75)
    IQR = Q3 - Q1
    if IQR == 0:
        # Q1 == Q3 means both fences collapse onto a single value (this happens
        # for sparse 0/1 flags where most rows share one value). Capping would
        # overwrite every other value and turn the column into a constant, so
        # such columns are left untouched.
        continue
    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR
    df2[col] = df2[col].clip(lower=lower, upper=upper)

In [None]:
# Report the (rows, columns) of `df2` at this point in the cleaning pipeline.
print("After Cleaning:", df2.shape)

In [None]:
# Print column dtypes and non-null counts for the cleaned `df2`.
df2.info()

In [None]:
# Show (rows, columns) of `df` before the two datasets are combined.
df.shape

In [None]:
# Show (rows, columns) of `df2` before the two datasets are combined.
df2.shape

In [None]:
# Integration: stack the two cleaned datasets row-wise and renumber the index from 0.
# Columns present in only one of the frames are filled with NaN for the other's rows.
df1 = pd.concat((df, df2), axis=0, ignore_index=True)

print("After Integration:", df1.shape)

In [None]:
# Encode categorical columns as integer codes.
# The object-dtype columns of the combined frame are re-detected here, and the fitted
# encoder for each one is kept in `label_encoders` so codes can be mapped back later.
label_encoders = {}
cat_cols = df1.select_dtypes(include='object').columns

for col in cat_cols:
    le = LabelEncoder()
    # Cast to str first so every value in the column has one consistent type
    # (missing values become the literal string 'nan' and get their own code).
    df1[col] = le.fit_transform(df1[col].astype(str))
    label_encoders[col] = le

In [None]:
# Feature scaling (standardization: zero mean, unit variance per column).
# The `stroke` label is numeric and therefore part of `num_cols`, but it must keep
# its original values: standardizing it would replace the class labels with z-scores,
# which a later integer cast would then truncate into meaningless numbers.
# NOTE(review): `num_cols` was last derived from `df2`; numeric columns that exist
# only in `df` are not scaled — confirm that is acceptable.
scale_cols = [name for name in num_cols if name != 'stroke']

scaler = StandardScaler()
df1[scale_cols] = scaler.fit_transform(df1[scale_cols])

print("✅ After Transformation:")
print(df1.head())

In [None]:
# Display `df`.
# NOTE(review): this shows the first cleaned dataset, not the combined/transformed
# `df1` built in the preceding cells — confirm which frame was meant.
df

In [None]:
# Display the `stroke` column of the combined frame before it is cast to int.
df1['stroke']

In [None]:
# Cast the `stroke` column to integer dtype.
# NOTE(review): astype(int) truncates toward zero; if `stroke` was standardized
# upstream this does not restore the original labels — verify the column still
# holds whole-number class values before relying on this cast.
df1['stroke'] = df1['stroke'].astype(int)

In [None]:
# Display the `stroke` column again to check the result of the integer cast.
df1['stroke']

In [None]:
# Reduce the numeric features to 2 principal components.
# The `stroke` label is part of `num_cols` but is excluded from the PCA input:
# projecting it would leak the target into the component scores, which are then
# stored alongside the features.
pca_cols = [name for name in num_cols if name != 'stroke']
pca = PCA(n_components=2)
pca_features = pca.fit_transform(df1[pca_cols])

In [None]:
# Wrap the component scores in a DataFrame with one named column per component.
# The frame gets a default 0..n-1 row index.
df_pca = pd.DataFrame(data=pca_features, columns=['PCA1', 'PCA2'])

In [None]:
# Append the PCA columns to the right of the transformed dataset.
# Rows are matched on index label, so both frames must share the same row index.
df_final = pd.concat(objs=[df1, df_pca], axis='columns')

print("After PCA Dimensionality Reduction:")
print(df_final.head())

In [None]:
# Persist the final frame to CSV in the working directory, without the row index.
df_final.to_csv(path_or_buf='preprocessed_data.csv', index=False)
print("Preprocessed data saved as 'preprocessed_data.csv'")

In [None]:
# Print column dtypes and non-null counts for the final frame.
df_final.info(
)

In [None]:
# Display summary statistics for the final frame.
df_final.describe()