<a href="https://colab.research.google.com/github/Tech-pooja/CAD/blob/main/preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [77]:

#Pandas is commonly used for data manipulation and analysis, while numpy is used for numerical computations.
import pandas as pd
import numpy as np

#for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#ColumnTransformer is used to apply different transformations to different columns of the dataset.
#StandardScaler and MinMaxScaler are used for feature scaling.
#OneHotEncoder is used for converting categorical variables into numerical representation.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

#train_test_split is used to split the dataset into training and testing subsets.
#StratifiedKFold is a cross-validation method that ensures each fold has the same proportion of class labels as the whole dataset.
#cross_validate is used to perform cross-validation and evaluate the model's performance.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#for evaluating the performance of the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

# Import the warnings module and silence every warning category for the rest
# of the session. NOTE(review): this is a blanket filter, so deprecation
# notices from the libraries imported above are hidden as well.
import warnings
warnings.filterwarnings('ignore')

In [78]:
import io

# Read the CAD dataset from a fixed path (presumably the Colab runtime's
# working directory -- confirm when running elsewhere).
# Alternative kept for reference, reading from an in-memory upload mapping
# named `uploaded` that is not defined in this notebook:
#   df = pd.read_csv(io.BytesIO(uploaded['CAD.csv']))
csv_path = "/content/CADINPUT.csv"
df = pd.read_csv(csv_path)

In [79]:
# Preview the first rows of the freshly loaded frame.
df.head()

Unnamed: 0,HTN,Typical Chest Pain,Atypical,Age,Weight,BMI,BP,FBS,CR,TG,...,HDL,ESR,HB,K,WBC,Lymph,PLT,EF-TTE,Region RWMA,Cath
0,1,0,N,53,90,29.387755,110,90,0.7,250,...,30.0,7,15.6,4.7,5700,39,261,50,0,Cad
1,1,1,N,67,70,28.398718,140,80,1.0,309,...,36.0,26,13.9,4.7,7700,38,165,40,4,Cad
2,0,1,N,54,54,20.077335,100,85,1.0,103,...,45.0,10,13.5,4.7,7400,38,230,40,2,Cad
3,1,0,N,66,67,26.838648,100,78,1.2,63,...,27.0,76,12.1,4.4,13000,18,742,55,0,Normal
4,1,0,N,50,87,37.165193,110,104,1.0,170,...,50.0,27,13.2,4.0,9200,55,274,50,0,Normal


In [80]:
# Per-column missing-value summary: absolute count ("Total") and the share of
# rows that are null, expressed in percent ("Percent(%)").
# NOTE(review): `missing_data` is built here but not displayed by this cell.
null_mask = df.isnull()
null_counts = null_mask.sum()

total = null_counts.sort_values(ascending=False)
percent = (null_counts / null_mask.count()).sort_values(ascending=False)

missing_data = pd.concat(
    [total, percent * 100],
    axis=1,
    keys=["Total", "Percent(%)"],
)

In [81]:
# Report and remove fully duplicated rows (every column equal to an earlier row).
duplicates = df[df.duplicated()]
print("Total Duplicates rows observed:", duplicates.shape[0])
print("Dropping duplicates")

# Dropping duplicates
# The numbers printed below are row counts (df.shape[0]) before and after.
print("Shape before dropping duplicates: ", df.shape[0])
# Rebind instead of mutating in place so that re-running the cell is harmless
# and the frame loaded earlier is not modified behind the reader's back.
df = df.drop_duplicates()
print("Shape after dropping duplicates: ", df.shape[0])

Total Duplicates rows observed: 0
Dropping duplicates
Shape before dropping duplicates:  303
Shae after dropping duplicates:  303


In [82]:
# Column groups used throughout the notebook.
# Continuous / numeric measurements:
num_cols = [
    'Age', 'Weight', 'BMI', 'BP', 'FBS', 'CR', 'TG', 'LDL',
    'HDL', 'ESR', 'HB', 'K', 'WBC', 'Lymph', 'PLT', 'EF-TTE',
]
# Categorical flags:
cat_cols = ['HTN', 'Typical Chest Pain', 'Atypical']
# Column treated as ordinal:
ord_cols = ['Region RWMA']

# For each categorical column, print how many distinct values it has and
# what those values are.
print(f"[Unique Values in {len(cat_cols)} Categorical Variables]\n")
for cat_col in cat_cols:
    column = df[cat_col]
    summary = "* {} : {} Unique Values =>".format(cat_col, column.nunique())
    print(summary, column.unique())

[Unique Values in 3 Categorical Variables]

* HTN : 2 Unique Values => [1 0]
* Typical Chest Pain : 2 Unique Values => [0 1]
* Atypical : 2 Unique Values => ['N' 'Y']


In [83]:
# Report the frame's dimensions (rows x columns) after duplicate removal.
print(f"Dataset : {df.shape[0]} rows X {df.shape[1]} columns")

Dataset : 303 rows X 21 columns


In [84]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df[num_cols].describe()

Unnamed: 0,Age,Weight,BMI,BP,FBS,CR,TG,LDL,HDL,ESR,HB,K,WBC,Lymph,PLT,EF-TTE
count,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0,303.0
mean,58.89769,73.831683,27.248339,129.554455,119.184818,1.055611,150.343234,104.643564,40.233993,19.462046,13.153465,4.230693,7562.046205,32.39934,221.488449,47.231023
std,10.392278,11.987358,4.098865,18.938105,52.079653,0.264296,97.959451,35.396688,10.559077,15.936475,1.610452,0.458202,2413.739323,9.972592,60.796199,8.927194
min,30.0,48.0,18.115413,90.0,62.0,0.5,37.0,18.0,15.9,1.0,8.9,3.0,3700.0,7.0,25.0,15.0
25%,51.0,65.0,24.51438,120.0,88.5,0.9,90.0,80.0,33.5,9.0,12.2,3.9,5800.0,26.0,183.5,45.0
50%,58.0,74.0,26.77551,130.0,98.0,1.0,122.0,100.0,39.0,15.0,13.2,4.2,7100.0,32.0,210.0,50.0
75%,66.0,81.0,29.411765,140.0,130.0,1.2,177.0,122.0,45.5,26.0,14.2,4.5,8800.0,39.0,250.0,55.0
max,86.0,120.0,40.900658,190.0,400.0,2.2,1050.0,232.0,111.0,90.0,17.6,6.6,18000.0,60.0,742.0,60.0


In [85]:
# Peek at 7 randomly chosen 'Atypical' values before encoding.
# No random_state is passed, so the rows shown differ on every run.
df['Atypical'].sample(7)

28     N
10     N
119    N
114    N
146    N
291    Y
106    N
Name: Atypical, dtype: object

In [86]:
# Encode the 'Atypical' flag as integers: 'N' -> 0, 'Y' -> 1.
# The replacement is scoped to this single column so that an 'N' or 'Y'
# appearing in any other column is never rewritten by accident, and the
# explicit cast yields an int64 column without relying on pandas' implicit
# object-to-int downcasting inside replace(). Re-running the cell is a no-op
# because already-encoded 0/1 values match neither key.
df['Atypical'] = df['Atypical'].replace({'N': 0, 'Y': 1}).astype('int64')

df.head()

Unnamed: 0,HTN,Typical Chest Pain,Atypical,Age,Weight,BMI,BP,FBS,CR,TG,...,HDL,ESR,HB,K,WBC,Lymph,PLT,EF-TTE,Region RWMA,Cath
0,1,0,0,53,90,29.387755,110,90,0.7,250,...,30.0,7,15.6,4.7,5700,39,261,50,0,Cad
1,1,1,0,67,70,28.398718,140,80,1.0,309,...,36.0,26,13.9,4.7,7700,38,165,40,4,Cad
2,0,1,0,54,54,20.077335,100,85,1.0,103,...,45.0,10,13.5,4.7,7400,38,230,40,2,Cad
3,1,0,0,66,67,26.838648,100,78,1.2,63,...,27.0,76,12.1,4.4,13000,18,742,55,0,Normal
4,1,0,0,50,87,37.165193,110,104,1.0,170,...,50.0,27,13.2,4.0,9200,55,274,50,0,Normal


In [87]:
# Peek at 7 randomly chosen 'Atypical' values after encoding to confirm the
# column now holds integers. Unseeded, so the rows differ on every run.
df['Atypical'].sample(7)

257    1
172    0
33     0
255    0
43     1
277    0
272    0
Name: Atypical, dtype: int64

In [88]:
# Target encoding: 'Cad' is the positive class (1), 'Normal' the negative (0).
map_label = {"Cad": 1, "Normal": 0}

# y is the encoded 'Cath' column; X is every remaining column.
y = df["Cath"].map(map_label)
X = df.drop(columns="Cath")

In [89]:
# Numerical variables:
# num_cols = ['Age', 'Weight', 'BMI', 'BP', 'FBS', 'CR', 'TG', 'LDL', 'HDL', 'ESR', 'HB', 'K', 'WBC', 'Lymph', 'PLT','EF-TTE']

# # Categorical variables:
# cat_cols = ['HTN', 'Typical Chest Pain', 'Atypical','Cath']
# cat_cols.remove('Cath')

# # Ordinal variables
# ord_cols = ['Region RWMA']

In [90]:
# Explicit column order for the training frame: the three categorical flags,
# the numeric measurements, 'Region RWMA', and finally the 'Cath' target.
selected_columns = [
    'HTN', 'Typical Chest Pain', 'Atypical',
    'Age', 'Weight', 'BMI', 'BP', 'FBS', 'CR', 'TG', 'LDL',
    'HDL', 'ESR', 'HB', 'K', 'WBC', 'Lymph', 'PLT', 'EF-TTE',
    'Region RWMA',
    'Cath',
]

# df2 is df restricted to (and ordered by) the columns listed above.
df2 = df[selected_columns]

# NOTE(review): y_train holds the raw 'Cath' strings taken from df2, not the
# 0/1 encoding produced via map_label earlier -- confirm this is intended.
y_train = df2['Cath']
x_train = df2.drop(columns=["Cath"])

In [91]:
# Learn per-column scaling statistics from x_train, then apply them to the
# same data. The scaler is fitted on every column of x_train as-is, i.e. no
# column is excluded from standardisation.
scaler = StandardScaler().fit(x_train)
x_train_scaled = scaler.transform(x_train)

In [92]:
# print(df2.head())


In [93]:
# preprocessor = ColumnTransformer(transformers = [('OHE', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first', dtype=np.int64), cat_cols),
#                                                  ('Scaler', StandardScaler(), num_cols)],
#                                  remainder = 'passthrough',verbose_feature_names_out=False).set_output(transform = 'pandas')
# X_prep = preprocessor.fit_transform(X)

In [94]:
# X_prep.head()

In [95]:
# X_prep.columns

In [96]:
import pickle

# Persist the fitted StandardScaler (the `scaler` object, not a
# ColumnTransformer) to 'standard_scaler1.pickle' in the current working
# directory, using the highest pickle protocol available to this interpreter.
with open('standard_scaler1.pickle', 'wb') as handle:
    pickler = pickle.Pickler(handle, protocol=pickle.HIGHEST_PROTOCOL)
    pickler.dump(scaler)