# Data PreProcessing

### *Import Libraries

In [1]:
# define dataset
import pandas as pd

#missing values define
import numpy as np

#define data vizulisation
import matplotlib.pyplot as plt
import seaborn as sns

#define model building
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

### *Importing DATA

In [4]:
df = pd.read_csv('./Data/data.csv')

### *EDA

In [7]:
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132,0,2,185,0,0.0,?,?,?,0
1,29,1,2,120,243,0,0,160,0,0.0,?,?,?,0
2,29,1,2,140,?,0,0,170,0,0.0,?,?,?,0
3,30,0,1,170,237,0,1,170,0,0.0,?,?,6,0
4,31,0,2,100,219,0,1,150,0,0.0,?,?,?,0


### *Handling Missing Values

In [10]:
# Replace "?" with np.nan
df.replace("?", np.nan, inplace=True)
df.head()

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,num
0,28,1,2,130,132.0,0,2,185,0,0.0,,,,0
1,29,1,2,120,243.0,0,0,160,0,0.0,,,,0
2,29,1,2,140,,0,0,170,0,0.0,,,,0
3,30,0,1,170,237.0,0,1,170,0,0.0,,,6.0,0
4,31,0,2,100,219.0,0,1,150,0,0.0,,,,0


### *Feature Engineering

In [13]:
# Separate target column before imputing
target_column = 'num       '
y = df[target_column]
X = df.drop(columns=[target_column])

In [15]:
df[target_column] =df[target_column].astype('object')

In [17]:
df.describe(include='object')

Unnamed: 0,trestbps,chol,fbs,restecg,thalach,exang,slope,ca,thal,num
count,293,271,286,293,293,293,104,3,28,294
unique,31,153,2,3,71,2,3,1,3,2
top,120,246,0,0,150,0,2,0,7,0
freq,65,5,266,235,29,204,91,3,11,188


In [47]:
# Handle missing values
imputer = SimpleImputer(strategy='mean')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)
print(X_imputed)

      age  sex   cp  trestbps        chol  fbs  restecg  thalach  exang  \
0    28.0  1.0  2.0     130.0  132.000000  0.0      2.0    185.0    0.0   
1    29.0  1.0  2.0     120.0  243.000000  0.0      0.0    160.0    0.0   
2    29.0  1.0  2.0     140.0  250.848708  0.0      0.0    170.0    0.0   
3    30.0  0.0  1.0     170.0  237.000000  0.0      1.0    170.0    0.0   
4    31.0  0.0  2.0     100.0  219.000000  0.0      1.0    150.0    0.0   
..    ...  ...  ...       ...         ...  ...      ...      ...    ...   
289  52.0  1.0  4.0     160.0  331.000000  0.0      0.0     94.0    1.0   
290  54.0  0.0  3.0     130.0  294.000000  0.0      1.0    100.0    1.0   
291  56.0  1.0  4.0     155.0  342.000000  1.0      0.0    150.0    1.0   
292  58.0  0.0  2.0     180.0  393.000000  0.0      0.0    110.0    1.0   
293  65.0  1.0  4.0     130.0  275.000000  0.0      1.0    115.0    1.0   

     oldpeak     slope   ca      thal  
0        0.0  1.894231  0.0  5.642857  
1        0.0  1.894

### *Handling Outliers

In [24]:
# # Box plot
# sns.boxplot(data=X_imputed)
# plt.show()

In [26]:
# Handling Outliers

# # Calculate Q1 (25th percentile) and Q3 (75th percentile)
# Q1 = X_imputed[['age',	'sex', 'cp', 'trestbps', 'chol', 'fbs',	'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']].quantile(0.25)
# Q3 = X_imputed[['age',	'sex', 'cp', 'trestbps', 'chol', 'fbs',	'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']].quantile(0.75)
# IQR = Q3 - Q1

# # Determine outlier bounds
# lower_bound = Q1 - 1.5 * IQR
# upper_bound = Q3 + 1.5 * IQR

# # Identify outliers
# outliers = ((X_imputed[['age',	'sex', 'cp', 'trestbps', 'chol', 'fbs',	'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']] < lower_bound) |
#             (X_imputed[['age',	'sex', 'cp', 'trestbps', 'chol', 'fbs',	'restecg', 'thalach', 'exang', 'oldpeak', 'slope', 'ca', 'thal']] > upper_bound)).any(axis=1)

# # Remove outliers
# df_cleaned = X_imputed[~outliers]

# # Output the cleaned DataFrame
# print("Original DataFrame shape:", df.shape)
# print("Cleaned DataFrame shape:", df_cleaned.shape)
# print(df_cleaned)

### *Data Vizulisation

In [29]:
# # Box plot
# sns.boxplot(data=df_cleaned)
# plt.show()

# Model building

### *Train Test Split

In [49]:
x_train,x_test,y_train,y_test=train_test_split(X_imputed,y,test_size=0.2,random_state=42)

In [51]:
x_train.shape

(235, 13)

### *Model Initialization

In [54]:
model=KNeighborsClassifier(n_neighbors=3)

### *Model Training

In [57]:
model.fit(x_train,y_train)

### *Hyper Parameter tuning

# evaluation & Result:

In [61]:
# Predict and evaluate the model
y_pred = model.predict(x_test)
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.6610169491525424
Confusion Matrix:
[[30  8]
 [12  9]]
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.79      0.75        38
           1       0.53      0.43      0.47        21

    accuracy                           0.66        59
   macro avg       0.62      0.61      0.61        59
weighted avg       0.65      0.66      0.65        59



## Hyper Parameter Tuning

In [64]:
tuned_model = KNeighborsClassifier()

In [66]:
params_grid={
    'n_neighbors': [i for i in np.arange(0,2,1)],
    'algorithm' : ['ball_tree', 'kd_tree'],
    'weights' : ['uniform', 'distance'],
}

In [68]:
params_grid_cv = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=params_grid)

In [70]:
params_grid_cv.fit(x_train, y_train)

20 fits failed out of a total of 40.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\neighbors\_classification.py", line 213, in fit
    self._validate_params()
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\base.py", line 600, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\hp\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py", line 97, in validate_parameter_constraints
    raise InvalidParameterE

In [72]:
params_grid

{'n_neighbors': [0, 1],
 'algorithm': ['ball_tree', 'kd_tree'],
 'weights': ['uniform', 'distance']}

In [73]:
params_grid_cv.best_params_

{'algorithm': 'ball_tree', 'n_neighbors': 1, 'weights': 'uniform'}

In [74]:
tuned_model_KNN = KNeighborsClassifier(algorithm= 'ball_tree', n_neighbors = 1, weights= 'uniform')

In [75]:
tuned_model_KNN.fit(x_train,y_train)

In [77]:
# Predict and evaluate the model
y_pred = tuned_model_KNN.predict(x_test)

In [82]:
accuracy = accuracy_score(y_test, y_pred)
conf_matrix = confusion_matrix(y_test, y_pred)
class_report = classification_report(y_test, y_pred)

print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print("Classification Report:")
print(class_report)

Accuracy: 0.6779661016949152
Confusion Matrix:
[[29  9]
 [10 11]]
Classification Report:
              precision    recall  f1-score   support

           0       0.74      0.76      0.75        38
           1       0.55      0.52      0.54        21

    accuracy                           0.68        59
   macro avg       0.65      0.64      0.64        59
weighted avg       0.67      0.68      0.68        59

