In [2]:
import pandas as pd
data = pd.read_csv("cardio_base.csv")

In [3]:
data.head()

Unnamed: 0,id,age,gender,height,weight,ap_hi,ap_lo,cholesterol,smoke
0,0,18393,2,168,62.0,110,80,1,0
1,1,20228,1,156,85.0,140,90,3,0
2,2,18857,1,165,64.0,130,70,3,0
3,3,17623,2,169,82.0,150,100,1,0
4,4,17474,1,156,56.0,100,60,1,0


In [4]:
# Convert age from days to years
data['age_years'] = data['age'] / 365.25

# Drop the original 'age' column and 'id' as they are not needed for modeling
data.drop(['age', 'id'], axis=1, inplace=True)

# Check for missing values
missing_values = data.isnull().sum()

# Normalize the data (excluding 'cholesterol' as it's our target variable)
from sklearn.preprocessing import MinMaxScaler

features = data.drop('cholesterol', axis=1)
scaler = MinMaxScaler()
features_scaled = pd.DataFrame(scaler.fit_transform(features), columns=features.columns)

# Prepare the final dataset for KNN
final_data = features_scaled
final_data['cholesterol'] = data['cholesterol']

missing_values, final_data.head()


(gender         0
 height         0
 weight         0
 ap_hi          0
 ap_lo          0
 cholesterol    0
 smoke          0
 age_years      0
 dtype: int64,
    gender    height    weight     ap_hi     ap_lo  smoke  age_years  \
 0     1.0  0.579487  0.273684  0.016079  0.013550    0.0   0.588076   
 1     0.0  0.517949  0.394737  0.017934  0.014453    0.0   0.730159   
 2     0.0  0.564103  0.284211  0.017316  0.012647    0.0   0.624003   
 3     1.0  0.584615  0.378947  0.018553  0.015357    0.0   0.528455   
 4     0.0  0.517949  0.242105  0.015461  0.011743    0.0   0.516918   
 
    cholesterol  
 0            1  
 1            3  
 2            3  
 3            1  
 4            1  )

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score

# Split the data into training and testing sets
X = final_data.drop('cholesterol', axis=1)
y = final_data['cholesterol']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train a KNN model
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Predict cholesterol levels on the test set
y_pred = knn.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
classification_reports = classification_report(y_test, y_pred,output_dict=True)

accuracy


0.7214285714285714

In [6]:
# ! pip install tabulate
from tabulate import tabulate

# Convert the classification report dictionary to a DataFrame
report_df = pd.DataFrame(classification_reports).transpose()

# Print the formatted classification report
print(tabulate(report_df, headers='keys', tablefmt='psql', showindex=True, floatfmt=".2f"))

+--------------+-------------+----------+------------+-----------+
|              |   precision |   recall |   f1-score |   support |
|--------------+-------------+----------+------------+-----------|
| 1            |        0.75 |     0.95 |       0.84 |  10475.00 |
| 2            |        0.15 |     0.04 |       0.06 |   1881.00 |
| 3            |        0.22 |     0.04 |       0.07 |   1644.00 |
| accuracy     |        0.72 |     0.72 |       0.72 |      0.72 |
| macro avg    |        0.37 |     0.34 |       0.32 |  14000.00 |
| weighted avg |        0.61 |     0.72 |       0.65 |  14000.00 |
+--------------+-------------+----------+------------+-----------+


In [7]:
# ! pip install imbalanced-learn

from imblearn.over_sampling import SMOTE

# Assuming X and y are your features and target variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)


In [8]:
# from sklearn.neighbors import KNeighborsClassifier
# from sklearn.metrics import classification_report, accuracy_score

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_smote, y_train_smote)

# Make predictions and evaluate the model
y_pred = knn.predict(X_test)
print(accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

0.4599285714285714
              precision    recall  f1-score   support

           1       0.79      0.51      0.62     10475
           2       0.16      0.31      0.21      1881
           3       0.15      0.32      0.21      1644

    accuracy                           0.46     14000
   macro avg       0.36      0.38      0.34     14000
weighted avg       0.63      0.46      0.51     14000



In [9]:
class_counts = y_train.value_counts()

In [10]:
class_counts

cholesterol
1    41910
2     7668
3     6422
Name: count, dtype: int64

In [11]:
from sklearn.utils import resample

# Separate the majority and minority classes
df_majority = data[data.cholesterol==1]
df_minority_2 = data[data.cholesterol==2]
df_minority_3 = data[data.cholesterol==3]

# Upsample minority classes
df_minority_upsampled_2 = resample(df_minority_2, 
                                   replace=True,     # sample with replacement
                                   n_samples=df_majority.shape[0],    # to match majority class
                                   random_state=123) # reproducible results

df_minority_upsampled_3 = resample(df_minority_3, 
                                   replace=True,     # sample with replacement
                                   n_samples=df_majority.shape[0],    # to match majority class
                                   random_state=123) # reproducible results

# Combine majority class with upsampled minority classes
df_upsampled = pd.concat([df_majority, df_minority_upsampled_2, df_minority_upsampled_3])

# Undersample the majority class slightly to avoid a sudden jump in class sizes
df_majority_downsampled = resample(df_majority, 
                                   replace=False,    # sample without replacement
                                   n_samples=int(df_majority.shape[0]*0.8),     # to decrease majority class size
                                   random_state=123) # reproducible results

# Combine the downsampled majority class with upsampled minority classes again
df_balanced = pd.concat([df_majority_downsampled, df_minority_upsampled_2, df_minority_upsampled_3])

# New class distribution
balanced_class_distribution = df_balanced.cholesterol.value_counts(normalize=True)

# Prepare the features and target variable for the balanced dataset
X_balanced = df_balanced.drop('cholesterol', axis=1)
y_balanced = df_balanced['cholesterol']

# Split the balanced data into new training and testing sets
X_train_balanced, X_test_balanced, y_train_balanced, y_test_balanced = train_test_split(
    X_balanced, y_balanced, test_size=0.2, random_state=42)

balanced_class_distribution



cholesterol
2    0.357143
3    0.357143
1    0.285714
Name: proportion, dtype: float64

In [12]:
# Retrain the KNN model with the balanced dataset
knn_balanced = KNeighborsClassifier(n_neighbors=5)
knn_balanced.fit(X_train_balanced, y_train_balanced)

# Predict cholesterol levels on the balanced test set
y_pred_balanced = knn_balanced.predict(X_test_balanced)

# Evaluate the retrained model
accuracy_balanced = accuracy_score(y_test_balanced, y_pred_balanced)
classification_report_balanced = classification_report(y_test_balanced, y_pred_balanced,output_dict=True)

accuracy_balanced


0.7727365694027816

In [13]:
from tabulate import tabulate

# Convert the classification report dictionary to a DataFrame
report_df_balanced = pd.DataFrame(classification_report_balanced).transpose()

# Print the formatted classification report
print(tabulate(report_df_balanced, headers='keys', tablefmt='psql', showindex=True, floatfmt=".2f"))

+--------------+-------------+----------+------------+-----------+
|              |   precision |   recall |   f1-score |   support |
|--------------+-------------+----------+------------+-----------|
| 1            |        0.78 |     0.41 |       0.54 |   8405.00 |
| 2            |        0.75 |     0.90 |       0.82 |  10413.00 |
| 3            |        0.79 |     0.93 |       0.86 |  10518.00 |
| accuracy     |        0.77 |     0.77 |       0.77 |      0.77 |
| macro avg    |        0.78 |     0.75 |       0.74 |  29336.00 |
| weighted avg |        0.77 |     0.77 |       0.75 |  29336.00 |
+--------------+-------------+----------+------------+-----------+


**Instance-based learning**, specifically the k-Nearest Neighbors (KNN) algorithm, is known as a "lazy learning" approach because it does not build a model or learn a function from the training data during the training phase. Instead, it stores the entire training dataset and performs the learning or prediction process when a new instance needs to be classified or predicted.

The key aspects of instance-based learning and KNN can be summarized as follows:

* **No explicit model building**: During the training phase, the algorithm simply stores the training examples (feature vectors and their corresponding labels or target values) in memory. No model or function is learned from the data at this stage.

* **Similarity-based prediction**: When a new instance needs to be classified or predicted, the algorithm finds the k most similar instances from the training data to the new instance. Similarity is typically measured using distance metrics like Euclidean distance, Manhattan distance, or cosine similarity, depending on the nature of the data.

**d(P, Q) = √[Σ(pi - qi)^2]**


* **Nearest neighbor voting/averaging**: For classification tasks, the algorithm assigns the class label based on the majority vote among the k nearest neighbors. For regression tasks, the predicted value is the average or median of the target values of the k nearest neighbors.

* **Lazy learning**: The learning process is deferred until a new instance needs to be predicted, making it a "lazy" approach. The algorithm does not build a model or learn a function until it encounters a new instance.

* **Dimensionality and distance metrics**: The performance of KNN can be affected by the dimensionality of the feature space and the choice of distance metric. As the number of features increases, the distance calculations become less reliable, and the algorithm may need to be adapted or combined with dimensionality reduction techniques.

* **Memory and computational complexity**: KNN requires storing the entire training dataset, which can be memory-intensive for large datasets. Additionally, the prediction process involves computing distances between the new instance and all training instances, which can be computationally expensive, especially for large datasets or high-dimensional feature spaces.

* **Parameter tuning**: The choice of the value of k (the number of nearest neighbors) and the distance metric can significantly impact the performance of the KNN algorithm. Cross-validation or grid search techniques are often used to select the optimal value of k and the appropriate distance metric for a given dataset.


In summary, instance-based learning and KNN are unique in their approach of deferring the learning process until prediction time, relying solely on the similarity between the new instance and the stored training examples. This lazy learning approach has advantages in terms of simplicity and adaptability but also faces challenges related to dimensionality, memory requirements, and computational complexity, which need to be addressed through appropriate techniques and parameter tuning.

In the k-Nearest Neighbors (KNN) algorithm, the choice of k and the weighting of attributes can have a significant impact on the performance of the model. Here are some general guidelines and considerations:


**Equal Weighting of Attributes**:
* Equal weighting of attributes is appropriate when there is no prior knowledge or reason to believe that some features are more important than others in determining the target variable.
* It is a simple and straightforward approach that treats all features equally.
* However, if there is prior knowledge or domain expertise that suggests certain features are more relevant or discriminative, it may be better to assign higher weights to those features.

**Choice of k**:

**Larger k**:

* A larger value of k tends to reduce the noise in the classification or regression, making the model more stable and less sensitive to outliers.

* However, a larger k may also lead to smoother decision boundaries, potentially missing fine details or local patterns in the data.
* A larger k is generally preferred when the data is noisy or when there is a higher risk of overfitting with smaller values of k.

**k = 1 (Nearest Neighbor)**:

* Setting k = 1 means that the prediction is based on the single nearest neighbor, making the model highly sensitive to individual data points and potential outliers.
* k = 1 can lead to overfitting, especially in cases where the training data is noisy or has overlapping class distributions.
* However, k = 1 can be useful when the decision boundaries are highly complex or irregular, and a higher k might over-smooth the boundaries.

In general, the choice of k and attribute weighting should be guided by the characteristics of the dataset, prior knowledge, and the trade-off between bias and variance. Here are some common practices:

* Cross-validation: Use techniques like k-fold cross-validation or leave-one-out cross-validation to evaluate different values of k and attribute weightings. The combination that yields the best performance on the validation set can be chosen for the final model.
* Domain knowledge: If there is prior knowledge or domain expertise that suggests certain features are more important, assign higher weights to those features.
* Feature scaling: Scale the features to a common range, especially if the features have different units or scales, to prevent some features from dominating the distance calculations.
* Feature selection: Perform feature selection techniques to identify and remove redundant or irrelevant features, which can improve the performance of KNN and reduce the curse of dimensionality.

It's important to note that KNN is a **non-parametric algorithm**

**Weighted Example**

One way to assign different weights to features in KNN is by using a weighted distance metric. Instead of using the standard Euclidean distance, which treats all features equally, you can introduce weights to the distance calculation. Here's an example:

Suppose you have a dataset with three features: age, income, and credit score, and you want to predict whether a person will default on a loan or not. Based on domain knowledge or prior analysis, you know that credit score is a more important factor in determining loan default risk compared to age or income.

You can assign a higher weight to the credit score feature by modifying the distance calculation as follows:

weighted_distance = sqrt((weight_age * (age1 - age2)^2) + (weight_income * (income1 - income2)^2) + (weight_credit_score * (credit_score1 - credit_score2)^2))


you can set weight_credit_score to a higher value, such as 3, while keeping the weights for age and income at 1. This way, the credit score feature will have a greater influence on the distance calculation and, consequently, on the predictions made by the KNN algorithm.


In the k-Nearest Neighbors (KNN) algorithm, the distance between the new data point (whose label or target value needs to be predicted) and the existing data points in the training set is calculated.