<a href="https://colab.research.google.com/github/Tech-pooja/CAD/blob/main/Lasso_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [27]:
# introduction

# background info

# motivation - computational cost
# objective

# metgods and models - working flowchart

# 20-30 pages


#Pandas is commonly used for data manipulation and analysis, while numpy is used for numerical computations.
import pandas as pd
import numpy as np

#for data visualization
import matplotlib.pyplot as plt
import seaborn as sns

#ColumnTransformer is used to apply different transformations to different columns of the dataset.
#StandardScaler and MinMaxScaler are used for feature scaling.
#OneHotEncoder is used for converting categorical variables into numerical representation.
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

#train_test_split is used to split the dataset into training and testing subsets.
#StratifiedKFold is a cross-validation method that ensures each fold has the same proportion of class labels as the whole dataset.
#cross_validate is used to perform cross-validation and evaluate the model's performance.
from sklearn.model_selection import train_test_split
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_validate

from sklearn.ensemble import RandomForestClassifier
# from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC

#for evaluating the performance of the model
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.metrics import classification_report

#These lines import the warnings module and suppress all warnings that may occur during the execution of the code.
import warnings
warnings.filterwarnings('ignore')

In [28]:
import io
#df = pd.read_csv(io.BytesIO(uploaded['CAD.csv']))
df = pd.read_csv("/content/CAD.csv")

In [29]:
total = df.isnull().sum().sort_values(ascending=False)
percent = (df.isnull().sum() / df.isnull().count()).sort_values(ascending=False)
missing_data = pd.concat([total, percent*100], axis=1, keys=["Total", "Percent(%)"])
#missing_data

In [30]:
duplicates = df[df.duplicated()]
print("Total Duplicates rows observed:", duplicates.shape[0])
print("Dropping duplicates")

# Dropping duplicates
print("Shape before dropping duplicates: ", df.shape[0])
df.drop_duplicates(inplace=True)
print("Shae after dropping duplicates: ", df.shape[0])

Total Duplicates rows observed: 0
Dropping duplicates
Shape before dropping duplicates:  303
Shae after dropping duplicates:  303


In [31]:
# Check for unique values

# Numerical variables:
num_cols = ['Age','Weight', 'Length','BMI', 'BP', 'PR', 'FBS', 'CR', 'TG', 'LDL', 'HDL', 'BUN', 'ESR', 'HB', 'K', 'Na', 'WBC', 'Lymph', 'Neut', 'PLT', 'EF-TTE']

# Categorical variables:
cat_cols = ['Sex', 'DM', 'HTN', 'Current Smoker', 'EX-Smoker', 'FH', 'Obesity', 'CRF', 'CVA', 'Airway disease', 'Thyroid Disease', 'CHF', 'DLP', 'Edema', 'Weak Peripheral Pulse', 'Lung rales', 'Systolic Murmur', 'Diastolic Murmur', 'Typical Chest Pain', 'Dyspnea', 'Atypical', 'Nonanginal', 'Exertional CP', 'LowTH Ang', 'Q Wave', 'St Elevation', 'St Depression', 'Tinversion', 'LVH', 'Poor R Progression', 'Cath']

# Ordinal variables
ord_cols = ['Function Class', "Region RWMA", "VHD"]

print(f"[Unique Values in {len(cat_cols)} Categorical Variables]\n")

for cat_col in cat_cols:
    print("* {} : {} Unique Values =>".format(cat_col, df[cat_col].nunique()), df[cat_col].unique())

[Unique Values in 31 Categorical Variables]

* Sex : 2 Unique Values => ['Male' 'Fmale']
* DM : 2 Unique Values => [0 1]
* HTN : 2 Unique Values => [1 0]
* Current Smoker : 2 Unique Values => [1 0]
* EX-Smoker : 2 Unique Values => [0 1]
* FH : 2 Unique Values => [0 1]
* Obesity : 2 Unique Values => ['Y' 'N']
* CRF : 2 Unique Values => ['N' 'Y']
* CVA : 2 Unique Values => ['N' 'Y']
* Airway disease : 2 Unique Values => ['N' 'Y']
* Thyroid Disease : 2 Unique Values => ['N' 'Y']
* CHF : 2 Unique Values => ['N' 'Y']
* DLP : 2 Unique Values => ['Y' 'N']
* Edema : 2 Unique Values => [0 1]
* Weak Peripheral Pulse : 2 Unique Values => ['N' 'Y']
* Lung rales : 2 Unique Values => ['N' 'Y']
* Systolic Murmur : 2 Unique Values => ['N' 'Y']
* Diastolic Murmur : 2 Unique Values => ['N' 'Y']
* Typical Chest Pain : 2 Unique Values => [0 1]
* Dyspnea : 2 Unique Values => ['N' 'Y']
* Atypical : 2 Unique Values => ['N' 'Y']
* Nonanginal : 2 Unique Values => ['N' 'Y']
* Exertional CP : 1 Unique Values => 

In [32]:
# Data shape

print(f"Dataset : {df.shape[0]} rows X {df.shape[1]} columns")

Dataset : 303 rows X 55 columns


In [33]:
#Distribution of continuous features are not uniform, and has a certain skewness. Especially, feature TG (max value = 1050),
#FBS (max value = 400), and HDL (max value = 111) and other features has some outliers.
#We will check outliers by plotting boxplot in the next part.

vhd = {"N": 0, "mild": 1, "Moderate": 2, "Severe": 3}
sex = {"Male": "Male", "Fmale": "Female"}

df['VHD'] = df['VHD'].map(vhd)
df['Sex'] = df['Sex'].map(sex)

df.replace('N', 0, inplace=True)
df.replace('Y', 1, inplace=True)

df.head()

Unnamed: 0,Age,Weight,Length,Sex,BMI,DM,HTN,Current Smoker,EX-Smoker,FH,...,K,Na,WBC,Lymph,Neut,PLT,EF-TTE,Region RWMA,VHD,Cath
0,53,90,175,Male,29.387755,0,1,1,0,0,...,4.7,141,5700,39,52,261,50,0,0,Cad
1,67,70,157,Female,28.398718,0,1,0,0,0,...,4.7,156,7700,38,55,165,40,4,0,Cad
2,54,54,164,Male,20.077335,0,0,1,0,0,...,4.7,139,7400,38,60,230,40,2,1,Cad
3,66,67,158,Female,26.838648,0,1,0,0,0,...,4.4,142,13000,18,72,742,55,0,3,Normal
4,50,87,153,Female,37.165193,0,1,0,0,0,...,4.0,140,9200,55,39,274,50,0,3,Normal


In [34]:
X = df.drop("Cath", axis=1)
y = df['Cath']

map_label = {"Cad":1, "Normal":0}
y = y.map(map_label)

In [35]:
# Numerical variables:
num_cols = ['Age','Weight', 'Length','BMI', 'BP', 'PR', 'FBS', 'CR', 'TG', 'LDL', 'HDL', 'BUN', 'ESR', 'HB', 'K', 'Na', 'WBC', 'Lymph', 'Neut', 'PLT', 'EF-TTE']

# Categorical variables:
cat_cols = ['Sex', 'DM', 'HTN', 'Current Smoker', 'EX-Smoker', 'FH', 'Obesity', 'CRF', 'CVA', 'Airway disease', 'Thyroid Disease', 'CHF', 'DLP', 'Edema', 'Weak Peripheral Pulse', 'Lung rales', 'Systolic Murmur', 'Diastolic Murmur', 'Typical Chest Pain', 'Dyspnea', 'Atypical', 'Nonanginal', 'Exertional CP', 'LowTH Ang', 'Q Wave', 'St Elevation', 'St Depression', 'Tinversion', 'LVH', 'Poor R Progression', 'Cath']
cat_cols.remove('Cath')

# Ordinal variables
ord_cols = ['Function Class', "Region RWMA", "VHD"]

In [36]:
preprocessor = ColumnTransformer(transformers = [('OHE', OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='first', dtype=np.int64), cat_cols),
                                                 ('Scaler', StandardScaler(), num_cols)],
                                 remainder = 'passthrough',verbose_feature_names_out=False).set_output(transform = 'pandas')
X_prep = preprocessor.fit_transform(X)

In [37]:
# X_prep_df
X_prep.columns
# len(X_prep.columns)

Index(['Sex_Male', 'DM_1', 'HTN_1', 'Current Smoker_1', 'EX-Smoker_1', 'FH_1',
       'Obesity_1', 'CRF_1', 'CVA_1', 'Airway disease_1', 'Thyroid Disease_1',
       'CHF_1', 'DLP_1', 'Edema_1', 'Weak Peripheral Pulse_1', 'Lung rales_1',
       'Systolic Murmur_1', 'Diastolic Murmur_1', 'Typical Chest Pain_1',
       'Dyspnea_1', 'Atypical_1', 'Nonanginal_1', 'LowTH Ang_1', 'Q Wave_1',
       'St Elevation_1', 'St Depression_1', 'Tinversion_1', 'LVH_1',
       'Poor R Progression_1', 'Age', 'Weight', 'Length', 'BMI', 'BP', 'PR',
       'FBS', 'CR', 'TG', 'LDL', 'HDL', 'BUN', 'ESR', 'HB', 'K', 'Na', 'WBC',
       'Lymph', 'Neut', 'PLT', 'EF-TTE', 'Function Class', 'Region RWMA',
       'VHD'],
      dtype='object')

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X_prep, y, test_size=0.1, stratify=y,random_state = 1)

print("Training Data Shape : ", X_train.shape, y_train.shape)
print("Test Data Shape : ", X_test.shape, y_test.shape)

#X_train.columns

Training Data Shape :  (272, 53) (272,)
Test Data Shape :  (31, 53) (31,)


In [39]:
svm_classifier = SVC(kernel='linear', C = 0.1, random_state = 1)
svm_classifier.fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions)
svm_recall = recall_score(y_test, svm_predictions)
svm_conf_matrix = confusion_matrix(y_test, svm_predictions)

print("SVM Accuracy:", svm_accuracy)
print("SVM Precision:", svm_precision)
print("SVM Recall:", svm_recall)
print("SVM Confusion Matrix:")
print(svm_conf_matrix)

SVM Accuracy: 0.9032258064516129
SVM Precision: 0.88
SVM Recall: 1.0
SVM Confusion Matrix:
[[ 6  3]
 [ 0 22]]


In [40]:
knn_classifier = KNeighborsClassifier(n_neighbors = 10)
knn_classifier.fit(X_train, y_train)
knn_predictions = knn_classifier.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_precision = precision_score(y_test, knn_predictions)
knn_recall = recall_score(y_test, knn_predictions)
knn_conf_matrix = confusion_matrix(y_test, knn_predictions)
print("KNN Accuracy:", knn_accuracy)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)
print("KNN Confusion Matrix:")
print(knn_conf_matrix)

KNN Accuracy: 0.9032258064516129
KNN Precision: 0.88
KNN Recall: 1.0
KNN Confusion Matrix:
[[ 6  3]
 [ 0 22]]


In [41]:
rf_classifier = RandomForestClassifier(n_estimators = 10, random_state = 1)
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_conf_matrix = confusion_matrix(y_test, rf_predictions)

print("RF Accuracy:", rf_accuracy)
print("RF Precision:", rf_precision)
print("RF Recall:", rf_recall)
print("RF Confusion Matrix:")
print(rf_conf_matrix)

RF Accuracy: 0.9032258064516129
RF Precision: 0.88
RF Recall: 1.0
RF Confusion Matrix:
[[ 6  3]
 [ 0 22]]


In [42]:
from sklearn.linear_model import Lasso
from sklearn.feature_selection import SelectFromModel

# Initialize the Lasso Regression model
lasso = Lasso(alpha=0.1)  # You can adjust the alpha parameter for regularization strength

# Fit Lasso Regression on the training data
lasso.fit(X_train, y_train)

# Select features based on the coefficients obtained from Lasso
model = SelectFromModel(lasso, prefit=True, threshold=-np.inf, max_features=15)

# Transform the training data to select the important features
X_train_selected = model.transform(X_train)

# Get the selected feature indices
selected_feature_indices = model.get_support(indices=True)

# Get the names of selected features
selected_features = X_train.columns[selected_feature_indices].tolist()

# Print the selected features
selected_features

['Sex_Male',
 'DM_1',
 'HTN_1',
 'Current Smoker_1',
 'EX-Smoker_1',
 'FH_1',
 'Obesity_1',
 'CRF_1',
 'CVA_1',
 'Airway disease_1',
 'Thyroid Disease_1',
 'CHF_1',
 'Typical Chest Pain_1',
 'Age',
 'Region RWMA']

In [43]:
X_new = X_prep[selected_features]
X_train, X_test, y_train, y_test = train_test_split(X_new, y, test_size=0.1, stratify=y,random_state = 1)

In [44]:
svm_classifier = SVC(kernel='linear', C = 0.1, random_state = 1)
svm_classifier.fit(X_train, y_train)
svm_predictions = svm_classifier.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_predictions)
svm_precision = precision_score(y_test, svm_predictions)
svm_recall = recall_score(y_test, svm_predictions)
svm_conf_matrix = confusion_matrix(y_test, svm_predictions)

print("SVM Accuracy:", svm_accuracy)
print("SVM Precision:", svm_precision)
print("SVM Recall:", svm_recall)
print("SVM Confusion Matrix:")
print(svm_conf_matrix)

SVM Accuracy: 0.9032258064516129
SVM Precision: 0.88
SVM Recall: 1.0
SVM Confusion Matrix:
[[ 6  3]
 [ 0 22]]


In [45]:
knn_classifier = KNeighborsClassifier(n_neighbors = 10)
knn_classifier.fit(X_train, y_train)
knn_predictions = knn_classifier.predict(X_test)
knn_accuracy = accuracy_score(y_test, knn_predictions)
knn_precision = precision_score(y_test, knn_predictions)
knn_recall = recall_score(y_test, knn_predictions)
knn_conf_matrix = confusion_matrix(y_test, knn_predictions)
print("KNN Accuracy:", knn_accuracy)
print("KNN Precision:", knn_precision)
print("KNN Recall:", knn_recall)
print("KNN Confusion Matrix:")
print(knn_conf_matrix)


KNN Accuracy: 0.9354838709677419
KNN Precision: 0.9166666666666666
KNN Recall: 1.0
KNN Confusion Matrix:
[[ 7  2]
 [ 0 22]]


In [47]:
rf_classifier = RandomForestClassifier(n_estimators = 10, random_state = 1)
rf_classifier.fit(X_train, y_train)
rf_predictions = rf_classifier.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_predictions)
rf_precision = precision_score(y_test, rf_predictions)
rf_recall = recall_score(y_test, rf_predictions)
rf_conf_matrix = confusion_matrix(y_test, rf_predictions)

print("RF Accuracy:", rf_accuracy)
print("RF Precision:", rf_precision)
print("RF Recall:", rf_recall)
print("RF Confusion Matrix:")
print(rf_conf_matrix)

RF Accuracy: 0.9354838709677419
RF Precision: 0.9166666666666666
RF Recall: 1.0
RF Confusion Matrix:
[[ 7  2]
 [ 0 22]]
