# Deployed Model Link : [Task_5 deployed model link](https://task-5.streamlit.app/)

# ***Dataset:***

In [70]:
import pandas as pd
import warnings
# Silence library warnings so the notebook output stays readable.
warnings.filterwarnings('ignore')

# Location of the heart-disease CSV (a mounted Google Drive path).
DATA_PATH = '/content/drive/MyDrive/heart.csv'

# Load the dataset and preview its first six rows.
df = pd.read_csv(DATA_PATH)
df.head(6)

Unnamed: 0,age,sex,cp,trestbps,chol,fbs,restecg,thalach,exang,oldpeak,slope,ca,thal,target
0,52,1,0,125,212,0,1,168,0,1.0,2,2,3,0
1,53,1,0,140,203,1,0,155,1,3.1,0,0,3,0
2,70,1,0,145,174,0,1,125,1,2.6,0,0,3,0
3,61,1,0,148,203,0,1,161,0,0.0,2,1,3,0
4,62,0,0,138,294,1,1,106,0,1.9,1,3,2,0
5,58,0,0,100,248,0,0,122,0,1.0,1,0,2,1


# ***DataAnalyzer:***


---
***This class has focused on the following:***
1.   Data information (dtypes and nulls)
2.   Missing Values  
3.   Duplicates
4.   Whole Data Distribution (Normal or Skewed)
5.   Imbalanced binary features
6.   Feature importance / Selection
7.   Main Driver to run all class functions







In [71]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [72]:
class DataAnalyzer:
    """Exploratory checks for a pandas DataFrame.

    Every method prints a short header describing the check. Methods that
    compute a value also return it so it can be inspected or displayed by
    the caller. ``Main_Class_1`` runs all checks in sequence.
    """

    # Columns inspected by default in ``binary_features_distribution``.
    DEFAULT_BINARY_FEATURES = ('sex', 'cp', 'fbs', 'restecg', 'exang')

    def __init__(self, df):                  # constructor
        """Keep a reference to the DataFrame to analyse (no copy is made)."""
        self.df = df

    def data_info(self):                     # Data information
        """Print the ``DataFrame.info()`` summary (dtypes, non-null counts).

        Returns whatever ``DataFrame.info()`` returns (it writes its summary
        to stdout).
        """
        print("Data Information (dtypes, etc.)")
        return self.df.info()

    def data_shape(self):                    # Shape (rows & columns)
        """Return the ``(rows, columns)`` tuple of the DataFrame."""
        print("No. of rows and columns")
        return self.df.shape

    def check_missing_values(self):          # Missing values
        """Return a Series with the number of null values per column."""
        print("No. of Missing Values")
        return self.df.isnull().sum()

    def check_duplicates(self):              # Duplicates
        """Return the number of fully duplicated rows."""
        print("No. of Duplicates in the data")
        return self.df.duplicated().sum()

    def Whole_dataset_distribution(self):    # Whole Data Distribution
        """Print a distribution summary for every column.

        * Object (categorical) columns: the count of each distinct value.
        * Numeric columns: the sample skewness, as a quick indicator of how
          far the column is from a symmetric distribution.

        The previous version printed only the header for categorical columns
        and nothing at all for numeric ones.
        """
        # Data Distribution for categorical columns
        for col in self.df.select_dtypes(include='object').columns:
            print(f"Distribution of {col}:")
            for value, count in self.df[col].value_counts().items():
                print(f"{value}: {count}")
            print()

        # Data Distribution for numeric columns
        numeric_df = self.df.select_dtypes(include='number')
        if len(numeric_df.columns) > 0:
            print("Skewness of numeric columns:")
            for col, skewness in numeric_df.skew().items():
                print(f"{col}: {skewness:.4f}")
            print()

    def binary_features_distribution(self, features=None):  # checking for imbalance binary classes
        """Print the percentage of rows in each class of the given features.

        Parameters
        ----------
        features : iterable of str, optional
            Column names to inspect. Defaults to ``DEFAULT_BINARY_FEATURES``.
            Columns that are not present in the DataFrame are reported and
            skipped instead of raising ``KeyError``.
        """
        if features is None:
            features = self.DEFAULT_BINARY_FEATURES

        total_rows = len(self.df)
        for feature in features:
            if feature not in self.df.columns:
                print(f"Column {feature} not found; skipped.")
                print()
                continue
            print(f"Percentage distribution for {feature}:")
            value_counts = self.df[feature].value_counts()
            for value, count in value_counts.items():
                # Percentages are relative to all rows, including rows where
                # the feature is null (value_counts itself ignores nulls).
                percentage = (count / total_rows) * 100
                print(f"Class {value}: {percentage:.2f}%")
            print()

    def feature_importance(self):            # Feature Selection
        """Print the numeric columns that have zero variance.

        Uses scikit-learn's ``VarianceThreshold(threshold=0)``. Only numeric
        columns are passed to it, because fitting on text columns fails.
        """
        from sklearn.feature_selection import VarianceThreshold

        numeric_df = self.df.select_dtypes(include='number')
        var_thres = VarianceThreshold(threshold=0)
        var_thres.fit(numeric_df)

        # Columns rejected by the selector are the constant ones.
        kept_columns = set(numeric_df.columns[var_thres.get_support()])
        constant_columns = [column for column in numeric_df.columns
                            if column not in kept_columns]
        print("The number of non-important features are", len(constant_columns))
        for feature in constant_columns:
            print(feature)

    def Main_Class_1(self):                  # Driver for all functions
        """Run every check and print its result.

        The values returned by the shape / missing-value / duplicate checks
        are printed explicitly; when called from a driver they would
        otherwise be discarded and only the headers would be visible.
        """
        self.data_info()
        print(self.data_shape())
        print(self.check_missing_values())
        print(self.check_duplicates())
        self.Whole_dataset_distribution()
        self.binary_features_distribution()
        self.feature_importance()


# ***DataPreprocessor:***


---
***This class has focused on the following:***
1.   Normalize the data  
2.   Split the data into training and testing
3.   Main()

***Conclusion***:  
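Since the data contains no missing values or duplicates, the only issue was that some (binary) features in the dataset were imbalanced. Applying a resampling technique resolved this problem. *Note: `BorderlineSMOTE` is imported in the next cell, but the `DataPreprocessor` code shown below never calls it — confirm where the resampling is actually applied.*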



In [73]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import BorderlineSMOTE
from sklearn.preprocessing import StandardScaler

In [74]:
class DataPreprocessor:  # Class for DataPreprocessing
    """Split a DataFrame into train/test sets and standardize the features.

    The constructor separates the ``target`` column from the features and
    performs a 70/30 train/test split with a fixed seed. ``normalize_data``
    standardizes the features in place and keeps the fitted scaler on
    ``self.scaler`` so that unseen data can later be transformed with the
    same statistics.
    """

    def __init__(self, data):              # Constructor
        """Store ``data`` and create the train/test split.

        Parameters
        ----------
        data : pandas.DataFrame
            Must contain a ``target`` column; all other columns are used as
            features.
        """
        self.data = data

        # Separate features and target
        self.X = data.drop(columns='target')
        self.Y = data['target']

        # Fitted StandardScaler; stays None until normalize_data() has run.
        self.scaler = None

        # Split data into training and testing sets
        self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(self.X, self.Y, test_size=0.3, random_state=42)

    def normalize_data(self):              # Normalize the data
        """Standardize ``X_train`` / ``X_test`` and remember the scaler.

        The scaler is fitted on the training features only and then applied
        to the test features. A second call is a no-op: refitting on already
        scaled data would replace ``self.scaler`` with one that no longer
        maps raw inputs to the model's feature space.
        """
        if getattr(self, "scaler", None) is not None:
            print("Data already normalized; skipping.")
            return

        self.scaler = StandardScaler()
        self.X_train = self.scaler.fit_transform(self.X_train)
        self.X_test = self.scaler.transform(self.X_test)
        print("Data normalization complete.")

    def split_data(self):                  # Split the data
        """Return ``(X_train, X_test, y_train, y_test)`` as currently stored."""
        print("Final train-test split done.")
        return self.X_train, self.X_test, self.y_train, self.y_test

# ***Model-Selection / Training:***

In [75]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [76]:
# ModelSelection class
class ModelSelection(DataPreprocessor):
    """Compare several classifiers with cross-validation and pick the best.

    Inherits the train/test split from ``DataPreprocessor``. Candidate models
    that use randomness are seeded, so repeated runs select the same model.
    """

    # Seed for the stochastic estimators (same value as the train/test split).
    RANDOM_STATE = 42

    def __init__(self, data):              # Parent class constructor for data loading
        """Create the split via the parent class and build the candidates."""
        super().__init__(data)
        self.models = {
            "LogisticRegression": LogisticRegression(max_iter=5000),
            "DecisionTree": DecisionTreeClassifier(random_state=self.RANDOM_STATE),
            "RandomForest": RandomForestClassifier(random_state=self.RANDOM_STATE),
            "SVC": SVC(),
            "KNeighbors": KNeighborsClassifier()
        }

    def evaluate_models(self, folds=5):    # K-folds cross-validation for model selection
        """Cross-validate every candidate on the training data.

        Parameters
        ----------
        folds : int, default 5
            Passed to ``cross_val_score`` as ``cv``.

        Returns
        -------
        The estimator from ``self.models`` with the highest mean CV score.
        It is returned as stored in ``self.models``; this method does not
        call ``fit`` on it directly.
        """
        model_performance = {}

        for model_name, model in self.models.items():
            print(f"Evaluating {model_name}...")
            cv_scores = cross_val_score(model, self.X_train, self.y_train, cv=folds)
            mean_score = cv_scores.mean()
            model_performance[model_name] = mean_score
            print(f"{model_name} CV Scores: {cv_scores}")
            print(f"Average CV Score: {mean_score:.4f}\n")

        # Select the best model based on highest average CV score
        best_model_name = max(model_performance, key=model_performance.get)
        best_model = self.models[best_model_name]
        print(f"Best model: {best_model_name} with CV score of {model_performance[best_model_name]:.4f}")
        return best_model

    def train_best_model(self, model):     # Train the best model based on cross-validation
        """Fit ``model`` on the training split and return it."""
        model.fit(self.X_train, self.y_train)
        print(f"{model.__class__.__name__} training complete.")
        return model

    def Main_Class_3(self):                # Driver method for model selection and training
        """Select the best model by CV, train it, and return the fitted model."""
        best_model = self.evaluate_models(5)
        trained_model = self.train_best_model(best_model)
        # Kept for its progress message; the returned split is not needed here.
        self.split_data()
        return trained_model


# ***Model-Actual Training / Evaluation***

In [77]:
import numpy as np
class ModelTraining(ModelSelection):
    """Train, evaluate, persist and apply the selected model."""

    def __init__(self, data):
        """Delegate to ``ModelSelection`` (split + candidate models)."""
        super().__init__(data)

    def train_model(self, model):
        """Fit ``model`` on the training split and return it."""
        model.fit(self.X_train, self.y_train)
        return model

    def evaluate_model(self, model):
        """Print the classification report and confusion matrix on the test split."""
        print(f"Model Name : -> {model.__class__.__name__}")
        y_pred = model.predict(self.X_test)
        print(classification_report(self.y_test, y_pred))
        print("Confusion Matrix:")
        print(confusion_matrix(self.y_test, y_pred))

    def predict_new_data(self, trained_model, new_data):
        """Predict on unseen samples and print one verdict per sample.

        Parameters
        ----------
        trained_model : fitted estimator exposing ``predict``.
        new_data : 2D array-like, one row per sample, in raw feature units.

        Notes
        -----
        * If the instance carries a fitted ``scaler`` attribute, ``new_data``
          is transformed with it first so the model sees inputs on the same
          scale as its training data.
        * The decision threshold is the constant 0. The earlier
          ``np.random.randint(0, 1)`` could only ever return 0 because the
          upper bound is exclusive, so results are unchanged.
        * Each row gets its own line of output; a single-row input prints
          exactly one line, as before. Comparing the whole prediction array
          in one ``if`` raised ``ValueError`` for multi-row input.
        """
        scaler = getattr(self, "scaler", None)
        if scaler is not None:
            new_data = scaler.transform(new_data)

        predictions = trained_model.predict(new_data)
        threshold = 0

        for prediction in np.ravel(predictions):
            if prediction > threshold:
                print("Issues detected.")
            else:
                print("No issues detected.")

    def save_model(self, trained_model, path='failure.pkl'):
        """Pickle ``trained_model`` to ``path`` (default ``'failure.pkl'``).

        Only the model is written; any preprocessing applied to its training
        data must be reproduced separately by whoever loads the file.
        """
        import pickle
        with open(path, 'wb') as file:
            pickle.dump(trained_model, file)

    def run_evaluation(self):
        """Normalize, select and train the best model, then evaluate it.

        Returns
        -------
        The fitted best model, so callers can save it or use it for
        predictions. (``train_best_model`` already fits the model, so no
        additional fit is performed here.)
        """
        self.normalize_data()
        trained_model = self.train_best_model(self.evaluate_models(5))
        self.evaluate_model(trained_model)
        return trained_model










In [78]:
# (Class 1)
# analyzer = DataAnalyzer(df)
# analyzer.Main_Class_1()

In [79]:
# (Class 2)
# preprocessor = DataPreprocessor(df)
# preprocessor.split_data()

In [80]:
# (Class 3)
# model_trainer = ModelSelection(df)
# trained_model = model_trainer.Main_Class_3()

In [81]:
# (Class 4)
evaluator = ModelTraining(df)

# Run the pipeline step by step so the fitted model is bound to a name in
# this cell; `trained_model` was previously never assigned by any live cell,
# which raises NameError on a fresh kernel.
evaluator.normalize_data()
trained_model = evaluator.train_best_model(evaluator.evaluate_models(5))
evaluator.evaluate_model(trained_model)
evaluator.save_model(trained_model)

Data normalization complete.
Evaluating LogisticRegression...
LogisticRegression CV Scores: [0.82638889 0.875      0.88111888 0.86013986 0.7972028 ]
Average CV Score: 0.8480

Evaluating DecisionTree...
DecisionTree CV Scores: [0.97222222 0.99305556 0.97902098 0.99300699 0.88811189]
Average CV Score: 0.9651

Evaluating RandomForest...
RandomForest CV Scores: [0.94444444 0.97222222 0.99300699 0.98601399 0.92307692]
Average CV Score: 0.9638

Evaluating SVC...
SVC CV Scores: [0.88888889 0.92361111 0.90909091 0.93706294 0.88111888]
Average CV Score: 0.9080

Evaluating KNeighbors...
KNeighbors CV Scores: [0.85416667 0.86111111 0.86713287 0.84615385 0.81818182]
Average CV Score: 0.8493

Best model: DecisionTree with CV score of 0.9651
DecisionTreeClassifier training complete.
Model Name : -> DecisionTreeClassifier
              precision    recall  f1-score   support

           0       0.95      1.00      0.97       159
           1       1.00      0.94      0.97       149

    accuracy     

# ***Model Testing (Unseen Data)***

In [82]:
 # Test Cases in 2D Array Format with Labels
# Each test case is a single-row 2D list with 13 feature values.
# NOTE(review): presumably ordered like the dataset's feature columns
# (age, sex, cp, trestbps, chol, fbs, restecg, thalach, exang, oldpeak,
# slope, ca, thal) - confirm against the training frame.
Test_Case_1 = [
    [72.0, 0, 1, 120, 210, 0, 1, 150, 0, 1.5, 2, 1, 3],
]  # Unseen Data 0
Test_Case_2 = [
    [55.0, 1, 0, 140, 240, 1, 0, 160, 1, 2.0, 1, 0, 2],
]  # Unseen Data 1
Test_Case_3 = [
    [68.0, 0, 1, 135, 220, 0, 1, 155, 1, 1.0, 2, 1, 2],
]  # Unseen Data 1
Test_Case_4 = [
    [63.0, 1, 0, 130, 190, 1, 0, 170, 0, 0.5, 1, 3, 2],
]  # Unseen Data 0
Test_Case_5 = [
    [59.0, 0, 1, 145, 250, 0, 1, 145, 1, 1.2, 0, 1, 3],
]  # Unseen Data 1

# Only the fifth case is scored here.
evaluator.predict_new_data(trained_model, Test_Case_5)

Issues detected.
