<a href="https://colab.research.google.com/github/OA21796/L3T1/blob/ope/Copy_of_Predicting_heart_data.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
# Importing the relevant libraries required for use to display and query dataset
import numpy as np
import pandas as pd

# import used to plot graphs & charts etc
import matplotlib.pyplot as plt
import seaborn as sns
sns.set(style='darkgrid')
import plotly.graph_objs as go
import plotly.offline as py

#Import required to be able to import a file from users desktop
from google.colab import files

#For machine learning
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier



#Source: https://www.kaggle.com/code/microvision/heart-disease-exploratory-data-analysis/notebook
# [2]
def readData():
  """ reads and returns the csv """
  heart_data = pd.read_csv("heart.csv")
  return heart_data



# [3] & [5]
def printHead(heart_data):
  """ Previews the dataset """
  print(heart_data.head())



# [4]
def renameColumns(heart_data):
  """ Renames the columns to a better understood column name """
  heart_data.columns = ['Age', 'Sex', 'Chest_pain_type', 'Resting_bp',
              'Cholesterol', 'Fasting_bs', 'Resting_ecg',
              'Max_heart_rate', 'Exercise_induced_angina',
              'ST_depression', 'ST_slope', 'Num_major_vessels',
              'Thallium_test', 'Condition',
              ]
  return heart_data



# [6]
def describeData(heart_data):
  """ DescribeData takes in the csv, and prints a summary of the following:
     -> count             -> min
     -> mean              -> 25% percentile
     -> std               -> 50% percintile
     -> 75% percintile    -> max val """

  print(heart_data.describe())



# [7]
def infoData(heart_data):
  """ This prints the data types, memory usage, no. of features, and no. of entries made """
  heart_data.info() # prints numbered row by row with datatype, and non null count.
  print()
  print(f'Shape of the dataset')
  print(f'Number of Features: {heart_data.shape[1]}')
  print(f'Number of Observations: {heart_data.shape[0]}')
  # the print statements return the numb of columns and no of entries. f" formatted



# [8]
def checkMissingValues(heart_data):
  """ This checks for missing values in any row """
  print("{:<8}\033[1m Missing values".format(" "))
  print(heart_data.isnull().sum())
  # At current there are no missing values in the dataset



# [9]
def conditionPieChart(data):
    """
    Make a pie chart of 'Condition' values
    Condition: 0 = Benign, 1 = Malignant
    0 means they do NOT have heart disease. 1 is opposite.
    """
    results = data['Condition'].value_counts()
    values = [results[0], results[1]]
    labels = ['Benign', 'Malignant']
    colors = ['lime', 'red']

    fig_pie = go.Pie(labels=labels, values=values,
                     marker={'colors': colors,
                             'line': {'color': 'Black', 'width': 2}})
    py.iplot([fig_pie])
    print(py)
    print("\nObservations: Most members in the dataset are diagnosed with Malignant, 54.5% (165 cases)."
    "On the other hand, the proportion of Benign in Condition is less than 50%.")



# [10]
def sex_ratio(data):
  """
  Make a pie chart of 'Sex' values
  Sex: 0 = Female, 1 = Male
  A pie chart displaying the ratio between men and women
  """
  print("{:<55}\033[1m Pie Chart showing ratio between Men:Women".format(" "))
  # Pie chart of sex ratio
  results = data['Sex'].value_counts()
  values = [results[0], results[1]]
  labels = ['Female', 'Male']
  colors = ['Red', 'RoyalBlue']

  fig_pie = go.Pie(labels=labels, values=values,
                    marker={'colors': colors,
                            'line': {'color': 'Black', 'width': 2}})
  py.iplot([fig_pie])
  print(py)
  print("\nObservations: Most members in the dataset are male, 68.7% male compared to 31.3% female")



# [11]
def conditionPerSex(data):
    """
    Plot a bar chart of the proportion of Conditon vs.  Sex.
    Show the percentage of Malignant for each sex.
    """
    # Bivariate analysis: Sex vs. Condition
    data['Sex'].groupby(data['Condition']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Sex', y='proportion', hue='Condition', palette='Dark2');
    plt.title('Proportion of Condition for Sex')
    plt.xlabel('Sex (0 = Female, 1 = Male)')
    plt.show()


    # Show the percentage of Malignant for Sex
    # female_malignant vs. Malignant
    female_malignant = data[data['Sex']==0]['Condition'].sum()/data[data['Condition']==1]['Condition'].count()
    # male_malignant vs. Malignant
    male_malignant = data[data['Sex']==1]['Condition'].sum()/data[data['Condition']==1]['Condition'].count()

    print('The proportion of Malignant for Sex:')
    print(f'Female: {female_malignant:.2%}')
    print(f'Male: {male_malignant:.2%}')
    # I may only need the percentages and not the bar chart??



# [12]
def risk_factors_fbs(data):
    """
    Plot bar charts of fasting blood sugar (over 120 mg/dl) and compare for Sex and Condition
    """
    fig = plt.figure(figsize=(20, 6))

    # Fasting blood sugar > 120 mg/dl
    plt.subplot(1, 3, 1)
    sns.countplot(x='Fasting_bs', data=data)
    plt.title('Fasting blood sugar (over 120 mg/dl)')
    plt.xlabel('Fasting_bs (0 = False, 1 = True)')


    # Fasting blood sugar for Sex
    plt.subplot(1, 3, 2)
    data['Fasting_bs'].groupby(data['Sex']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Fasting_bs', y='proportion', hue='Sex', palette='Set1')
    plt.title('Proportion of Fasting_bs (over 120 mg/dl) for Sex')
    plt.xlabel('Fasting_bs (0 = False, 1 = True)')


    # Fasting blood sugar for Condition
    plt.subplot(1, 3, 3)
    data['Fasting_bs'].groupby(data['Condition']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Fasting_bs', y='proportion', hue='Condition', palette='Dark2')
    plt.title('Proportion of Fasting_bs (over 120 mg/dl) for Condition')
    plt.xlabel('Fasting_bs (0 = False, 1 = True)')
    plt.show()



# [13]
def risk_factors_dist(data):
    """
    Show distributions of risk factors, Resting_bp, Chlesterol, and Max_heart_rate
    """
    fig = plt.figure(figsize=(18, 8))

    # Resting blood pressure
    plt.subplot(2, 3, 1)
    sns.distplot(data['Resting_bp'])
    plt.title('Resting Blood Pressure (mmHg) Distribution', fontsize=15)
    plt.axvline(x=130, color='r', linestyle='--', label='Hypertension: over 130 mmHg')
    plt.legend()

    plt.subplot(2, 3, 4)
    sns.boxplot(data['Resting_bp'], orient='h')


    # Serum cholesterol
    plt.subplot(2, 3, 2)
    sns.distplot(data['Cholesterol'])
    plt.title('Serum Cholesterol (mg/dl) Distribution', fontsize=15)
    plt.axvline(x=200, color='r', linestyle='--', label='High Cholesterol: over 200 mg/dl')
    plt.legend()

    plt.subplot(2, 3, 5)
    sns.boxplot(data['Cholesterol'], orient='h')


    # Maximum heart rate
    plt.subplot(2, 3, 3)
    sns.distplot(data['Max_heart_rate'])
    plt.title('Max Heart Rate Achieved (bpm) Distribution', fontsize=15)

    plt.subplot(2, 3, 6)
    sns.boxplot(data['Max_heart_rate'], orient='h')

    plt.tight_layout()
    plt.show()



# [14]
def riskFactors_bySex(data):
    """
    Show distributions of risk factors for each sex
    Risk factors include:
     -> Resting blood pressure
     -> Chloestrerol level
     -> Max heart rate
    All presented with a distribution table and standard deviation graph***
    """
    fig = plt.figure(figsize=(18, 8))

    # Resting blood pressure for each sex
    plt.subplot(2, 3, 1)
    trestbps_female = data[data['Sex']==0]['Resting_bp']
    trestbps_male = data[data['Sex']==1]['Resting_bp']
    sns.histplot(trestbps_female, color='Red')
    sns.histplot(trestbps_male, color='Blue')
    plt.title('Resting Blood Pressure (mmHg) Distribution for Each Sex')
    plt.gca().legend(title='Sex', labels=['Female','Male'])
    plt.axvline(x=130, color='r', linestyle='--', label='Hypertension: over 130 mmHg')

    plt.subplot(2, 3, 4)
    sns.boxplot(x=data['Resting_bp'], y=data['Sex'],
                palette='Set1', orient='h')


    # Serum cholesterol distribution for each sex
    plt.subplot(2, 3, 2)
    chol_female = data[data['Sex']==0]['Cholesterol']
    chol_male = data[data['Sex']==1]['Cholesterol']
    sns.histplot(chol_female, color='Red')
    sns.histplot(chol_male, color='Blue')
    plt.title('Serum Cholesterol (mg/dl) Distribution for Each Sex')
    plt.gca().legend(title='Sex', labels=['Female','Male'])
    plt.axvline(x=200, color='r', linestyle='--', label='High Cholesterol: over 200 mg/dl')

    plt.subplot(2, 3, 5)
    sns.boxplot(x=data['Cholesterol'], y=data['Sex'],
                palette='Set1', orient='h')


    # Max heart rate distribution for each sex
    plt.subplot(2, 3, 3)
    thalach_female = data[data['Sex']==0]['Max_heart_rate']
    thalach_male = data[data['Sex']==1]['Max_heart_rate']
    sns.histplot(thalach_female, color='Red')
    sns.histplot(thalach_male, color='Blue')
    plt.title('Max Heart Rate (bpm) Distribution for Each Sex')
    plt.gca().legend(title='Sex', labels=['Female','Male'])

    plt.subplot(2, 3, 6)
    sns.boxplot(x=data['Max_heart_rate'], y=data['Sex'],
                palette='Set1', orient='h')

    plt.tight_layout()
    plt.show()



# [15]
def risk_factors_dist_condition(data):
    """
    Show distributions of risk factors for each condition
    """
    fig = plt.figure(figsize=(18, 8))

    # Resting blood pressure distribution for each condition
    plt.subplot(2, 3, 1)
    trestbps_b = data[data['Condition']==0]['Resting_bp']
    trestbps_m = data[data['Condition']==1]['Resting_bp']
    sns.histplot(trestbps_b, color='Green', kde = True)
    sns.histplot(trestbps_m, color='Red', kde = True)
    plt.title('Resting Blood Pressure (mmHg) Distribution for Condition')
    plt.gca().legend(title='Condition', labels=['Benign','Malignant'])
    plt.axvline(x=130, color='r', linestyle='--', label='Hypertension: over 130 mmHg')

    plt.subplot(2, 3, 4)
    sns.boxplot(x=data['Resting_bp'], y=data['Condition'],
                orient='h', palette='Dark2')


    # Serum cholesterol distribution for each condition
    plt.subplot(2, 3, 2)
    chol_b = data[data['Condition']==0]['Cholesterol']
    chol_m = data[data['Condition']==1]['Cholesterol']
    sns.histplot(chol_b, color='Green', kde = True)
    sns.histplot(chol_m, color='Red', kde = True)
    plt.title('Serum Cholesterol (mg/dl) Distribution for Condition')
    plt.gca().legend(title='Condition', labels=['Benign','Malignant'])
    plt.axvline(x=200, color='r', linestyle='--', label='High Cholesterol: over 200 mg/dl')

    plt.subplot(2, 3, 5)
    sns.boxplot(x=data['Cholesterol'], y=data['Condition'],
                orient='h', palette='Dark2')


    # Max heart rate achieved distribution for each condition
    plt.subplot(2, 3, 3)
    thalach_b = data[data['Condition']==0]['Max_heart_rate']
    thalach_m = data[data['Condition']==1]['Max_heart_rate']
    sns.histplot(thalach_b, color='Green', kde = True)
    sns.histplot(thalach_m, color='Red', kde = True)
    plt.title('Max Heart Rate (bpm) Distribution for Condition')
    plt.gca().legend(title='Condition', labels=['Benign','Malignant'])

    plt.subplot(2, 3, 6)
    sns.boxplot(x=data['Max_heart_rate'], y=data['Condition'],
                orient='h', palette='Dark2')



# [16]
def symptoms_features(data):
    """
    Count the number of Chest_pain_type and Exercise_induced_angina values.
    """
    plt.figure(figsize=(18, 6))

    # Chest pain types
    plt.subplot(1, 2, 1)
    sns.countplot(x='Chest_pain_type', data=data)
    plt.title('Chest Pain Types')


    # Exercise induced angina
    plt.subplot(1, 2, 2)
    sns.countplot(x='Exercise_induced_angina', data=data)
    plt.title('Exercise Induced Angina')



# [17]
def symptoms_features_sex(data):
    """
    Plot bar charts of chest pain type and exercise induced angina for Sex.
    """
    fig = plt.figure(figsize=(18, 6))

    # Chest pain types for Sex
    plt.subplot(1, 2, 1)
    data['Chest_pain_type'].groupby(data['Sex']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Chest_pain_type', y='proportion', hue='Sex', palette='Set1')
    plt.title('Proportion of Chest pain types for Sex')


    # Exercise induced angina for Sex
    plt.subplot(1, 2, 2)
    data['Exercise_induced_angina'].groupby(data['Sex']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Exercise_induced_angina', y='proportion', hue='Sex', palette='Set1')
    plt.title('Proportion of Exercise induced angina for Sex')
    plt.show()



# [18]
def symptoms_features_condition(data):
    """
    Plot bar charts of chest pain type and exercise induced angina for Condition.
    """
    fig = plt.figure(figsize=(18, 6))

    plt.subplot(1, 2, 1)
    data['Chest_pain_type'].groupby(data['Condition']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Chest_pain_type', y='proportion', hue='Condition', palette='Dark2')
    plt.title('Proportion of Chest pain types for Condition')

    plt.subplot(1, 2, 2)
    data['Exercise_induced_angina'].groupby(data['Condition']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Exercise_induced_angina', y='proportion', hue='Condition', palette='Dark2')
    plt.title('Proportion of Exercise induced angina for Condition')
    plt.show()



# [19]
def heart_functions(data):
    """
    Make a bar chart of Resting_ecg and ST_slope.

    Parameters:
    - data: DataFrame containing the heart data with columns 'Resting_ecg' and 'ST_slope'.

    Returns:
    - None
    """
    plt.figure(figsize=(18, 6))

    # Resting_ecg
    plt.subplot(1, 2, 1)
    sns.countplot(x='Resting_ecg', data=data)
    plt.title('Resting electrocardiographic results')

    # ST_slope
    plt.subplot(1, 2, 2)
    sns.countplot(x='ST_slope', data=data)
    plt.title('The slope of the peak exercise ST segment')
    plt.show()



# [20]
def heart_functions_sex(data):
    """
    Plot the proportion of Resting_ecg and ST_slope for Sex.

    Parameters:
    - data: DataFrame containing the heart data with columns 'Resting_ecg', 'ST_slope', and 'Sex'.

    Returns:
    - None
    """
    plt.figure(figsize=(18, 6))

    # Resting_ecg
    plt.subplot(1, 2, 1)
    data['Resting_ecg'].groupby(data['Sex']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Resting_ecg', y='proportion', hue='Sex', palette='Set1')
    plt.title('Proportion of Resting electrocardiographic results for Sex')

    # ST_slope
    plt.subplot(1, 2, 2)
    data['ST_slope'].groupby(data['Sex']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='ST_slope', y='proportion', hue='Sex', palette='Set1')
    plt.title('Proportion of the slope of the peak exercise ST segment for Sex')
    plt.show()



# [21]
def plot_condition_features(data):
    """
    Plot the proportion of Resting_ecg and ST_slope for different heart conditions.

    Parameters:
    - data: DataFrame containing the heart data with columns 'Resting_ecg', 'ST_slope', and 'Condition'.
    """
    plt.figure(figsize=(18, 6))

    # Resting_ecg
    plt.subplot(1, 2, 1)
    data['Resting_ecg'].groupby(data['Condition']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Resting_ecg', y='proportion', hue='Condition', palette='Dark2')
    plt.title('Proportion of Resting electrocardiographic results for Condition')


    # ST_slope
    plt.subplot(1, 2, 2)
    data['ST_slope'].groupby(data['Condition']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='ST_slope', y='proportion', hue='Condition', palette='Dark2')
    plt.title('Proportion of the slope of the peak exercise ST segment for Condition')
    plt.show()



# [22]
def explore_sex_related_variables(data):
    """
    Visualize the distribution of ST-depression induced by exercise relative to rest,
    segmented by sex and heart disease condition.

    Parameters:
    - data: A pandas DataFrame containing heart disease data.

    Returns:
    - None
    """
    fig = plt.figure(figsize=(18, 8))

    # ST-depression induced by exercise relative to rest
    plt.subplot(2, 3, 1)
    sns.distplot(data['ST_depression'])
    plt.title('ST-depression induced by exercise relative to rest', fontsize=15)
    plt.axvline(x=0.5, color='r', linestyle='--', label='Normal')
    plt.legend()

    plt.subplot(2, 3, 4)
    sns.boxplot(data['ST_depression'], orient='h')


    # ST-depression for Sex
    plt.subplot(2, 3, 2)
    chol_female = data[data['Sex']==0]['ST_depression']
    chol_male = data[data['Sex']==1]['ST_depression']
    sns.distplot(chol_female, color='Red')
    sns.distplot(chol_male, color='Blue')
    plt.title('ST-depression Distribution for Sex')
    plt.gca().legend(title='Sex', labels=['Female','Male'])
    plt.axvline(x=0.5, color='r', linestyle='--', label='Normal')

    plt.subplot(2, 3, 5)
    sns.boxplot(x=data['ST_depression'], y=data['Sex'],
                palette='Set1', orient='h')


    # ST-depression for Condition
    plt.subplot(2, 3, 3)
    thalach_b = data[data['Condition']==0]['ST_depression']
    thalach_m = data[data['Condition']==1]['ST_depression']
    sns.distplot(thalach_b, color='Green')
    sns.distplot(thalach_m, color='Red')
    plt.title('ST-depression Distribution for Condition')
    plt.gca().legend(title='Condition', labels=['Benign','Malignant'])
    plt.axvline(x=0.5, color='r', linestyle='--', label='Normal')

    plt.subplot(2, 3, 6)
    sns.boxplot(x=data['ST_depression'], y=data['Condition'],
                palette='Dark2', orient='h')

    plt.tight_layout()
    plt.show()



# [23]
def explore_numeric_variables(data):
    """
    Visualize the distribution of numeric variables related to heart disease.

    This function generates visualizations to explore the distribution of numeric
    variables such as the number of major vessels colored by flourosopy and the
    results of Thallium scintigraphy in the provided heart disease dataset.

    Parameters:
    - data: A pandas DataFrame containing heart disease data.

    Returns:
    - None
    """
    fig = plt.figure(figsize=(18, 6))

    # Num_major_vessels
    plt.subplot(1, 2, 1)
    sns.countplot(x='Num_major_vessels', data=data)
    plt.title('Number of major vessels colored by flourosopy')

    # Thallium
    plt.subplot(1, 2, 2)
    sns.countplot(x='Thallium_test', data=data)
    plt.title('Thallium scintigraphy')

    plt.tight_layout()
    plt.show()



# [24]
def plot_heart_sex_proportions(data):
    """
    Plot proportions of Num_major_vessels and Thallium_test for Sex.

    Parameters:
    - data: pandas DataFrame containing heart disease data.

    Returns:
    - None
    """
    plt.figure(figsize=(18, 6))

    # Num_major_vessels
    plt.subplot(1, 2, 1)
    data['Num_major_vessels'].groupby(data['Sex']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Num_major_vessels', y='proportion', hue='Sex', palette='Set1')
    plt.title('Proportion of Number of major vessels colored by flourosopy for Sex')

    # Thallium test
    plt.subplot(1, 2, 2)
    data['Thallium_test'].groupby(data['Sex']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Thallium_test', y='proportion', hue='Sex', palette='Set1')
    plt.title('Proportion of Thallium scintigraphy for Sex')
    plt.show()



# [25]
def plot_heart_condition_proportions(data):
    """
    Plot proportions of Num_major_vessels and Thallium_test for Condition.

    Parameters:
    - data: pandas DataFrame containing heart disease data.

    Returns:
    - None
    """
    plt.figure(figsize=(18, 6))

    # Num_major_vessels
    plt.subplot(1, 2, 1)
    data['Num_major_vessels'].groupby(data['Condition']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Num_major_vessels', y='proportion', hue='Condition', palette='Dark2')
    plt.title('Number of major vessels colored by flourosopy for Condition')

    # ST_slope
    plt.subplot(1, 2, 2)
    data['Thallium_test'].groupby(data['Condition']).value_counts(normalize=True).rename('proportion').reset_index().pipe((sns.barplot, 'data'), x='Thallium_test', y='proportion', hue='Condition', palette='Dark2')
    plt.title('Thallium scintigraphy for Condition')
    plt.show()



# [26]
def age_distribution(data):
    """
    Visualize the age distribution and its relationship with sex and condition.

    Parameters:
    - data: pandas DataFrame containing age, sex, and condition information.

    Returns:
    - None
    """
    fig = plt.figure(figsize=(15, 7))

    # Age distribution
    plt.subplot(2, 3, 1)
    sns.distplot(data['Age'])
    plt.title('Age Distribution', fontsize=15)

    plt.subplot(2, 3, 4)
    sns.boxplot(data['Age'], orient='h')


    # Age distribution for sex
    plt.subplot(2, 3, 2)
    female = data[data['Sex']==0]['Age']
    male = data[data['Sex']==1]['Age']
    sns.distplot(male, color='Blue', label='Male')
    sns.distplot(female, color='Red', label='Female')
    plt.title('Age Distribution (Male vs. Female)', fontsize=15)
    plt.legend(title='Sex', fontsize=10)

    plt.subplot(2, 3, 5)
    sns.boxplot(x=data['Age'], y=data['Sex'], orient='h', palette='Set1')


    # Age distribution for Condition
    plt.subplot(2, 3, 3)
    benign = data[data['Condition']==0]['Age']
    malignant = data[data['Condition']==1]['Age']

    sns.distplot(benign, color='Green', label='Benign')
    sns.distplot(malignant, color='Red', label='Malignant')
    plt.title('Age Distribution for Condition', fontsize=15)
    plt.legend(title='Condition', fontsize=10)

    plt.subplot(2, 3, 6)
    sns.boxplot(x=data['Age'], y=data['Sex'], orient='h', palette='Dark2')

    plt.tight_layout()
    plt.show()



# [27]
def numeric_features_vs_age(data):
    """
    Visualize the effects of numeric features for heart disease by age.

    Parameters:
    - data: pandas DataFrame containing numeric features and condition labels.

    Returns:
    - None
    """
    benign = data.Condition==0
    malignant = data.Condition==1

    age_benign = data.Age[benign]
    age_malignant = data.Age[malignant]

    b_color = 'MediumSeaGreen'
    m_color = 'LightCoral'

    fig = plt.figure(figsize=(12, 10))

    # Resting blood pressure (mmHg)
    plt.subplot(2, 2, 1)
    plt.scatter(x=age_benign, y=data.Resting_bp[benign], color=b_color)
    plt.scatter(x=age_malignant, y=data.Resting_bp[malignant], color=m_color)
    plt.title('Resting_bp vs. age', fontsize=15)
    plt.legend(['Benign', 'Malignant'])
    plt.xlabel('age', fontsize=10)
    plt.ylabel('Resting blood pressure (mmHg)', fontsize=10)


    # Serum Cholesterol (mg/dl)
    plt.subplot(2, 2, 2)
    plt.scatter(x=age_benign, y=data.Cholesterol[benign], color=b_color)
    plt.scatter(x=age_malignant, y=data.Cholesterol[malignant], color=m_color)
    plt.title('Serum cholesterol (mg/dl)', fontsize=15)
    plt.legend(['Benign', 'Malignant'])
    plt.xlabel('age', fontsize=10)
    plt.ylabel('chol', fontsize=10)


    # Maximum heart rate achieved (bpm)
    plt.subplot(2, 2, 3)
    plt.scatter(x=age_benign, y=data.Max_heart_rate[benign], color=b_color)
    plt.scatter(x=age_malignant, y=data.Max_heart_rate[malignant], color=m_color)
    plt.title('Max_heart_rate vs. age', fontsize=15)
    plt.legend(['Benign', 'Malignant'])
    plt.xlabel('age',fontsize=10)
    plt.ylabel('Maximum heart rate achieved (bpm)', fontsize=10)


    # ST_depression
    plt.subplot(2, 2, 4)
    plt.scatter(x=age_benign, y=data.ST_depression[benign], color=b_color)
    plt.scatter(x=age_malignant, y=data.ST_depression[malignant], color=m_color)
    plt.title('ST_depression vs. age', fontsize=15)
    plt.legend(['Benign', 'Malignant'])
    plt.xlabel('age',fontsize=10)
    plt.ylabel('ST_depression', fontsize=10)

    plt.tight_layout()
    plt.show()



# [28]
def feature_by_Condition(data):
  """
  25 GRAPHS THAT PRESENTS EACH FEATURE BY CONDITION**

  Generate pair plots for each feature by condition.

  Parameters:
  - data: pandas DataFrame containing features and condition labels.

  Returns:
  - None
  """
  sns.pairplot(data[['Resting_bp','Cholesterol','Max_heart_rate','ST_depression','Age', 'Condition']],hue='Condition', palette='Dark2');



# [29]
def correlation_heatmap(data):
    """
    Generate a correlation heatmap of the features.

    Parameters:
    - data: pandas DataFrame containing numerical features.

    Returns:
    - None
    """
    plt.figure(figsize=(11, 7))
    sns.heatmap(data.corr(), annot=True, linewidth=0.2,
                fmt='.2f', cmap='RdGy_r')
    plt.title('Correlations between Features', fontsize=15)
    plt.show()





# Normalising the data which ensures the ML algorithims converge faster
def get_normalisation(train_data, test_data=None):
    """Normalize the data using MinMaxScaler.

    Parameters:
    - train_data: Training data to fit the scaler.
    - test_data: Optional test data to transform using the same scaler.

    Returns:
    - train_data_normalized: Normalized training data.
    - test_data_normalized: Normalized test data if provided, None otherwise.
    """

    scaler = MinMaxScaler() # MinMaxScaler & RobustScaler achieved the highest accuracy of all 3 types of normalisation functions
    # scaler = StandardScaler()
    # scaler = RobustScaler()
    train_data_normalized = scaler.fit_transform(train_data)

    if test_data is not None:
        # Transform test data using the same scaler fitted on training data
        test_data_normalized = scaler.transform(test_data)
    else:
        test_data_normalized = None

    return train_data_normalized, test_data_normalized



# Building svm model
def support_vector_machine(iv_train, iv_test, dv_train, dv_test):
  """
  Build, fit, and evaluate Support Vector Classification model.

  Parameters:
  - iv_train: independent variables (features) of the training data
  - iv_test: independent variables (features) of the test data
  - dv_train: dependent variable (target) of the training data
  - dv_test: dependent variable (target) of the test data
  """

  # Build and fit Support Vector Classification model
  svm_model = SVC()
  svm_model = svm_model.fit(iv_train, dv_train)

  # Evaluate model accuracy on the test data
  svm_accuracy = get_model_accuracy(svm_model, iv_test, dv_test)
  print(f'SVM Accuracy: {svm_accuracy:.4}')

  #Predict class for iv_test -> not sure what this means
  dv_prediction_svm = svm_model.predict(iv_test)

  # Display classification report
  print("Classification Report for SVM:")
  print(classification_report(dv_prediction_svm, dv_test))

  # Return the trained SVM model
  return svm_model



#Required for gridsearch:
def get_best_parameters_GridSearchCV(model, params, X_train, y_train):
    """
    Perform grid search to find the best hyperparameters for the given model.

    Parameters:
    - model: Machine learning model to be optimized.
    - params: Dictionary of hyperparameters to search over.
    - X_train: Independent variables (features) of the training data.
    - y_train: Dependent variable (target) of the training data.

    Returns:
    - best_estimator: Best estimator found by grid search.
    """
    # Perform grid search using cross-validation
    grid_search = GridSearchCV(model, params, cv=5)
    grid_search.fit(X_train, y_train)

    # Get best parameters and estimator
    best_params = grid_search.best_params_
    best_estimator = grid_search.best_estimator_

    # Print best parameters
    print(f'Best Parameters for {type(model).__name__}: {best_params}\n')

    return best_estimator



#Implementing the SVM Gridsearch method
def SVM_gridsearch(X_train, X_test, y_train, y_test):
  """
  Perform grid search to find the best hyperparameters for Support Vector Machine.

  Parameters:
  - X_train: independent variables (features) of the training data
  - X_test: independent variables (features) of the test data
  - y_train: dependent variable (target) of the training data
  - y_test: dependent variable (target) of the test data

  Returns:
  - best_model: the best SVM model found by grid search
  """

  params_svm = {'C': [0.1, 1, 10, 100],
                'gamma': [1, 0.1, 0.01, 0.001, 'scale', 'auto'],
                'kernel': ['linear', 'poly', 'sigmoid']}

  # Initialize the SVM classifier
  svm_model = SVC()

  # Find the best hyperparameters using grid search
  best_estimator = get_best_parameters_GridSearchCV(svm_model, params_svm, X_train, y_train)

  # Train the best model on the training data
  best_estimator.fit(X_train, y_train)

  # Evaluate accuracy on the test data
  svm_accuracy = get_model_accuracy(best_estimator, X_test, y_test)
  print(f'SVM with GridSearchCV Accuracy: {svm_accuracy:.4}')

  # Predict class for test data
  y_pred_svm = best_estimator.predict(X_test)

  # Display classification report
  print("Classification Report for SVM with GridSearchCV:")
  print(classification_report(y_pred_svm, y_test))

  return best_estimator



#Implementing Logistic Regression:
def logistic_regression(X_train, X_test, y_train, y_test):
  """
  Build, fit, and evaluate Logistic Regression model.

  Parameters:
  - X_train: independent variables (features) of the training data
  - X_test: independent variables (features) of the test data
  - y_train: dependent variable (target) of the training data
  - y_test: dependent variable (target) of the test data

  Returns:
  - logreg: trained Logistic Regression model
  """

  # Build and fit Logistic Regression model
  logreg = LogisticRegression()
  logreg = logreg.fit(X_train, y_train)

  # Accuracy in Logistic Regression model
  logreg_acc = get_model_accuracy(logreg, X_test, y_test)
  print(f'Logistic Regression Accuracy: {logreg_acc:.4}')
  print()

  # Predict class for X_test
  y_pred_logreg = logreg.predict(X_test)

  # Classification Report of logistic regression model
  print(classification_report(y_pred_logreg, y_test))

  return logreg



#Implementing Logistic Regression WITH Gridsearch
def logistic_regression_gridsearch(X_train, X_test, y_train, y_test):
    """
    Build, fit, and evaluate Logistic Regression model with grid search.

    Parameters:
    - X_train: independent variables (features) of the training data
    - X_test: independent variables (features) of the test data
    - y_train: dependent variable (target) of the training data
    - y_test: dependent variable (target) of the test data

    Returns:
    - best_model: the best logistic regression model found by grid search
    """

    # Define grid of hyperparameters for logistic regression
    param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100],
                  'penalty': ['l2'],}
                  # 'max_iter': [100, 500, 1000],
                  # 'class_weight': [None, 'balanced'],
                  # }
                  # Tested variables above for better accuracy - it had no affect.

    # Initialize the Logistic Regression classifier
    logreg = LogisticRegression()

    # Perform grid search using cross-validation
    best_estimator = get_best_parameters_GridSearchCV(logreg, param_grid, X_train, y_train)

    # Train the best model on the training data
    best_estimator.fit(X_train, y_train)

    # Evaluate accuracy on the test data
    logreg_accuracy = get_model_accuracy(best_estimator, X_test, y_test)
    print(f'Logistic Regression Accuracy with GridSearch: {logreg_accuracy:.4f}')

    # Predict class for test data
    y_pred_logreg = best_estimator.predict(X_test)

    # Display classification report
    print("Classification Report for Logistic Regression:")
    print(classification_report(y_pred_logreg, y_test))



def get_model_accuracy(model, iv_test, dv_test):
    """
    Return the mean accuracy of the model on iv_test and dv_test.

    Parameters:
    - model: Trained machine learning model
    - iv_test: Independent variables (features) of the test data
    - dv_test: Dependent variable (target) of the test data

    Returns:
    - model_acc: Mean accuracy of the model on the test data
    """
    model_acc = model.score(iv_test, dv_test)
    return model_acc



#Implementing k nearest neighbours
def k_nearest_neighbours(X_train, X_test, y_train, y_test, n_neighbors=5):
    """
    Train a KNN classifier using the provided training data and evaluate its performance on the test data.

    Parameters:
    - X_train: Training features
    - X_test: Test features
    - y_train: Training labels
    - y_test: Test labels
    - n_neighbors: Number of neighbors for the KNN classifier (default=5)

    Returns:
    - accuracy: Accuracy of the trained model on the test data
    - classification_report: Classification report showing precision, recall, and F1-score
    """
    # Step 2: Create an instance of the KNeighborsClassifier class
    knn = KNeighborsClassifier(n_neighbors=n_neighbors)

    # Step 3: Train the KNN model using the training data
    knn.fit(X_train, y_train)

    # Step 4: Make predictions on the test data
    y_pred = knn.predict(X_test)

    # Step 5: Evaluate the performance of the model
    accuracy = accuracy_score(y_test, y_pred)
    # print("Accuracy: ", accuracy)
    print(f'KNN - Accuracy: {accuracy:.4f}')

    print("Classification Report for K-Nearest-Neighbours:")
    report = classification_report(y_test, y_pred)
    print(report)

    return accuracy, report



#Implementing k nearest neighbours + GS
def knn_gridsearch(X_train, X_test, y_train, y_test, params={'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 9, 11]}, cv=5): #
    """Perform grid search to find the best hyperparameters for KNN.

    Parameters:
    - X_train: Training features
    - X_test: Test features
    - y_train: Training labels
    - y_test: Test labels
    - params: Dictionary of hyperparameters for grid search (default={'n_neighbors': [3, 5, 7]})
    - cv: Number of folds for cross-validation (default=5)

    Returns:
    - best_model: The best KNN model found by grid search
    - best_params: Best hyperparameters found by grid search
    - accuracy: Accuracy of the best model on the test data
    - classification_report: Classification report showing precision, recall, and F1-score
    """
    # Perform grid search to find the best hyperparameters for KNN
    best_estimator = get_best_parameters_GridSearchCV(KNeighborsClassifier(), params, X_train, y_train)

    # Get model accuracy on the test data
    accuracy = get_model_accuracy(best_estimator, X_test, y_test)
    print(f'KNN + GS Accuracy: {accuracy:.4f}')

    # Get classification report
    y_pred = best_estimator.predict(X_test)
    report = classification_report(y_test, y_pred)

    # Print the classification report
    print("KNN GRIDSEARCH: Classification Report:")
    print(report)

    return best_estimator, params, accuracy, report



# Function that provides insights into which features are most relvant for predicting the target variable
def feature_importance_analysis(X_train, y_train, column_names):
    """
    Perform feature importance analysis using a Random Forest classifier.

    Parameters:
    - X_train: Training feature matrix.
    - y_train: Target vector for training.
    - column_names: List of feature names.

    Returns:
    - sorted_features: Names of features sorted by their importance scores.
    - sorted_importance: Importance scores of features sorted in descending order.
    """

    # Instantiate and train the Random Forest classifier
    rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
    rf_classifier.fit(X_train, y_train)

    # Retrieve feature importance scores
    feature_importance = rf_classifier.feature_importances_

    # Sort features by their importance scores
    sorted_indices = feature_importance.argsort()[::-1]
    sorted_features = [column_names[i] for i in sorted_indices]
    sorted_importance = feature_importance[sorted_indices]

    # Visualize feature importance
    plt.figure(figsize=(10, 6))
    plt.bar(range(X_train.shape[1]), sorted_importance, align='center')
    plt.xticks(range(X_train.shape[1]), sorted_features, rotation=90)
    plt.xlabel('Features')
    plt.ylabel('Feature Importance Score')
    plt.title('Feature Importance Analysis (Random Forest)')
    plt.show()

    return sorted_features, sorted_importance



# Implementing Cross Validaiton
def compare_models(X_train, y_train):
    """
    Perform cross-validation with GridSearchCV for multiple classifiers.

    Parameters:
    - X_train: Training feature matrix.
    - y_train: Target vector for training.

    Returns:
    - results: Dictionary containing the mean cross-validation scores for each classifier.
    """

    #-param_grids: Dictionary containing parameter grids for each classifier.
    param_grids = {
    'SVM': {'C': [0.1, 1, 10], 'gamma': [0.1, 1, 10], 'kernel': ['linear', 'rbf']},
    'KNN': {'n_neighbors': [3, 5, 7], 'weights': ['uniform', 'distance'], 'p': [1, 2]},
    'Logistic Regression': {'C': [0.1, 1, 10], 'solver': ['liblinear', 'saga']}
    }

    # models: Dictionary containing classifiers as keys and their respective names as values.
    models = {
    'SVM': SVC(),
    'KNN': KNeighborsClassifier(),
    'Logistic Regression': LogisticRegression(max_iter=1000)
    }

    results = {}
    # For each model in the dict, store accuracy in results
    for model_name, model in models.items():
        param_grid = param_grids[model_name]
        clf_grid = GridSearchCV(model, param_grid, cv=5)
        clf_grid.fit(X_train, y_train)
        results[model_name] = clf_grid.best_score_


    print("Mean cross-validation scores:")
    for model_name, score in results.items():
        print(f"{model_name}: {round(score, 4) * 100}%")

    return results



#Source https://www.kaggle.com/code/microvision/heart-disease-classification/notebook

def train_test_split_data(data):
  """
    This function randomly splits the dataset into two subsets: one for training the ML model and other for evaluating its performance

    Parameters:
    - data: DataFrame, the input dataset containing both independent and dependent variables.
    - test_size: float, optional (default=0.2), the proportion of the dataset to include in the testing split.
    - random_state: int, optional (default=42), controls the randomness of the data splitting.

    Returns:
    - X_train: DataFrame, training set of independent variables.
    - X_test: DataFrame, testing set of independent variables.
    - y_train: Series, training set of dependent variable.
    - y_test: Series, testing set of dependent variable.
  """
  heart_data_independent_variables = data.drop(['Condition'], axis=1)
  heart_data_dependent_variable = data.Condition
  hdiv_train, hdiv_test, hddv_train, hddv_test = train_test_split(heart_data_independent_variables, heart_data_dependent_variable, test_size = 0.225, random_state=42)
  # To optimize the data, a test_size of 0.2 is used.

  # If programmer wanted to see where training starts from/to end - this can be printed.
  # print(f'Shape of heartdata_independentVariable_train {hdiv_train.shape}')
  # print(f'Shape of heartdata_indpendentVariable_test {hdiv_test.shape}')
  # print(f'Shape of heartdata_dependentVariable_train {hddv_train.shape}')
  # print(f'Shape of heartdata_dependentVariable_test {hddv_test.shape}')

  return hdiv_train, hdiv_test, hddv_train, hddv_test










def __main__():
  """ this function will hold the main functions required to run this file """
  uploaded = files.upload()                      #this is how googlCollab allows files to be uploaded
  heart_data = readData()                        #heart_data is now the CSV file I will be querying.
  heart_data = renameColumns(heart_data)         #renames the columns to something more readable
  hd_indVar_train, hd_indVar_test, hd_depVar_train, hd_depVar_test = train_test_split_data(heart_data) # splitting the data #indVar = independent variable; depVar = dependent...
  hd_indVar_train_normalised, hd_indVar_test_normalised = get_normalisation(hd_indVar_train, hd_indVar_test)  #Normalising the independent variable data:

  # describeData(heart_data)                     # [6]  prints a summary of numeric statistics
  # infoData(heart_data)                         # [7]  prints the csv schema
  # checkMissingValues(heart_data)               # [8]  prints & checks for any missing values in any columns.
  # conditionPieChart(heart_data)                # [9]  prints a pie chart that shows the ratio between malignant and benign patients
  # sex_ratio(heart_data)                        # [10] prints a pie chart that shows the ratio between men and women
  # conditionPerSex(heart_data)                  # [11] prints a bar char that shows the proportion of malginant to benign per male and female patients
  # riskFactors_bySex(heart_data)                # [12] prints 6 distribution tables that detail the risk factors in both men and women that include max heart rate, serum cholestrol level and resting-bp
  # risk_factors_dist_condition(heart_data)      # [13] prints histplots of risk factors distribution for condition (benign, malignant)
  # risk_factors_dist(heart_data)                # [14] prints Show distributions of risk factors, Resting_bp, Chlesterol, and Max_heart_rate
  # risk_factors_fbs(heart_data)                 # [15] prints Plot bar charts of fasting blood sugar (over 120 mg/dl) and compare for Sex and Condition
  # printHead(heart_data)                        # [15] prints first 5 rows for each column in the csv file
  # symptoms_features(heart_data)                # [16] prints two bar charts that depict the chest pain types and exercise induced angina
  # symptoms_features_sex(heart_data)            # [17] prints Plot bar charts of chest pain type and exercise induced angina by Sex.
  # symptoms_features_condition(heart_data)      # [18] prints Plot bar charts of chest pain type and exercise induced angina by Condition
  # heart_functions(heart_data)                  # [19] prints a bar chart of Resting_ecg and ST_slope
  # heart_functions_sex(heart_data)              # [20] Prints two bar charts: lot the proportion of Resting_exc and ST_slope for Sex.
  # plot_condition_features(heart_data)          # [21] Plot the proportion of Resting_exc and ST_slope for Condition.
  # explore_sex_related_variables(heart_data)    # [22] Make a standard distribution of ST_depression and distributions for Sex and Condition.
  # explore_numeric_variables(heart_data)        # [23] Prints two bar graphs that plot number of vessels coloured by flourosophy and thaillium scintigraphy
  # plot_heart_sex_proportions(heart_data)       # [24] Plot proportions of Num_major_vessels and Thallium_test for Sex.
  # plot_heart_condition_proportions(heart_data) # [25] Plot proportions of Num_major_vessels and Thallium_test for Condition.
  # age_distribution(heart_data)                 # [26] Makes a standard distribution and distributions for Sex
  # numeric_features_vs_age(heart_data)          # [27] 4x Scatter graphs that show the Effects of features for heart disease by age
  # feature_by_Condition(heart_data)             # [28] 25x Plot graphs that present features by condition
  # correlation_heatmap(heart_data)              # [29] HEAT MAP: Correlation Heat Map of the features.
  # feature_importance_analysis(hd_indVar_train_normalised, hd_depVar_train, heart_data.columns) #Plots a bar chart of the most important features
  # compare_models(hd_indVar_train_normalised, hd_depVar_train)            # Calling the cross validation function

  #Calling the SVM and passing in the normalised independentVariable Q's
  # svm_model = support_vector_machine(hd_indVar_train_normalised, hd_indVar_test_normalised, hd_depVar_train, hd_depVar_test)  #Prints the accuracy of SVM along with scores of each data in a 6x5 column-rows.

  #Calling the SVM model WITH gridsearch
  svm_gridsearch = SVM_gridsearch(hd_indVar_train_normalised, hd_indVar_test_normalised, hd_depVar_train, hd_depVar_test)

  #Calling the Logistic regression model
  # logreg_model = logistic_regression(hd_indVar_train_normalised, hd_indVar_test_normalised, hd_depVar_train, hd_depVar_test)

  #Calling the Logistic regression model WITH gridsearch
  # logreg_gridsearch = logistic_regression_gridsearch(hd_indVar_train_normalised, hd_indVar_test_normalised, hd_depVar_train, hd_depVar_test) # ***** Code runs but error/weird output is shown, alongside the accuracy of the search *******

  #Calling the K nearest neighbours model
  # KNN_model = k_nearest_neighbours(hd_indVar_train_normalised, hd_indVar_test_normalised, hd_depVar_train, hd_depVar_test)

  #Calling the KNN model with Gridsearch
  # KNN_gridSearch = knn_gridsearch(hd_indVar_train_normalised, hd_indVar_test_normalised, hd_depVar_train, hd_depVar_test)







if __name__ == "__main__":
  __main__()








Saving heart.csv to heart (1).csv
Best Parameters for SVC: {'C': 100, 'gamma': 'auto', 'kernel': 'poly'}

SVM with GridSearchCV Accuracy: 0.8841
Classification Report for SVM with GridSearchCV:
              precision    recall  f1-score   support

           0       0.84      0.90      0.87        30
           1       0.92      0.87      0.89        39

    accuracy                           0.88        69
   macro avg       0.88      0.89      0.88        69
weighted avg       0.89      0.88      0.88        69

