### Notebook Overview
This notebook outlines the complete data preparation steps, *excluding scaling and class balancing*. The key processes included are:

- Null value imputation  
- Statistical analysis and feature selection
- Data investigation with visualizations<br>

For the remaining steps in this study's implementation, please refer to the other notebooks included in this project.
<br>**Also**, multiple visualizations were analysed, but only the ones with significant insights are mentioned here.

In [None]:
## Import libraries
import numpy as np
import pandas as pd 
from matplotlib import pyplot as plt
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score, f1_score, recall_score
from sklearn.preprocessing import StandardScaler
from tpot import TPOTClassifier
from imblearn.over_sampling import ADASYN
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix


In [None]:
## Import libraries
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from catboost import CatBoostClassifier

In [None]:
df = pd.read_csv("data01.csv")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.shape

# Data Preparation and Pre-Processing

In [None]:
df.columns

In [None]:
# Later cells refer to columns by underscore-separated names
# (e.g. 'Systolic_blood_pressure'), so normalise spaces to underscores.
df.columns = [name.replace(' ', '_') for name in df.columns]

### Null-value Imputation

In [None]:
df.isna().sum()

In [None]:
# Features (excluding the target) that contain at least one missing value.
# 'outcome' is filtered inside the comprehension rather than removed with
# list.remove(), which raises ValueError when the target has no nulls
# (e.g. when this cell is re-run after the null-outcome row was dropped).
null_features = [
    col for col in df.columns
    if col != 'outcome' and df[col].isna().any()
]

In [None]:
null_features

In [None]:
len(null_features)

In [None]:
plt.style.use('default')

In [None]:
# Share of missing entries per feature, as a percentage of all rows
total_rows = len(df)
null_proportions = df[null_features].isnull().sum().div(total_rows).mul(100)

# Bar chart of the missing-value percentages
fig, ax = plt.subplots(figsize=(10, 6))
null_proportions.plot(kind='bar', color='red', ax=ax)
ax.set_title('Proportion of Null Values for Each Feature relative to Total Rows')
ax.set_xlabel('Null Features')
ax.set_ylabel('Percentage of Null Values')
plt.xticks(rotation=45, ha='right')
plt.show()


**Insights**:
* The percentage of null values is below 25% for every column, so no feature needs to be dropped.

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np



for feature in null_features:
    # Every column in null_features contains NaN by construction, and
    # np.percentile returns NaN if any input is NaN. Work on the observed
    # (non-null) values only so the quartiles and bounds are meaningful.
    observed = df[feature].dropna()

    # Check distribution
    plt.hist(observed)
    plt.xlabel(feature)
    plt.ylabel("Frequency")
    plt.title(f"Distribution of {feature}")
    plt.grid(True)
    plt.show()

    # Values against their sorted order, with IQR-based outlier bounds.
    # NOTE(review): x is in row order while y is sorted, so this is not a
    # true quantile-quantile plot despite the title -- confirm the intent.
    plt.figure()
    q1, q3 = np.percentile(observed, [25, 75])
    IQR = q3 - q1
    lower_bound = q1 - (1.5 * IQR)
    upper_bound = q3 + (1.5 * IQR)
    plt.scatter(observed, np.sort(observed))

    # Points outside the Tukey fences are highlighted in red on the diagonal
    outliers = observed[(observed < lower_bound) | (observed > upper_bound)]
    plt.scatter(outliers, outliers, color='red', marker='o', alpha=0.5)
    plt.plot([lower_bound, upper_bound], [lower_bound, upper_bound], color='red', linestyle='--')
    plt.xlabel(feature)
    plt.ylabel("Ordered Values")
    plt.title(f"QQ-plot of {feature}")
    plt.grid(True)
    plt.show()


In [None]:
# Columns with missing values, grouped by the imputation strategy that the
# imputation cell below applies to each group.
# Filled with the column mean (SimpleImputer(strategy='mean')).
Normal_nulls=['heart_rate','Systolic_blood_pressure','Diastolic_blood_pressure','Blood_calcium']
# Filled with the column median (SimpleImputer(strategy='median')).
Median_nulls=['BMI','Respiratory_rate','temperature','PH']
# Filled with IterativeImputer, fitted on this group of columns only.
Skewed_nulls=['SP_O2','Urine_output','Neutrophils','Basophils','Lymphocyte','PT','INR','Creatine_kinase','glucose',
              'Lactic_acid','PCO2']

In [None]:
import pandas as pd

# Summary table of the imputation groups. The table is built from the
# Normal_nulls / Median_nulls / Skewed_nulls lists defined above, so it cannot
# drift from the lists that the imputation step actually uses.
null_categories = {
    "Normal Nulls": list(Normal_nulls),
    "Median Nulls": list(Median_nulls),
    "Skewed Nulls": list(Skewed_nulls),
}

# The groups have different lengths: wrapping each list in a Series lets pandas
# pad the shorter columns with NaN, which are then blanked out for display.
nulls_df = pd.DataFrame(
    {label: pd.Series(columns) for label, columns in null_categories.items()}
).fillna('')

# Number the rows from 1 instead of 0
nulls_df.index = nulls_df.index + 1

# Add cell borders for a neater rendered table
styled_nulls_df = nulls_df.style.set_table_styles([
    {'selector': 'th', 'props': [('border', '2px solid black')]},
    {'selector': 'td', 'props': [('border', '1px solid black')]}
])

styled_nulls_df


In [None]:
# Same summary table as above, rendered without row numbers. The table is built
# from the Normal_nulls / Median_nulls / Skewed_nulls lists so it always matches
# the groups that the imputation step uses.
null_categories = {
    "Normal_Nulls": list(Normal_nulls),
    "Median_Nulls": list(Median_nulls),
    "Skewed_Nulls": list(Skewed_nulls),
}

# Series-wrapping pads the shorter groups with NaN; blank those out for display
nulls_df = pd.DataFrame(
    {label: pd.Series(columns) for label, columns in null_categories.items()}
).fillna('')

# Add cell borders and hide the index column
styled_nulls_df = nulls_df.style.set_table_styles([
    {'selector': 'th', 'props': [('border', '2px solid black')]},
    {'selector': 'td', 'props': [('border', '1px solid black')]}
]).hide(axis='index')

styled_nulls_df


In [None]:
from sklearn.experimental import enable_iterative_imputer

In [None]:
from sklearn.impute import SimpleImputer, IterativeImputer


In [None]:
# One imputer per strategy
mean_imputer = SimpleImputer(strategy='mean')
median_imputer = SimpleImputer(strategy='median')
iterative_imputer = IterativeImputer(max_iter=10, random_state=0)

# Fit each imputer on its own column group and write the filled values back
for cols, imputer in (
    (Normal_nulls, mean_imputer),
    (Median_nulls, median_imputer),
    (Skewed_nulls, iterative_imputer),
):
    df[cols] = imputer.fit_transform(df[cols])



In [None]:
df.isna().sum()

**Insights**:
* Only one row still contains a null value, and that null is in the target variable (`outcome`).
* Since the target cannot be reliably imputed, it is best to drop this row entirely.

In [None]:
# Drop the row(s) whose target is missing. The explicit .copy() makes the result
# an independent frame, so the column assignments in later cells (e.g. the
# outlier capping) do not emit SettingWithCopyWarning on pandas versions
# without copy-on-write.
df = df.dropna(subset=['outcome']).copy()


In [None]:
df.isna().sum()

In [None]:
df.group.value_counts()

In [None]:
df.ID.value_counts()

**Insights**:
* We can now observe that neither "group" nor "ID" holds any significance for the analysis, so both columns can be dropped.

In [None]:
# Remove the two columns that the value counts above showed to be uninformative
df = df.drop(columns=["group", "ID"])

In [None]:
df.shape

In [None]:
df.outcome.value_counts(normalize=True)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Pairwise correlations. Numeric/bool columns are selected explicitly so the
# cell keeps working if the frame ever contains a non-numeric column
# (DataFrame.corr() raises on such columns in pandas >= 2.0).
corr = df.select_dtypes(include=['number', 'bool']).corr()

# The matrix is symmetric: mask the upper triangle (and the diagonal)
mask = np.triu(np.ones_like(corr, dtype=bool))

# Large canvas so the annotated cells stay legible
plt.figure(figsize=(20,20))

# Annotated heatmap of the lower triangle
sns.heatmap(corr, mask=mask, cmap='coolwarm', annot=True, fmt=".2f", linewidths=.5)

plt.title('Correlation Matrix')
plt.show()


### Outlier Analysis:

In [None]:
df.columns

In [None]:
# Columns treated as categorical (the target 'outcome' is included);
# every other column is treated as numerical.
categorical_columns = [
    'hypertensive', 'atrialfibrillation', 'CHD_with_no_MI', 'diabetes',
    'deficiencyanemias', 'depression', 'Hyperlipemia', 'Renal_failure',
    'COPD', 'gendera', 'outcome',
]
Categorical_Vars = df[categorical_columns]
Numerical_Vars = df.drop(columns=categorical_columns)

In [None]:
# Plain Python lists of the column names in each group
Categorical_Vars_L = Categorical_Vars.columns.tolist()
Numerical_Vars_L = Numerical_Vars.columns.tolist()

In [None]:

# One box plot per numerical feature, split by outcome class
for feature in Numerical_Vars_L:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(data=df, x='outcome', y=feature, ax=ax)
    ax.set_title(f'Box Plot of {feature} by Outcome')
    plt.show()

In [None]:

# One stand-alone box plot per numerical feature
for column in Numerical_Vars_L:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Box Plot of {column}')
    ax.set_xlabel(column)
    plt.show()


In [None]:
Outlier_Columns=["Basophils","Lymphocyte","Creatine_kinase","BMI"]

In [None]:

# Box plots of the columns selected for outlier treatment (before capping)
for column in Outlier_Columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Box Plot of {column}')
    ax.set_xlabel(column)
    plt.show()

In [None]:
# Cap the selected features at the IQR fences: values below Q1 - 1.5*IQR or
# above Q3 + 1.5*IQR are replaced by the corresponding fence value.
for feature in Outlier_Columns:
    Q1, Q3 = df[feature].quantile(0.25), df[feature].quantile(0.75)
    IQR = Q3 - Q1

    cap_low = Q1 - 1.5 * IQR
    cap_high = Q3 + 1.5 * IQR

    # clip() applies both bounds in one pass
    df[feature] = df[feature].clip(lower=cap_low, upper=cap_high)


print(df[Outlier_Columns].describe())


In [None]:
# Box plots of the capped columns, one figure per feature
for column in Outlier_Columns:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.boxplot(x=df[column], ax=ax)
    ax.set_title(f'Box Plot of {column}')
    ax.set_xlabel(column)
    plt.show()

In [None]:
df.dtypes

In [None]:
df.head()

In [None]:
df[Categorical_Vars_L]

In [None]:
df.dtypes

### Visualization

In [None]:
print(sns.axes_style())

In [None]:
plt.style.available

In [None]:
df.columns

In [None]:
plt.style.available

In [None]:
# Histogram of age with a KDE overlay. histplot's default statistic is the
# per-bin count (the KDE curve is scaled to match), so the y-axis is labelled
# 'Count' rather than 'Density'.
sns.histplot(df['age'], kde=True, color='orange')
plt.title('Distribution of Age')
plt.xlabel('Age')
plt.ylabel('Count')
plt.show()

In [None]:
import seaborn as sns

plt.style.use("classic")

# Kernel density estimate of heart rate.
# fill=True replaces the deprecated shade=True keyword of seaborn.kdeplot.
plt.figure(figsize=(10, 6))
sns.kdeplot(df['heart_rate'].dropna(), fill=True)
plt.title('Kernel Density Estimate of Heart Rate')
plt.xlabel('Heart Rate')
plt.ylabel('Density')
plt.show()

In [None]:
# Number of rows per value of the 'hypertensive' flag
fig, ax = plt.subplots(figsize=(10, 6))
df['hypertensive'].value_counts().plot(kind='bar', ax=ax)
ax.set_title('Frequency Distribution of Hypertension')
ax.set_xlabel('Hypertensive')
ax.set_ylabel('Count')
plt.show()

In [None]:
# Mean of 'outcome' within each 'atrialfibrillation' group; this is read below
# as the conditional probability of outcome = 1 given the group.
cond_probs = df.groupby('atrialfibrillation')['outcome'].mean()

plt.style.use("ggplot")

plt.figure(figsize=(10, 6))
cond_probs.plot(kind='bar',color="brown")
plt.title('Conditional Probability of Outcome=1 Given atrialfibrillation')
plt.xlabel('atrialfibrillation')
# Axis label spelling corrected ("Probablity" -> "Probability")
plt.ylabel('Probability(Outcome=1/atrialfibrillation)')
plt.show()


In [None]:
import matplotlib.pyplot as plt
import pandas as pd

# Contingency table: rows are 'hypertensive' values, columns are 'diabetes' values
crosstab = pd.crosstab(df['hypertensive'], df['diabetes'])

# One group of bars per hypertensive value, one bar per diabetes value
positions = list(range(len(crosstab)))
bar_width = 0.35

plt.figure(figsize=(12, 8))

# Bar i of a group is centred at p + bar_width * i
for i, column in enumerate(crosstab.columns):
    plt.bar([p + bar_width * i for p in positions], crosstab[column], width=bar_width, label=f'Diabetes {column}')

plt.xlabel('Hypertensive')
plt.ylabel('Count')
plt.title('Interaction between Hypertension and Diabetes')

# Bars are centred at offsets 0 .. bar_width * (k - 1), so the centre of a
# group of k bars is at bar_width * (k - 1) / 2. Using k / 2 would put the
# tick half a bar width to the right of the group centre.
group_centre_offset = bar_width * (len(crosstab.columns) - 1) / 2
plt.xticks([p + group_centre_offset for p in positions], crosstab.index)
plt.legend(title='Diabetes')


plt.show()

In [None]:
# Row counts per 'gendera' value, split by outcome
ax = sns.countplot(data=df, x='gendera', hue='outcome', palette='Set2')
ax.set_title('Gender vs. Outcome')
ax.set_xlabel('Gender')
ax.set_ylabel('Frequency')
plt.show()

In [None]:
import seaborn as sns

# Pairwise scatter plots (with per-feature distributions on the diagonal)
pairplot_columns = ['age', 'BMI', 'heart_rate', 'Systolic_blood_pressure', 'Diastolic_blood_pressure']
sns.pairplot(df[pairplot_columns])
plt.show()

In [None]:
# Correlations among a handful of selected features, on a fixed [-1, 1] colour scale
selected_columns = ['age', 'BMI', 'heart_rate', 'Systolic_blood_pressure', 'Diastolic_blood_pressure']
correlation_matrix = df[selected_columns].corr()

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title('Heatmap of Correlation Matrix')
plt.show()

In [None]:
# Distribution of age within each diabetes group
fig, ax = plt.subplots(figsize=(12, 6))
sns.violinplot(data=df, x='diabetes', y='age', ax=ax)
ax.set_title('Violin Plot of Age by Diabetes Status')
ax.set_xlabel('Diabetes')
ax.set_ylabel('Age')
plt.show()

In [None]:
# Overlaid density estimates of NT-proBNP for each outcome class.
# fill=True replaces the deprecated shade=True keyword of seaborn.kdeplot.
plt.figure(figsize=(12, 6))
sns.kdeplot(df[df['outcome'] == 0]['NT-proBNP'], label='Outcome=0', fill=True)
sns.kdeplot(df[df['outcome'] == 1]['NT-proBNP'], label='Outcome=1', fill=True)
plt.title('KDE Plot of NT-proBNP by Outcome')
plt.xlabel('NT-proBNP')
plt.ylabel('Density')
plt.legend()
plt.show()

In [None]:
# Age against heart rate, coloured by outcome
fig, ax = plt.subplots(figsize=(12, 6))
sns.scatterplot(data=df, x='age', y='heart_rate', hue='outcome', ax=ax)
ax.set_title('Scatter Plot of Age and Heart Rate by Outcome')
ax.set_xlabel('Age')
ax.set_ylabel('Heart Rate')
ax.legend(title='Outcome')
plt.show()


### Statistical Analysis

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import scipy.stats as stats

# The numerical column names are the continuous features inspected for normality
continuous_features = Numerical_Vars_L


def qq_plot(data, feature_name):
    """Draw and show a normal probability (Q-Q) plot of `data`.

    `data` is passed to scipy.stats.probplot with dist="norm";
    `feature_name` is used only in the figure title.
    """
    stats.probplot(data, dist="norm", plot=plt)
    plt.title(f'Q-Q Plot for {feature_name}')
    plt.show()


# One Q-Q plot per numerical feature, with missing values left out
for feature in continuous_features:
    qq_plot(df[feature].dropna(), feature)



In [None]:
continuous_features

In [None]:
# Continuous features treated as approximately normal (presumably chosen from
# the Q-Q plots above -- confirm). The t-test cell below iterates over this list.
continuous_features_normal=["heart_rate",'Systolic_blood_pressure','Diastolic_blood_pressure','Respiratory_rate', "MCH","MCHC",
                            'Platelets','Blood_calcium', "Chloride","Anion_gap","Magnesium_ion","Bicarbonate"]

In [None]:
# Every continuous feature not listed as normal is treated as skewed
# (the original column order is preserved).
continuous_features_skewed = [
    name for name in continuous_features
    if name not in continuous_features_normal
]

In [None]:
import pandas as pd


# Summary table of how the continuous features are grouped for hypothesis
# testing. It is derived from continuous_features_normal and
# continuous_features_skewed -- the lists that the t-test and Mann-Whitney cells
# iterate over -- so the table always matches the tests that are actually run.
# A separate name is used so `continuous_features` stays the list of numerical
# column names and the cell that derives the skewed list remains re-runnable.
distribution_groups = {
    "Normal Distribution": list(continuous_features_normal),
    "Skewed Distribution": list(continuous_features_skewed),
}

# Series-wrapping pads the shorter group with NaN; blank those out for display
features_df = pd.DataFrame(
    {label: pd.Series(columns) for label, columns in distribution_groups.items()}
).fillna('')

# Add cell borders and hide the index column
styled_features_df = features_df.style.set_table_styles([
    {'selector': 'th', 'props': [('border', '2px solid black')]},
    {'selector': 'td', 'props': [('border', '1px solid black')]}
]).hide(axis='index')

styled_features_df

In [None]:
import scipy.stats as stats
import pandas as pd


# Independent two-sample t-test for each approximately-normal feature,
# comparing the outcome == 0 rows with the outcome == 1 rows.
outcome_is_0 = df['outcome'] == 0
outcome_is_1 = df['outcome'] == 1

t_test_results = []

for feature in continuous_features_normal:
    group0 = df.loc[outcome_is_0, feature].dropna()
    group1 = df.loc[outcome_is_1, feature].dropna()

    t_stat, p_value = stats.ttest_ind(group0, group1)

    # Store full-precision floats (as the Mann-Whitney cell does); rounding is
    # applied only at display time by the Styler format below, so very small
    # p-values are not collapsed to 0.0 in the stored results.
    t_test_results.append({
        'Feature': feature,
        't-statistic': t_stat,
        'p-value': p_value
    })

t_test_df = pd.DataFrame(t_test_results)

# Bordered table, values shown to 5 decimal places, index column hidden
styled_t_test_df = t_test_df.style.set_table_styles([
    {'selector': 'th', 'props': [('border', '2px solid black'), ('background-color', '#f2f2f2')]},
    {'selector': 'td', 'props': [('border', '1px solid black')]}
]).format({
    't-statistic': '{:.5f}',
    'p-value': '{:.5f}'
}).hide(axis='index')


styled_t_test_df


In [None]:
import matplotlib.pyplot as plt

features = t_test_df['Feature']
p_values = t_test_df['p-value']

# Vertical bars: one per feature, height = t-test p-value
fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.bar(features, p_values, color='blue')

# Dashed red cut-off line at p-value = 0.50
ax.axhline(y=0.50, color='red', linestyle='--', linewidth=2)

ax.set_xlabel('Features')
ax.set_ylabel('p-value')
ax.set_title('p-values of Features with Cut-off Line')
plt.xticks(rotation=90)  # vertical feature names so they do not overlap
ax.set_ylim(0, max(p_values) + 0.1)  # head-room above the tallest bar for its label

# Print each bar's p-value just above it
for bar in bars:
    yval = bar.get_height()
    ax.text(bar.get_x() + bar.get_width()/2, yval + 0.02, round(yval, 5), ha='center', va='bottom')

plt.tight_layout()
plt.show()

In [None]:
continuous_features_skewed

In [None]:
import pandas as pd
from scipy.stats import mannwhitneyu


# Mann-Whitney U test for each skewed feature, comparing the outcome == 0 rows
# with the outcome == 1 rows.
mann_whitney_results = []


for feature in continuous_features_skewed:
    group0 = df[df['outcome'] == 0][feature].dropna()  # Group 0 (Outcome = 0)
    group1 = df[df['outcome'] == 1][feature].dropna()  # Group 1 (Outcome = 1)

    # The alternative is passed explicitly so the reported p-value is always
    # the two-sided one, independent of the installed SciPy version's default.
    stat, p_value = mannwhitneyu(group0, group1, alternative='two-sided')

    # Keep the p-value as a full-precision float; formatting happens at display time
    mann_whitney_results.append({
        'Feature': feature,
        'Mann-Whitney U Test p-value': p_value
    })


mann_whitney_results_df = pd.DataFrame(mann_whitney_results)

# Bordered table, p-values shown to 5 decimal places, index column hidden
styled_mann_whitney_df = mann_whitney_results_df.style.set_table_styles([
    {'selector': 'th', 'props': [('border', '2px solid black'), ('background-color', '#f2f2f2')]},
    {'selector': 'td', 'props': [('border', '1px solid black')]}
]).format({
    'Mann-Whitney U Test p-value': '{:.5f}'
}).hide(axis='index')


styled_mann_whitney_df

In [None]:
import matplotlib.pyplot as plt


features = mann_whitney_results_df['Feature']
p_values = mann_whitney_results_df['Mann-Whitney U Test p-value']

# Horizontal bars: one per feature, length = Mann-Whitney p-value
fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(features, p_values, color='blue')

# Dashed red vertical cut-off line at p-value = 0.50
ax.axvline(x=0.50, color='red', linestyle='--', label='p-value = 0.50')

ax.set_xlabel('p-value')
ax.set_ylabel('Feature')
ax.set_title('Mann-Whitney U Test p-values by Feature')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
import pandas as pd
from scipy.stats import chi2_contingency


# Chi-square test of independence between each categorical feature and the
# outcome. Categorical_Vars also contains the 'outcome' column itself; testing
# the target against itself is meaningless, so it is excluded here.
chi_square_features = [col for col in Categorical_Vars.columns if col != 'outcome']

chi_square_results = []

for feature in chi_square_features:
    # Observed counts of feature value x outcome value
    contingency_table = pd.crosstab(df[feature], df['outcome'])

    chi2_stat, p_value, dof, expected = chi2_contingency(contingency_table)

    # Store full-precision floats (as the Mann-Whitney cell does); rounding is
    # applied only at display time by the Styler format below.
    chi_square_results.append({
        'Feature': feature,
        'Chi-Square statistic': chi2_stat,
        'p-value': p_value
    })


chi_square_df = pd.DataFrame(chi_square_results)

# Bordered table, values shown to 3 decimal places, index column hidden
styled_chi_square_df = chi_square_df.style.set_table_styles([
    {'selector': 'th', 'props': [('border', '2px solid black'), ('background-color', '#f2f2f2')]},
    {'selector': 'td', 'props': [('border', '1px solid black')]}
]).format({
    'Chi-Square statistic': '{:.3f}',
    'p-value': '{:.3f}'
}).hide(axis='index')


styled_chi_square_df

In [None]:
import matplotlib.pyplot as plt


features = chi_square_df['Feature']
p_values = chi_square_df['p-value']

# Horizontal bars: one per feature, length = chi-square p-value
fig, ax = plt.subplots(figsize=(12, 8))
bars = ax.barh(features, p_values, color='blue')

# Dashed red vertical cut-off line at p-value = 0.50
ax.axvline(x=0.50, color='red', linestyle='--', label='p-value = 0.50')

ax.set_xlabel('p-value')
ax.set_ylabel('Feature')
ax.set_title('Chi-Square Test p-values by Feature')
ax.legend()

plt.tight_layout()
plt.show()

In [None]:
## Remove the columns rejected during feature selection
Drop = ['CHD_with_no_MI', 'hematocrit', 'Creatine_kinase', 'glucose', 'MCH']
df = df.drop(columns=Drop)

In [None]:
df.shape

In [None]:
## Saving the dataframe after feature selection
df.to_csv('prepared_df.csv', index=False)