# Importing Libraries


In [None]:
# Importing Libraries for ML

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
from scipy.stats import ttest_ind

from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from scipy.stats import yeojohnson

from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.datasets import make_classification
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import FunctionTransformer, PowerTransformer

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

import warnings
warnings.filterwarnings("ignore")

# Importing Cleaned Data

In [None]:
# Load the cleaned dataset; the path is relative to the notebook's working directory
df = pd.read_csv('cleaned_data.csv')
# Preview the first three rows
df.head(3)

# 1. Feature Selection

## Selecting the features based upon mutual information gain in classification

In [None]:
# Separate the target column ('status') from the predictor columns
y = df['status']
x = df.drop(columns='status')

In [None]:
# Select features by their estimated mutual information with the classification target.
# Scores are non-negative (0 means independent); a higher score means a more informative feature.

def select_features_mutual_info_classification(features, target, threashold, random_state=None):
    """Return the features whose mutual information with the target exceeds a cut-off.

    Args:
        features: DataFrame of candidate predictor columns.
        target: Class labels aligned with the rows of ``features``.
        threashold: Minimum score (exclusive) a feature needs to be kept.
        random_state: Optional seed forwarded to ``mutual_info_classif``. The
            default ``None`` keeps the previous unseeded behaviour; pass an
            integer to make the scores repeatable between runs.

    Returns:
        A Series of scores indexed by feature name, sorted from highest to
        lowest and containing only the features scoring above ``threashold``.
    """
    scores = mutual_info_classif(features, target, random_state=random_state)
    ranked = pd.Series(scores, index=features.columns).sort_values(ascending=False)
    return ranked[ranked > threashold]

# Keep only the features whose mutual information score is above 0.1
top_features = select_features_mutual_info_classification(x, y, 0.1)
top_features

In [None]:
# Names of the selected features, in descending score order
list(top_features.index)

In [None]:
# Restrict the data to the selected features (the 'status' target is not part of this frame)
df_deleted1 = df[list(top_features.index)]
df_deleted1.head()

## Deleting the unnecessary columns with correlation value

In [None]:
# Correlation matrix of the selected features, drawn as an annotated heatmap
plt.figure(figsize=(15, 15))
sns.heatmap(df_deleted1.corr(), annot=True)

In [None]:
threshold = 0.9

# Find redundant features.
# Two columns whose absolute correlation is above the threshold are treated as carrying
# the same information, so the later column of each such pair is reported for removal.

def columns_to_delete_due_to_high_correlation(dataset, threshold):
    """Return the names of columns that are highly correlated with an earlier column.

    Args:
        dataset: DataFrame whose pairwise column correlations are examined.
        threshold: Absolute correlation above which a pair counts as redundant.

    Returns:
        A set of column names; for every pair exceeding the threshold, the
        column that appears later in ``dataset`` is included.
    """
    corr_matrix = dataset.corr()
    exceeds = corr_matrix.abs().to_numpy() > threshold
    # Only pairs (i, j) with j < i matter, i.e. the part strictly below the diagonal
    below_diagonal = np.tril(exceeds, k=-1)
    flagged = below_diagonal.any(axis=1)
    return set(corr_matrix.columns[flagged])

In [None]:
# Columns flagged as redundant because of high correlation with another selected feature
cols_delete = columns_to_delete_due_to_high_correlation(df_deleted1, threshold)
list(cols_delete)

In [None]:
# Drop the highly correlated columns.
# NOTE(review): the drop is applied to the full `df`, not to `df_deleted1`, so columns
# that scored below the mutual-information cut-off are still present in the result —
# confirm this is intended.
df_removed_cols = df.drop(list(cols_delete), axis=1)
df_removed_cols.head()

In [None]:
# The 'name' column is not needed for modelling, so remove it
df_removed_cols = df_removed_cols.drop(columns='name')
df_removed_cols.head()

In [None]:
# Columns remaining after the removals above
df_removed_cols.columns

### Conclusion - 

- We used mutual information gain for classification to identify the features that are most informative about the dependent variable (`status`).
- We then removed the columns whose mutual information score is lower than 0.1.
- Finally, we removed columns that are highly correlated with another column (absolute correlation above 0.9), on the basis that two highly correlated columns carry nearly the same information and weight for the output.

- We have selected the below columns for further analysis and modelling

- ['MDVP:Fo(Hz)', 'MDVP:Fhi(Hz)', 'MDVP:Flo(Hz)', 'MDVP:Jitter(%)',
       'MDVP:RAP', 'MDVP:PPQ', 'Jitter:DDP', 'MDVP:APQ', 'NHR', 'HNR',
       'status', 'RPDE', 'DFA', 'spread2', 'D2', 'PPE']

# 2. Imbalanced Data

In [None]:
# Split the rows by class label to inspect the class balance
positive = df_removed_cols.loc[df_removed_cols["status"].eq(1)]
negative = df_removed_cols.loc[df_removed_cols["status"].eq(0)]

# (rows, columns) of each class subset
print(positive.shape, negative.shape)

# histogram of the label column shows the record count per class
df_removed_cols["status"].hist()

### From above we can conclude that data is imbalanced.
As we can see, the dataset has 140 records for positive outcomes but only 49 records for negative outcomes. Hence we will upsample the negative class.

### Upsampling -

- We do not reduce the number of records in the majority class; instead, we increase the number of records in the minority class.

In [None]:
# Separate predictors and target before resampling
x = df_removed_cols.drop(["status"], axis=1)
y = df_removed_cols["status"]

print(x.shape, y.shape)

# Randomly duplicate minority-class rows to balance the classes.
# A fixed seed makes the resampled dataset identical between runs.
# NOTE(review): resampling happens before any train/test split visible here; if a split
# follows, duplicated rows can end up on both sides of it — confirm this is acceptable.
over_sampler = RandomOverSampler(random_state=42)
X_res, Y_res = over_sampler.fit_resample(x, y)

In [None]:
# Re-attach the target to the resampled features.
# Work on a copy: assigning into an alias of X_res would also add the 'status'
# column to X_res, which would then no longer be a pure feature matrix.
df_upsample = X_res.copy()
df_upsample['status'] = Y_res
df_upsample.head()

In [None]:
# Split the resampled rows by class label to check the balance after upsampling
positive = df_upsample.loc[df_upsample["status"].eq(1)]
negative = df_upsample.loc[df_upsample["status"].eq(0)]

# (rows, columns) of each class subset
print(positive.shape, negative.shape)

# histogram of the label column shows the record count per class
df_upsample["status"].hist()

# 3. Feature Transformation

In [None]:
# Summarise every column: distinct-value count, dtype and a categorical/continuous label
def find_categorical_continous_variables(data, threashold):
    """Classify each column of ``data`` as categorical or continuous.

    A column is categorical when it has object dtype or at most ``threashold``
    distinct values; every other column is continuous.

    Args:
        data: DataFrame to inspect.
        threashold: Largest distinct-value count still treated as categorical.

    Returns:
        A tuple ``(summary, categorical_cols, continus_cols)`` where ``summary``
        has one row per column (distinct count, dtype, label) and the two lists
        hold the column names of each kind in their original order.
    """
    summary = {}
    categorical_cols, continus_cols = [], []

    for name in data.columns:
        series = data[name]
        distinct = len(series.unique())

        if series.dtype == 'O' or distinct <= threashold:
            label = "Categorical column"
            categorical_cols.append(name)
        else:
            label = "Continous Column"
            continus_cols.append(name)

        summary[name] = [distinct, series.dtype, label]

    # one column per inspected feature here; transposed on return so each feature is a row
    overview = pd.DataFrame(summary, index=["Unique values", "Data Type", "Categorical/Continous"])
    return overview.T, categorical_cols, continus_cols

def _classify_by_band(values, half_width, below, inside, above):
    """Label values as below / inside / above the band (-half_width, half_width); NaN stays NaN."""
    # Start from an object column so string labels never need a dtype upcast
    labels = pd.Series(np.nan, index=values.index, dtype=object)
    labels[values <= -half_width] = below
    labels[values >= half_width] = above
    labels[(values > -half_width) & (values < half_width)] = inside
    return labels


# Function to compute descriptive statistics for the given continuous columns
def descriptive_statistics_continous(df, continus_cols):
    """Compute descriptive statistics and shape labels for continuous columns.

    Args:
        df: DataFrame holding the data.
        continus_cols: Names of the columns to describe.

    Returns:
        A DataFrame with one row per column and the statistics "Mean",
        "Median", "Mode", "Std Deviation", "Variance", "Range", "IQR", "Skew"
        and "Kurtosis", plus two label columns:

        * "Skeness": "Left/Negative Skew" (skew <= -0.5), "Right/Positive Skew"
          (skew >= 0.5) or "Symmetric".
        * "Kurtosis_type": "Platykurtic", "Leptokurtic" or "Mesokurtic".

        A label is NaN when its statistic is NaN.
    """
    stats = {}
    for col in continus_cols:
        series = df[col]
        stats[col] = [
            series.mean(),
            series.median(),
            series.mode()[0],  # first (smallest) mode when several values tie
            series.std(),
            series.var(),
            series.max() - series.min(),
            series.quantile(0.75) - series.quantile(0.25),
            series.skew(),
            series.kurtosis(),
        ]

    df_details = pd.DataFrame(
        stats,
        index=["Mean", "Median", "Mode", "Std Deviation", "Variance", "Range", "IQR", "Skew", "Kurtosis"],
    ).T

    # find out the skewness
    df_details["Skeness"] = _classify_by_band(
        df_details["Skew"], 0.5, "Left/Negative Skew", "Symmetric", "Right/Positive Skew"
    )

    # find out the kurtosis type.
    # pandas reports *excess* kurtosis (a normal distribution scores 0, not 3), so the
    # mesokurtic band is centred on 0; this equals 2.5..3.5 on the Pearson scale.
    df_details["Kurtosis_type"] = _classify_by_band(
        df_details["Kurtosis"], 0.5, "Platykurtic", "Mesokurtic", "Leptokurtic"
    )

    return df_details

# NOTE(review): `df_temp` was never assigned in the visible notebook, so this cell raised
# NameError on a fresh kernel. It is bound here to a copy of the upsampled frame, the most
# recent dataset at this point — confirm that this is the intended input.
df_temp = df_upsample.copy()

# Unique counts, dtypes and the categorical/continuous label for every column, plus the
# lists of categorical and continuous column names (at most 10 distinct values => categorical)
col_type_df, categorical_cols, continus_cols = find_categorical_continous_variables(df_temp, 10)

# Get the descriptive statistics of the continuous columns
df_distribution = descriptive_statistics_continous(df_temp, continus_cols)
df_distribution

In [None]:
def plot_distribution_numerical(df):
    """Plot a histogram, boxplot and violin plot for every numeric column of ``df``.

    One row of three subplots is drawn per numeric column. Nothing is drawn
    when ``df`` has no numeric columns.

    Args:
        df: DataFrame whose numeric columns are visualised.
    """
    features = df.select_dtypes(include='number').columns
    if len(features) == 0:
        return

    # One row per *numeric* feature (non-numeric columns would otherwise leave empty rows);
    # squeeze=False keeps `axs` two-dimensional even when there is a single row.
    fig, axs = plt.subplots(
        len(features), 3, figsize=(20, 4 * len(features)), squeeze=False
    )

    for row, feature in enumerate(features):
        axs[row, 0].set_title("Histogram for {}".format(feature))
        sns.histplot(data=df, x=feature, kde=True, color="red", ax=axs[row, 0])

        axs[row, 1].set_title("Boxplot for {}".format(feature))
        sns.boxplot(data=df, x=feature, color="skyblue", ax=axs[row, 1])

        axs[row, 2].set_title("Vaiolinplot for {}".format(feature))
        sns.violinplot(data=df, x=feature, color="lightgreen", ax=axs[row, 2])

    # A figure-level title; plt.title would overwrite the title of the last subplot
    fig.suptitle("Histogram, Boxplot and Violinplots for all variables")
    # leave a little room at the top for the figure title
    fig.tight_layout(rect=(0, 0, 1, 0.98))
    plt.show()
    
# Plot the distribution of every numeric column of the working frame
plot_distribution_numerical(df_temp)

In [None]:
# Columns labelled right/positive skewed. The boolean mask has to index df_distribution
# itself; wrapping it in a bare list made `.index` the list method, not the column names.
right_skewed_columns = df_distribution[df_distribution["Skeness"]=='Right/Positive Skew'].index
print(right_skewed_columns)

In [None]:
# Columns whose skew label is 'Symmetric'
normal_distributed_columns = df_distribution[df_distribution["Skeness"]=='Symmetric'].index
print(normal_distributed_columns)

In [None]:
# Columns whose skew label is 'Left/Negative Skew'
left_skewed_columns= df_distribution[df_distribution["Skeness"]=='Left/Negative Skew'].index
print(left_skewed_columns)

In [None]:
# Function to choose and apply a feature transformation method for each column
def select_feature_transformation_methods(X):
    """Pick a transformation per column from simple value heuristics and apply it.

    Rules, checked in order for every numeric column:

    1. all values > 0 and range > 10      -> 'logarithmic' (``log1p``)
    2. max > 100 and range > 100          -> 'square'
    3. max < 10 and range > 10            -> 'reciprocal' (a zero value yields inf)
    4. otherwise a power transform: 'boxcox' when the data are strictly
       positive and the fitted Yeo-Johnson lambda is non-negative, else
       'yeo_johnson'. If the power transform raises ``ValueError`` the column
       falls back to 'square'.

    Non-numeric, boolean and empty columns are copied unchanged and recorded
    as 'none'. Pass feature columns only: a target column would be transformed
    like any other.

    Args:
        X: DataFrame of columns to transform.

    Returns:
        A tuple ``(transformation_methods, df_transformed)``: a dict mapping
        each column name to the chosen method, and a DataFrame of transformed
        values with the same index and column order as ``X``.
    """
    transformation_methods = {}
    # keep the caller's index so the result lines up with X row by row
    df_transformed = pd.DataFrame(index=X.index)

    for col_name in X.columns:
        column = X[col_name]
        is_transformable = (
            len(column) > 0
            and pd.api.types.is_numeric_dtype(column)
            and not pd.api.types.is_bool_dtype(column)
        )
        if not is_transformable:
            # comparisons and arithmetic below are undefined for these columns
            transformation_methods[col_name] = 'none'
            df_transformed[col_name] = column.to_numpy()
            continue

        column_data = column.to_numpy()
        all_positive = bool(np.all(column_data > 0))
        value_range = np.ptp(column_data)
        largest = np.max(column_data)

        # Only positive values and a wide range
        if all_positive and value_range > 10:
            transformation_methods[col_name] = 'logarithmic'
            df_transformed[col_name] = np.log1p(column_data)

        # Large values and heavy tails
        elif largest > 100 and value_range > 100:
            transformation_methods[col_name] = 'square'
            df_transformed[col_name] = np.square(column_data)

        # Small values and heavy tails
        elif largest < 10 and value_range > 10:
            transformation_methods[col_name] = 'reciprocal'
            # cast first: the reciprocal of an integer array truncates to 0
            df_transformed[col_name] = np.reciprocal(column_data.astype(float))

        # For other cases, use Box-Cox or Yeo-Johnson transformation
        else:
            try:
                # yeojohnson returns (transformed, lambda) when no lambda is given
                _, fitted_lambda = yeojohnson(column_data)
                # Box-Cox is only defined for strictly positive data
                if fitted_lambda >= 0 and all_positive:
                    method, sk_method = 'boxcox', 'box-cox'
                else:
                    method, sk_method = 'yeo_johnson', 'yeo-johnson'
                transformer = PowerTransformer(method=sk_method)
                values = transformer.fit_transform(column_data.reshape(-1, 1)).flatten()
            except ValueError:
                method, values = 'square', np.square(column_data)

            transformation_methods[col_name] = method
            df_transformed[col_name] = values

    return transformation_methods, df_transformed

# Choose and apply a transformation for every column.
# NOTE(review): `df` is the full frame loaded from the CSV, so it still contains the
# 'status' target and the columns dropped during feature selection, and it is not
# upsampled — confirm whether the upsampled feature frame should be passed instead.
transformation_methods, df_transformed = select_feature_transformation_methods(df)

In [None]:
# Method chosen for each column
transformation_methods

In [None]:
# Preview the transformed values
df_transformed.head()

In [None]:
# Plot the distributions again, this time for the transformed columns
plot_distribution_numerical(df_transformed)