In [1]:
# Importing libraries needed for importing the dataframes.
import pandas as pd
import glob
import os
from contextlib import contextmanager

# Importing preprocessing libraries
from sklearn.preprocessing import MaxAbsScaler, OneHotEncoder
from sklearn.impute import KNNImputer, SimpleImputer

# Importing pipelines
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Importing classifiers
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.dummy import DummyClassifier

# Validation
from sklearn.model_selection import cross_validate

# Importing scores
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
src_path = r".\Data\\Cleaned data\\"
script_path = os.getcwd()
extension = r"csv"

# Lists of which feature belongs to which dtype. Booleans will get treated as numerical features, i.e. 0 or 1.
categorical_features = ["sex", "restecg", "slope", "thal", "cp"]
integer_features = ["age", "trestbps", "chol", "thalach", "ca"]
float_features = ["oldpeak"]
boolean_features = ["fbs", "exang"]

num_cols = integer_features + float_features + boolean_features

# List of classifiers
models = {
         "SVC": SVC(),
         "KNN": KNeighborsClassifier(),
         "Decision Tree": DecisionTreeClassifier(),
         "Random Forest": RandomForestClassifier()}

# Scores that will be used
scores = ["accuracy", "precision", "recall", "f1"]

In [3]:
# Function to set dtypes after lists above.
def set_dtypes(df_dict):
    for df, name in df_dict.items():
        for column in name.columns:
            if column in categorical_features:
                name[column] = name[column].astype("category", copy=False)
            elif column in boolean_features:
                name[column] = name[column].astype("Int64", copy=False)
            elif column in float_features:
                name[column] = pd.to_numeric(name[column], downcast="float", errors="coerce")
            elif column in integer_features:
                name[column] = pd.to_numeric(name[column], downcast="float", errors="coerce")
                name[column] = name[column].astype("Int64", copy=False)
    return df_dict

def classifier_selection(preprocessing, models, df, df_y, scores):
    result_dict = {}
    
    for key, value in models.items():
        pipe = Pipeline([("preprocessing", preprocessing),
                        ("model", value)])
        
        results = cross_validate(pipe, df, df_y, scoring=scores)
        
        result_dict[key] = [results["test_accuracy"].mean(), results["test_precision"].mean(), results["test_recall"].mean(), results["test_f1"].mean()]
        
    return pd.DataFrame(result_dict, index=scores)

In [4]:
@contextmanager
def gen_list(file_path, extension):
    f = os.chdir(file_path)
    yield f
    os.chdir(script_path)
    
with gen_list(src_path, extension) as f:
    csv_list = glob.glob(r"*{}".format(extension))
    dataframe_names = []
    df_dict = {}
    iterator = 0
    for i in csv_list:
        dataframe_names.append(i[:-4]) # Saves the names of the csv files minus .csv and also changes the periods in the name to underscores.
    for j in dataframe_names:
        df_dict[j] = pd.read_csv(csv_list[iterator], sep=None, engine="python")
        
df_dict = set_dtypes(df_dict)
        
# Squishes all Dataframes num column to 0 or 1.
for df, name in df_dict.items():
    name["num"].replace([2, 3, 4], 1, inplace=True)
    
# Concatenating all to one dataframe.
df = pd.concat(df_dict.values(), ignore_index=True)

# Separating target from dataframe
df_y = df["num"]
df.drop(labels="num", axis=1, inplace=True)

In [5]:
# Setting up pipelines for classifiers using simple imputer.

cat_transformer = Pipeline([("imputing", SimpleImputer(strategy="most_frequent")),
                           ("encoding", OneHotEncoder(handle_unknown="ignore", drop="if_binary"))])
num_transformer = Pipeline([("imputing", SimpleImputer(strategy="most_frequent")),
                           ("scaling", MaxAbsScaler())])

preprocessing1 = ColumnTransformer([("cat_cols", cat_transformer, categorical_features),
                                  ("num_cols", num_transformer, num_cols)])

In [6]:
results = classifier_selection(preprocessing1, models, df, df_y, scores)

results.head()

Unnamed: 0,SVC,KNN,Decision Tree,Random Forest
accuracy,0.911723,1.0,1.0,1.0
precision,0.912143,1.0,1.0,1.0
recall,0.893871,1.0,1.0,1.0
f1,0.902859,1.0,1.0,1.0
