In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
import utils.dimension_reduction as dr
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from operator import itemgetter
from tabulate import tabulate
import utils.eval_metrics as em
from sklearn import tree
from sklearn import metrics
import utils.learning_curve as lc
import utils.cross_validation as cv
import utils.oversampling as osm
import pickle

pd.set_option('display.max_columns', None)

In [None]:
data = pd.read_csv("final_dataset.csv")

In [None]:
data.head()

In [None]:
# Remove the SEQN column before building the feature matrix.
# Rebinding (instead of inplace=True) together with errors='ignore' keeps this
# cell safe to re-run: a second execution no longer raises KeyError.
data = data.drop(columns=['SEQN'], errors='ignore')

In [None]:
def show_value_counts(data):
    """Print the value counts of every column that has more than 3 distinct values.

    Columns with 3 or fewer distinct (non-null) values are skipped. Nothing is
    returned; the output goes to stdout.
    """
    for name in data.columns:
        series = data[name]
        # Skip low-cardinality columns.
        if series.nunique() <= 3:
            continue
        print(f"Column: {name}")
        print(series.value_counts())
        print("\n")

In [None]:
show_value_counts(data)

In [None]:
def get_missing_val_count_df(df):
    """Summarise which columns of ``df`` contain missing values.

    Returns a DataFrame with the columns 'Feature_with_missing_value' and
    'Missing_values' (the null count), restricted to the features that have at
    least one missing value.
    """
    null_counts = df.isna().sum()
    summary = pd.DataFrame(
        {
            'Feature_with_missing_value': null_counts.index,
            'Missing_values': null_counts.values,
        }
    )
    has_missing = summary['Missing_values'].gt(0)
    return summary[has_missing]

In [None]:
msdf = get_missing_val_count_df(data)
msdf

In [None]:
missing_column_names = msdf['Feature_with_missing_value'].tolist()

In [None]:
data.shape

In [None]:
# Separate the target column (UACR) from the remaining feature columns.
y = data['UACR']
X = data.drop(columns=['UACR'])

In [None]:
X.shape, y.shape

In [None]:
# Fill missing feature values with a KNN imputer: 3 neighbours, uniform
# neighbour weights, NaN-aware Euclidean distance.
imputer = KNNImputer(n_neighbors=3, weights='uniform', metric='nan_euclidean')
X = imputer.fit_transform(X)
X.shape

In [None]:
# Re-wrap the imputed values in a DataFrame with the feature names restored.
# The names come from the same selection that built X (every column except
# UACR) rather than from position: slicing off "the last column" would silently
# mislabel features if UACR were not the last column of the dataset.
X_columns = data.drop(columns=['UACR'])
X = pd.DataFrame(X, columns=X_columns.columns)

In [None]:
X.shape

In [None]:
def process_columns(df, column_names):
    """Binarise the given columns of ``df`` in place using a 0.5 threshold.

    Values strictly below 0.5 become 0; everything else becomes 1. This includes
    NaN, because ``NaN < 0.5`` is False.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to modify in place. Any index is accepted; rows are selected with
        a boolean mask rather than by the integer labels 0..n-1.
    column_names : iterable
        Labels of the columns to binarise.

    Returns
    -------
    None
    """
    for column_name in column_names:
        # Compute the mask once, before any write, so both assignments are
        # based on the original values of the column.
        below_threshold = df[column_name] < 0.5
        df.loc[below_threshold, column_name] = 0
        df.loc[~below_threshold, column_name] = 1

In [None]:
process_columns(X, missing_column_names)

In [None]:
X.isna().sum().sum()# no missing values

In [None]:
X.shape, y.shape

In [None]:
# Oversample X / y through the project helper (method='adasyn', sampling_strategy=0.75).
# NOTE(review): this runs on the whole dataset, before the train/validation/test
# splits further down, so rows produced by the oversampler can land in the
# held-out sets — consider splitting first and oversampling only the training rows.
X_os,y_os = osm.oversample_data(X, y, method='adasyn', sampling_strategy=0.75)

In [None]:
X_os.shape, y_os.shape

In [None]:
X_significant = dr.get_p_values_significant_features(X_os,y_os)

In [None]:
X_rfe = dr.dim_red_using_rfe(X_significant, y_os, 30)

In [None]:
# Hold out 20% of the rows as the test set; the fixed random_state makes the split repeatable.
X_train_full, X_test, y_train_full, y_test = train_test_split(X_rfe, y_os, test_size=0.2, random_state=2)

In [None]:
# Split the remaining rows once more: 20% of them become the validation set.
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=2)

In [None]:
# Report the shape of each split (features first, then target).
for _label, _features, _target in (
    ('train data(X_train,y_train) shape: ', X_train, y_train),
    ('validation data(X_valid,y_valid) shape: ', X_valid, y_valid),
    ('test data(X_test,y_test) shape: ', X_test, y_test),
):
    print(_label, _features.shape, _target.shape)

## SVM Baseline

In [None]:
from sklearn.svm import SVC

In [None]:
# Train the baseline classifier: an SVC constructed with no arguments,
# i.e. the library's default hyperparameters.

svc_model = SVC()
svc_model.fit(X_train, y_train)

In [None]:
# Predict the model

y_pred_baseline_svm = svc_model.predict(X_valid)

In [None]:
em.metrics_calculation(y_valid, y_pred_baseline_svm, True)

In [None]:
svc_model.get_params()

### After Feature transformation - Standardization

In [None]:
# Standardize the features in X_rfe with StandardScaler.
# NOTE(review): the scaler is fit on every row, including the rows that the
# splits below hold out for validation and test — consider fitting it on the
# training split only and reusing it to transform the other splits.
scaler = StandardScaler()
X_std = scaler.fit_transform(X_rfe)

In [None]:
X_train_full, X_test, y_train_full, y_test = train_test_split(X_std, y_os, test_size=0.2, random_state=2)

In [None]:
X_train, X_valid, y_train, y_valid = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=2)

In [None]:
# Report the shape of each split (features first, then target).
for _label, _features, _target in (
    ('train data(X_train,y_train) shape: ', X_train, y_train),
    ('validation data(X_valid,y_valid) shape: ', X_valid, y_valid),
    ('test data(X_test,y_test) shape: ', X_test, y_test),
):
    print(_label, _features.shape, _target.shape)

In [None]:
from sklearn.svm import SVC

In [None]:
svc_model = SVC()

In [None]:
# Train classifier

svc_model.fit(X_train, y_train)

In [None]:
# Predict the model

y_pred_baseline_svm = svc_model.predict(X_valid)

In [None]:
em.metrics_calculation(y_valid, y_pred_baseline_svm, True)

In [None]:
svc_model.get_params()

### Hyperparameter tuning

In [None]:
# Grid-search-style helper that evaluates SVM hyperparameter combinations (C and kernel)

def svm_grid_search(C, kernel, params, X_train, X_test, y_train, y_test):
    """Fit and score one SVC for every (C, kernel) combination.

    Parameters
    ----------
    C : sequence
        Candidate regularization values.
    kernel : sequence
        Candidate kernel names.
    params : list
        Output list; one result dict is appended per combination, with the keys
        'Regularization_parameter', 'Kernel', 'accuracy', 'F1_score',
        'precision' and 'recall' (F1, precision and recall are macro-averaged).
    X_train, y_train
        Data each candidate model is fitted on.
    X_test, y_test
        Data each candidate model is scored on.

    Returns
    -------
    None
        Results are reported by appending to ``params`` and by printing one
        summary line per combination.
    """
    for C_val in C:
        for kernel_val in kernel:
            # Fitting and scoring must happen inside the inner loop so every
            # kernel is evaluated for every C, not only the last kernel listed.
            svc_model = SVC(C=C_val, kernel=kernel_val, random_state=2023)
            svc_model.fit(X_train, y_train)
            y_pred = svc_model.predict(X_test)

            accuracy = metrics.accuracy_score(y_test, y_pred)
            F1_score = metrics.f1_score(y_test, y_pred, average='macro')
            precision = metrics.precision_score(y_test, y_pred, average='macro')
            recall = metrics.recall_score(y_test, y_pred, average='macro')

            params.append({
                'Regularization_parameter': C_val,
                'Kernel': kernel_val,
                'accuracy': accuracy,
                'F1_score': F1_score,
                'precision': precision,
                'recall': recall,
            })

            print(f'Regularization_parameter:{C_val}, Kernel:{kernel_val}, accuracy:{accuracy}, F1_score:{F1_score}')

In [None]:
# Candidate values for the grid search, plus the list that collects its results.
C = [1, 10, 100]
kernel = ['linear', 'rbf']
params = []

In [None]:
svm_grid_search(C, kernel, params, X_train, X_valid, y_train, y_valid)

In [None]:
# Rank every evaluated configuration by its F1 score, best first.

param_sorted = sorted(params, key=lambda entry: entry['F1_score'], reverse=True)
df = pd.DataFrame.from_dict(param_sorted)
print(tabulate(df, headers = 'keys', tablefmt = 'psql'))

# The best-scoring configuration is the first entry of the ranked list.

opt_hparams = param_sorted[0]
_banner = '**'*60
print(_banner, f'\noptimal hyper-parameters: {opt_hparams}\n', _banner, sep='\n')

### Cross validation

In [None]:
# Check the robustness of the tuned model with k-fold cross-validation on the
# full training split (5 is passed to the project helper, presumably as k).
# The hyperparameters are read from the grid-search winner (opt_hparams) instead
# of being hard-coded, so this cell always matches the search results.

svm_model_tuned = SVC(
    C=opt_hparams['Regularization_parameter'],
    kernel=opt_hparams['Kernel'],
    random_state=2023,
)
svm_model_tuned.fit(X_train_full, y_train_full)
cv.cross_validation_func(svm_model_tuned, X_train_full, y_train_full, 5)

### Testing SVM with optimal hyperparameters on unseen data

In [None]:
# Refit on the full training split using the grid-search winner (opt_hparams)
# rather than hard-coded values, predict on the held-out test set, and plot
# the learning curve through the project helper.
svc_model_opt_p_rfe_adasyn = SVC(
    C=opt_hparams['Regularization_parameter'],
    kernel=opt_hparams['Kernel'],
    random_state=2023,
)
svc_model_opt_p_rfe_adasyn.fit(X_train_full, y_train_full)
y_pred_opt_SVC = svc_model_opt_p_rfe_adasyn.predict(X_test)
lc.plot_learning_curve(svc_model_opt_p_rfe_adasyn, X_train_full, y_train_full, cv=5)

In [None]:
em.metrics_calculation(y_test, y_pred_opt_SVC, True)

In [None]:
# Persist the fitted model to disk. The context manager guarantees the file
# handle is flushed and closed even if pickling fails.
filename = 'svc_model_opt_p_rfe_adasyn.sav'
with open(filename, 'wb') as model_file:
    pickle.dump(svc_model_opt_p_rfe_adasyn, model_file)

In [None]:
# Reload the saved model and re-score it on the test set as a round-trip check.
# NOTE: pickle.load can execute arbitrary code; only load files you created or trust.
with open(filename, 'rb') as model_file:
    loaded_model = pickle.load(model_file)
y_pred = loaded_model.predict(X_test)
em.metrics_calculation(y_test, y_pred, True)