## General Library Imports

In [None]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.decomposition import TruncatedSVD
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline


# libraries for models
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import AdaBoostClassifier, BaggingClassifier, GradientBoostingClassifier, RandomForestClassifier
from sklearn import svm
# metrics evaluation libraries
from sklearn.metrics import auc, classification_report, confusion_matrix, roc_curve, RocCurveDisplay

## Data loading

In [None]:
# Load the raw HR attrition dataset; the CSV path is relative to the notebook's working directory.
project_data = pd.read_csv("HR Employee Attrition.csv")

## Initial Analysis

In [None]:
# Preview the first rows to sanity-check the load.
project_data.head()

In [None]:
# Column dtypes and non-null counts.
project_data.info()

In [None]:
# Summary statistics of the raw frame.
project_data.describe()

In [None]:
# (rows, columns) of the raw frame.
project_data.shape

In [None]:
# Column labels of the raw frame.
project_data.columns

In [None]:
# Missing-value count per column.
project_data.isna().sum()

## Exploratory Data Analysis

### Univariate Analysis

In [None]:
# Columns stored as int64 are treated as the numeric features for the EDA plots.
numeric_columns = project_data.select_dtypes(include='int64').columns.tolist()
print(numeric_columns)

In [None]:
# One density plot per numeric column, each on its own 12x8 figure.
for column in numeric_columns:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.kdeplot(data=project_data, x=column, palette="crest", ax=ax)
    plt.show()

In [None]:
# Every column that is not int64 is treated as categorical for the EDA plots.
categorical_columns = project_data.select_dtypes(exclude='int64').columns.tolist()
print(categorical_columns)

In [None]:
# One bar chart of category frequencies per categorical column.
for column in categorical_columns:
    fig, ax = plt.subplots(figsize=(12, 8))
    sns.countplot(data=project_data, x=column, ax=ax)
    plt.show()

### Bivariate Analysis

In [None]:
# Apply the white-axes theme once, before any figure is created, so that every
# plot (including the first one saved) shares the same styling. Calling
# sns.set() after plotting only affects figures created afterwards.
sns.set(rc={'axes.facecolor': 'white'})

# Density of each numeric column split by Attrition; figures are saved as
# 1.png, 2.png, ... following the order of numeric_columns.
for i, column in enumerate(numeric_columns, start=1):
    plt.figure(figsize=(12, 8))
    sns.kdeplot(data=project_data, x=column, hue="Attrition", fill=True, alpha=.5, palette="crest")
    plt.savefig(str(i) + ".png")
    plt.show()

## Correlation Analysis

In [None]:
# Work on a copy so the raw frame keeps its original string categories.
df1 = project_data.copy()

# Integer-encode each categorical column (a fresh fit per column) so that
# DataFrame.corr() can include it.
for column in categorical_columns:
    encoder = LabelEncoder()
    df1[column] = encoder.fit_transform(df1[column])

plt.figure(figsize=(30, 12))
corr = df1.corr()
sns.heatmap(corr, annot=True, cmap="YlGnBu")
plt.savefig('corr.png')

## Data Preprocessing and Pipelining

In [None]:
# Features: every column except the target and three tenure-related columns.
# NOTE(review): the reason for dropping the three "Years..." columns is not
# stated in the notebook -- presumably redundancy with other tenure features; confirm.
X = project_data.drop(columns=["Attrition", "YearsWithCurrManager", "YearsSinceLastPromotion", "YearsInCurrentRole"])
y = project_data["Attrition"]

# A fixed random_state makes the split (and every downstream score) reproducible
# on Restart & Run All; stratify keeps the class ratio equal in both splits.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=12, stratify=y
)

In [None]:
# Report the size of each split (the second line previously printed y_train.shape).
print('Train dataset shape:', X_train.shape)
print('Test dataset shape', X_test.shape)

In [None]:
# Show only a small preview of the feature frame instead of dumping all of it.
X.head()

In [None]:
# Partition the training features by dtype: object columns are routed to the
# categorical pipeline, everything else to the numeric one.
categorical_columns = X_train.select_dtypes(include='object').columns
numeric_columns = X_train.select_dtypes(exclude='object').columns

print(numeric_columns, '*' * 100, categorical_columns, sep='\n')

In [None]:
# Numeric branch: median-impute missing values, then standardize.
numeric_features = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='median')),
    ('scaling', StandardScaler(with_mean=True))
])

print(numeric_features)
print('*'*100)

# Categorical branch: mode-impute, one-hot encode, then scale without centering
# (with_mean=False is required because the one-hot output is sparse).
# handle_unknown='ignore' encodes a category that was not seen during fit as an
# all-zero row instead of raising, which matters because the encoder is fitted
# on the training split / CV training folds only.
categorical_features = Pipeline([
    ('handlingmissingvalues', SimpleImputer(strategy='most_frequent')),
    ('encoding', OneHotEncoder(handle_unknown='ignore')),
    ('scaling', StandardScaler(with_mean=False))
])

print(categorical_features)

# Route each column group to its own branch.
processing = ColumnTransformer([
    ('numeric', numeric_features, numeric_columns),
    ('categorical', categorical_features, categorical_columns)
])

processing

## Generic Methods for Model Preparation & Metric Evaluation

In [None]:
# Disabled PCA explained-variance experiment, kept as a bare string literal so
# nothing inside it executes.
# NOTE(review): it cannot be re-enabled as-is -- it references PCA (only imported
# in a later cell) and contains the IPython magic "% matplotlib inline".
"""
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
data_rescaled = scaler.fit_transform(X_train)
pca = PCA().fit(data_rescaled)

% matplotlib inline
import matplotlib.pyplot as plt
plt.rcParams["figure.figsize"] = (12,6)

fig, ax = plt.subplots()
xi = np.arange(1, 11, step=1)
y = np.cumsum(pca.explained_variance_ratio_)

plt.ylim(0.0,1.1)
plt.plot(xi, y, marker='o', linestyle='--', color='b')

plt.xlabel('Number of Components')
plt.xticks(np.arange(0, 11, step=1)) #change from 0-based array index to 1-based human-readable label
plt.ylabel('Cumulative variance (%)')
plt.title('The number of components needed to explain variance')

plt.axhline(y=0.95, color='r', linestyle='-')
plt.text(0.5, 0.85, '95% cut-off threshold', color = 'red', fontsize=16)

ax.grid(axis='x')
plt.show()
"""

In [None]:
from sklearn.decomposition import PCA
def prepare_model(algorithm, nc=4):
    """Build and fit the preprocessing -> TruncatedSVD -> estimator pipeline.

    Parameters
    ----------
    algorithm : estimator
        Final step of the pipeline.
    nc : int, default=4
        Number of TruncatedSVD components. The default equals the value that
        prepare_model_stacking hard-codes, so callers that omit it (as some
        cells in this notebook do) no longer fail with a TypeError.

    Returns
    -------
    Pipeline
        The pipeline after fitting on the module-level X_train / y_train.
    """
    model = Pipeline(steps=[
        ('processing', processing),
        # Dimensionality reduction; random_state pins the randomized solver.
        ('pca', TruncatedSVD(n_components=nc, random_state=12)),
        ('modeling', algorithm)
    ])
    model.fit(X_train, y_train)
    return model

In [None]:
def prepare_model_stacking(algorithm):
    """Assemble, without fitting, the pipeline used as a stacking base learner.

    The steps are the shared preprocessing transformer, a 4-component
    TruncatedSVD and the supplied estimator. The pipeline is returned
    unfitted; fitting is left to the caller.
    """
    reducer = TruncatedSVD(n_components=4, random_state=12)
    stages = [
        ('processing', processing),
        ('pca', reducer),
        ('modeling', algorithm),
    ]
    return Pipeline(steps=stages)

In [None]:
def prepare_confusion_matrix(algo, model):
    """Print the model label and draw its confusion matrix on the test split.

    Parameters
    ----------
    algo : str
        Label printed above the figure.
    model : fitted estimator
        Must expose ``predict``; it is evaluated on the module-level
        X_test / y_test.
    """
    print(algo)
    pred = model.predict(X_test)
    cm = confusion_matrix(y_test, pred)

    fig, ax = plt.subplots(figsize=(12, 8))
    sns.heatmap(cm, annot=True, fmt='g', ax=ax)

    # Labels and title must be set before plt.show(); once the figure has been
    # rendered, later changes to the axes are no longer displayed.
    ax.set_xlabel('Predicted labels')
    ax.set_ylabel('True labels')
    ax.set_title('Confusion Matrix')
    plt.show()

In [None]:
def prepare_classification_report(algo, model):
    """Print a headed classification report for ``model`` on the test split.

    ``algo`` is the label used in the heading; predictions are made on the
    module-level X_test and compared with the module-level y_test.
    """
    heading = algo + ' Report :'
    print(heading)
    report = classification_report(y_test, model.predict(X_test))
    print(report)

In [None]:
def prepare_roc_curve(algo, model):
    """Print the model label and plot its ROC curve on the test split.

    Parameters
    ----------
    algo : str
        Label printed above the figure.
    model : fitted binary classifier
        Scored on the module-level X_test against the module-level y_test.
        If the model exposes ``predict_proba`` the second probability column
        is used as the score; otherwise ``decision_function`` is used. The
        fallback is needed because an SVC built without ``probability=True``
        (which is how this notebook builds its models) has no
        ``predict_proba``.
    """
    print(algo)
    if hasattr(model, "predict_proba"):
        y_score = model.predict_proba(X_test)[:, 1]
    else:
        # ROC only needs a ranking score, so signed margins work as well.
        y_score = model.decision_function(X_test)

    fpr, tpr, thresholds = roc_curve(y_test, y_score)
    roc_auc = auc(fpr, tpr)
    curve = RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc)
    curve.plot()
    plt.show()

## Model Preparation

In [None]:
# Shared hyper-parameters, read at call time by the custom kernel functions below.
# gamma_val plays the role of sigma in the kernel formulas.
gamma_val = 6
# nc is passed to prepare_model as the number of SVD components and also scales some kernels.
nc = 4
#Euclidean distance
def multiple_sigma(_x1, _x2):
    """Kernel matrix exp(-||x - y||^2 / g) with g = 1 / (nc * gamma_val**2).

    Expects 2-D arrays of shape (n, d) and (m, d) and returns an (n, m)
    array. ``nc`` and ``gamma_val`` are read from module scope.

    NOTE(review): dividing by ``g`` is the same as multiplying the squared
    distance by nc * gamma_val**2 -- confirm this is the intended bandwidth.
    """
    pairwise = np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2)
    sq_dist = np.square(pairwise)
    scale = 1 / (nc * gamma_val * gamma_val)
    return np.exp(-sq_dist / scale)

def multiple_sigma_nc_manhattan(_x1, _x2):
    """Kernel matrix exp(-||x - y|| / g) with g = 1 / (nc * gamma_val**2).

    Same as ``multiple_sigma`` but with the unsquared distance. Expects 2-D
    arrays of shape (n, d) and (m, d); returns an (n, m) array.

    NOTE(review): despite the name, the distance is the Euclidean (L2) norm
    -- np.linalg.norm's default -- not the Manhattan (L1) norm; confirm.
    """
    distance = np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2)
    scale = 1 / (nc * gamma_val * gamma_val)
    return np.exp(-distance / scale)

def multiple_distance(_x1, _x2):
    """Kernel matrix exp(-4 * ||x - y||^2 / g) with g = 1 / gamma_val**2.

    Expects 2-D arrays of shape (n, d) and (m, d); returns an (n, m) array.
    ``gamma_val`` is read from module scope.

    NOTE(review): the factor 4 is hard-coded; it presumably mirrors the
    number of SVD components (nc) -- confirm.
    """
    sq_dist = np.square(np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2))
    scale = 1 / (gamma_val * gamma_val)
    weighted = 4 * sq_dist
    return np.exp(-weighted / scale)

def multiple_distance_manhattan(_x1, _x2):
    """Kernel matrix exp(-4 * ||x - y|| / g) with g = 1 / gamma_val**2.

    Unsquared-distance variant of ``multiple_distance``. Expects 2-D arrays
    of shape (n, d) and (m, d); returns an (n, m) array.

    NOTE(review): despite the name, the distance is the Euclidean (L2) norm,
    not the Manhattan (L1) norm; confirm.
    """
    distance = np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2)
    scale = 1 / (gamma_val * gamma_val)
    weighted = 4 * distance
    return np.exp(-weighted / scale)
def k_gaussian(_x1, _x2):
    """Fixed-bandwidth Gaussian kernel matrix exp(-||x - y||^2 / 2).

    Expects 2-D arrays of shape (n, d) and (m, d); returns an (n, m) array.
    """
    separation = np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2)
    return np.exp(-np.square(separation) / 2)


def rbf_gaussian(_x1, _x2):
    """Gaussian RBF kernel matrix exp(-||x - y||^2 / (2 * gamma_val**2)).

    Expects 2-D arrays of shape (n, d) and (m, d); returns an (n, m) array.
    ``gamma_val`` (sigma) is read from module scope.
    """
    sq_dist = np.square(np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2))
    coefficient = 1 / (2 * np.square(gamma_val))
    return np.exp(-sq_dist * coefficient)

def lrbf_gaussian(_x1, _x2):
    """Laplacian-style kernel matrix exp(-||x - y|| / gamma_val).

    Expects 2-D arrays of shape (n, d) and (m, d); returns an (n, m) array.
    ``gamma_val`` is read from module scope.
    """
    sq_dist = np.square(np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2))
    distance = np.sqrt(sq_dist)
    inverse_width = 1 / gamma_val
    return np.exp(-distance * inverse_width)

# exp(-||x-y||^2 / (2 * sigma^(1/nc)))
# NOTE(review): an earlier comment here described a "Manhattan distance
# ||x-y|| ^ (1/nc)"; the code below uses the squared Euclidean norm -- confirm.
def lrbf_gaussian_modified(_x1, _x2):
    """Kernel matrix exp(-||x - y||^2 / (2 * gamma_val**(1/nc))).

    Expects 2-D arrays of shape (n, d) and (m, d); returns an (n, m) array.
    ``gamma_val`` and ``nc`` are read from module scope.
    """
    sq_dist = np.square(np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2))
    coefficient = 1 / (2 * (gamma_val ** (1 / nc)))
    return np.exp(-sq_dist * coefficient)

def grbf_lrbf_gaussian(_x1, _x2):
    """Sum of the Laplacian-style and Gaussian RBF kernel matrices.

    Computes exp(-||x - y|| / gamma_val) + exp(-||x - y||^2 / (2 * gamma_val**2)).
    Expects 2-D arrays of shape (n, d) and (m, d); returns an (n, m) array.
    ``gamma_val`` is read from module scope.
    """
    sq_dist = np.square(np.linalg.norm(_x1[:, np.newaxis] - _x2, axis=2))
    laplace_coef = 1 / gamma_val
    gauss_coef = 1 / (2 * np.square(gamma_val))

    laplace_part = np.exp(-np.sqrt(sq_dist) * laplace_coef)
    gauss_part = np.exp(-sq_dist * gauss_coef)
    return laplace_part + gauss_part

def poly_kernel_fn(X, Y):
    """Degree-4 polynomial kernel plus a Laplacian-style term.

    Computes (g * X @ Y.T)**4 + exp(-||x - y|| * g) with
    g = 1 / (gamma_val * nc); ``gamma_val`` and ``nc`` are read from module
    scope. Expects 2-D arrays of shape (n, d) and (m, d); returns an (n, m)
    array.
    """
    # Pairwise Euclidean distances, used directly by the Laplacian term
    # (the previous square-then-sqrt round trip was a no-op, and the
    # zero-filled matrix that used to be allocated here was never read).
    distance = np.linalg.norm(X[:, np.newaxis] - Y, axis=2)
    gamma = 1 / (gamma_val * nc)
    return (gamma * X.dot(Y.T)) ** 4 + np.exp(-distance * gamma)


In [None]:
# Experiment log for the custom kernels.
# Reference: https://data-flair.training/blogs/svm-kernel-functions/
# NOTE(review): the figures below look like accuracy percentages and `n` looks
# like the number of SVD components used for each group of runs -- confirm.
n=3
#83.38 => Linear+ RBF
#83.57 => Gaussian + RBF
#84.15 => Laplace with squared numerator + RBF
#85.13 => Laplace + RBF

n=4
#85.4 => k_gaussian
#86.2 => Laplace + RBF
#85.03 => np.exp(- np.sqrt(normsq) * normsq/ 2) in rbf_gaussian
#84.45 => norm squared


In [None]:

# (label, estimator) pairs: one SVC per custom kernel, plus the built-in rbf
# and poly kernels for comparison.
# NOTE(review): 'SVM-originalrbf' passes gamma_val straight through as sklearn's
# gamma, whereas the custom kernels treat gamma_val as sigma -- confirm intended.
algorithms = [
    ('SVM-multiple-sigma', svm.SVC(kernel=multiple_sigma)),
    ('SVM-multiple-sigma-Manhattan', svm.SVC(kernel=multiple_sigma_nc_manhattan)),
    ('SVM-multiple-distance', svm.SVC(kernel=multiple_distance)),
    ('SVM-multiple-distance-manhattan', svm.SVC(kernel=multiple_distance_manhattan)),
    ('SVM-Gaussian', svm.SVC(kernel=k_gaussian)),
    ('SVM-grbf', svm.SVC(kernel=rbf_gaussian)),
    ('SVM-lrbf', svm.SVC(kernel=lrbf_gaussian)),
    ('SVM-grbf_lrbf', svm.SVC(kernel=grbf_lrbf_gaussian)),
    ('SVM-originalrbf', svm.SVC(kernel='rbf', gamma=gamma_val)),
    ('SVM-poly', svm.SVC(kernel=poly_kernel_fn)),
    ('SVM-polyoriginal', svm.SVC(kernel='poly')),
    ('SVM-lrbf-modified', svm.SVC(kernel=lrbf_gaussian_modified)),
]

trained_models = []
model_and_score = {}

# Fit every candidate and record its accuracy on the TRAINING split
# (model.score is called with X_train / y_train).
for label, estimator in algorithms:
    model = prepare_model(estimator, nc)
    train_accuracy = model.score(X_train, y_train) * 100
    model_and_score[label] = str(train_accuracy) + "%"
    trained_models.append((label, model))

In [None]:
# Accuracy per model label, computed on the training split in the cell above.
print(model_and_score)

In [None]:
# Test-split classification report for every fitted model.
for label, fitted_model in trained_models:
    prepare_classification_report(label, fitted_model)
    print("\n")
    

In [None]:
import sklearn
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
# Base learners are passed to the stacker unfitted: StackingClassifier fits its
# own clones of them, so pre-fitting via prepare_model was wasted work (and the
# previous calls omitted prepare_model's `nc` argument, raising TypeError).
models = [
    ('SVM-custom', prepare_model_stacking(svm.SVC(kernel=rbf_gaussian))),
    ('SVM-rbf', prepare_model_stacking(svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo'))),
]
stacking = sklearn.ensemble.StackingClassifier(estimators=models)

# 5-fold stratified cross-validation of the stacked ensemble on the training split.
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=1, random_state=1)
scores = cross_val_score(stacking, X_train, y_train, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')


## Model Evaluation

In [None]:
# Mean cross-validated accuracy of the stacking ensemble, as a percentage.
scores.mean()*100

In [None]:
import plotly.express as px  # for data visualization
import plotly.graph_objects as go # for data visualization
def Plot_3D(X, X_test, y_test, clf, x_col='rating_difference', y_col='turns'):
    """Show test points in 3D together with the classifier's probability surface.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame whose first two columns define the extent of the prediction mesh.
    X_test : pandas.DataFrame
        Test features; columns ``x_col`` and ``y_col`` go on the x and y axes.
    y_test : array-like
        Values plotted on the z axis of the scatter.
    clf : fitted classifier
        Must expose ``predict_proba``; the second probability column is drawn
        as the surface. (``predict`` returns 1-D labels, so the previous
        ``predict(...)[:, 1]`` could not work.)
    x_col, y_col : str
        Names of the two plotted feature columns. The defaults keep the
        column names that used to be hard-coded.
        NOTE(review): those names do not look like columns of the HR dataset
        loaded in this notebook -- pass the right names when calling.
    """
    # Mesh resolution and padding around the observed feature range.
    mesh_size = 5
    margin = 1

    # min()/max() skip NaN by default, so no fillna is needed for the bounds.
    first_feature, second_feature = X.iloc[:, 0], X.iloc[:, 1]
    x_min, x_max = first_feature.min() - margin, first_feature.max() + margin
    y_min, y_max = second_feature.min() - margin, second_feature.max() + margin
    xrange = np.arange(x_min, x_max, mesh_size)
    yrange = np.arange(y_min, y_max, mesh_size)
    xx, yy = np.meshgrid(xrange, yrange)

    # Probability of the second class at every mesh point.
    Z = clf.predict_proba(np.c_[xx.ravel(), yy.ravel()])[:, 1]
    Z = Z.reshape(xx.shape)

    # 3D scatter of the test observations.
    fig = px.scatter_3d(x=X_test[x_col], y=X_test[y_col], z=y_test,
                        opacity=0.8, color_discrete_sequence=['black'])

    # Figure colours.
    fig.update_layout(paper_bgcolor='white',
                      scene=dict(xaxis=dict(backgroundcolor='white',
                                            color='black',
                                            gridcolor='#f0f0f0'),
                                 yaxis=dict(backgroundcolor='white',
                                            color='black',
                                            gridcolor='#f0f0f0'),
                                 zaxis=dict(backgroundcolor='lightgrey',
                                            color='black',
                                            gridcolor='#f0f0f0')))
    # Marker size.
    fig.update_traces(marker=dict(size=1))

    # Prediction surface.
    fig.add_traces(go.Surface(x=xrange, y=yrange, z=Z, name='SVM Prediction',
                              colorscale='RdBu', showscale=False,
                              contours={"z": {"show": True, "start": 0.2, "end": 0.8, "size": 0.05}}))
    fig.show()

In [None]:
# Test-split classification report for every fitted model.
for label, fitted_model in trained_models:
    prepare_classification_report(label, fitted_model)
    print("\n")

In [None]:
def fitting(X, y, C, gamma):
    """Split the data, fit an RBF SVC with probabilities, and print evaluations.

    Parameters
    ----------
    X, y : array-like
        Features and target; split 80/20 with ``random_state=0``.
    C, gamma : float
        Passed through to the SVC.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test, clf)`` where ``clf`` is the
        fitted model. The split names are local to this function and do not
        touch the notebook-level variables of the same name.
    """
    # Create training and testing samples
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

    # Fit the model. The notebook imports the `svm` module, not `SVC` itself,
    # so the class has to be referenced as svm.SVC.
    # Available kernels: {'linear', 'poly', 'rbf', 'sigmoid', 'precomputed'}, default='rbf'
    model = svm.SVC(kernel='rbf', probability=True, C=C, gamma=gamma)
    clf = model.fit(X_train, y_train)

    # Predicted class labels on the training and test samples
    pred_labels_tr = model.predict(X_train)
    pred_labels_te = model.predict(X_test)

    print('----- Evaluation on Test Data -----')
    score_te = model.score(X_test, y_test)
    print('Accuracy Score: ', score_te)
    print(classification_report(y_test, pred_labels_te))
    print('--------------------------------------------------------')

    print('----- Evaluation on Training Data -----')
    score_tr = model.score(X_train, y_train)
    print('Accuracy Score: ', score_tr)
    print(classification_report(y_train, pred_labels_tr))
    print('--------------------------------------------------------')

    # Return relevant data for chart plotting
    return X_train, X_test, y_train, y_test, clf

In [None]:
# Confusion matrix on the test split for every fitted model.
for label, fitted_model in trained_models:
    prepare_confusion_matrix(label, fitted_model)

In [None]:
# Encode the test labels as integers so roc_curve can treat them as a binary target.
# NOTE(review): this overwrites the notebook-level y_test. Cells that compare
# y_test with the models' predictions (classification report, confusion matrix)
# presumably need the original labels, so they must run before this cell -- confirm.
encoder = LabelEncoder()
y_test = encoder.fit_transform(y_test)

# ROC curve on the test split for every fitted model.
for label, fitted_model in trained_models:
    prepare_roc_curve(label, fitted_model)

In [None]:
# Disabled earlier experiment (a wider set of classifiers), kept as a bare
# string literal so none of it runs.
# NOTE(review): the loop inside calls prepare_model(tup[1]) with a single
# argument -- check that against prepare_model's signature before re-enabling.
"""
algorithms = [('bagging classifier', BaggingClassifier()), 
              ('KNN classifier', KNeighborsClassifier()), 
              ('Random Forest calssifier', RandomForestClassifier()), 
              ('Adaboost classifier', AdaBoostClassifier()), 
              ('Gradientboot classifier',GradientBoostingClassifier()),
              ('MLP', MLPClassifier()),
              ('SVM-Linear', svm.SVC(kernel='linear', C=1, decision_function_shape='ovo')),
              ('SVM-rbf', svm.SVC(kernel='rbf', gamma=1, C=1, decision_function_shape='ovo')),
              ('SVM-poly', svm.SVC(kernel='poly', degree=3, C=1, decision_function_shape='ovo')),
              ('SVM-sigmoid', svm.SVC(kernel='sigmoid', C=1, decision_function_shape='ovo')),
              ('SVM-custom', svm.SVC(kernel=rbf_gaussian)
              )
             ]

trained_models = []
model_and_score = {}

for index, tup in enumerate(algorithms):
    model = prepare_model(tup[1])
    model_and_score[tup[0]] = str(model.score(X_train,y_train)*100)+"%"
    trained_models.append((tup[0],model))
"""