###  Exercise project 3 – Support Vector Machines. This notebook builds an SVM classification model. It uses the dataset from Project_2: the HR Employee Attrition dataset (https://www.kaggle.com/datasets/saurabhbadole/hr-employee-attrition). The target variable is the "Attrition" column (it shows how likely someone is to quit). The cleaning/optimisation phase is copied from Project_1, so you can scroll straight down to "scale of  the regression target for SVC".

In [37]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# pip install scikit-learn
from sklearn.model_selection import train_test_split
from sklearn import metrics, svm
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.pipeline import make_pipeline
from sklearn.calibration import CalibratedClassifierCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler

In [38]:
# Read the HR attrition data from the CSV file in the working directory
df = pd.read_csv(filepath_or_buffer="HR-Employee-Attrition.csv")

# Show the first five rows as a quick sanity check
df.head(n=5)

Unnamed: 0,Age,Attrition,BusinessTravel,DailyRate,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeNumber,...,RelationshipSatisfaction,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,WorkLifeBalance,YearsAtCompany,YearsInCurrentRole,YearsSinceLastPromotion,YearsWithCurrManager
0,41,Yes,Travel_Rarely,1102,Sales,1,2,Life Sciences,1,1,...,1,80,0,8,0,1,6,4,0,5
1,49,No,Travel_Frequently,279,Research & Development,8,1,Life Sciences,1,2,...,4,80,1,10,3,3,10,7,1,7
2,37,Yes,Travel_Rarely,1373,Research & Development,2,2,Other,1,4,...,2,80,0,7,3,3,0,0,0,0
3,33,No,Travel_Frequently,1392,Research & Development,3,4,Life Sciences,1,5,...,3,80,0,8,3,3,8,7,3,0
4,27,No,Travel_Rarely,591,Research & Development,2,1,Medical,1,7,...,4,80,1,6,3,3,2,2,2,2


In [39]:
# Display the column labels of the loaded DataFrame
df.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'DailyRate', 'Department',
       'DistanceFromHome', 'Education', 'EducationField', 'EmployeeCount',
       'EmployeeNumber', 'EnvironmentSatisfaction', 'Gender', 'HourlyRate',
       'JobInvolvement', 'JobLevel', 'JobRole', 'JobSatisfaction',
       'MaritalStatus', 'MonthlyIncome', 'MonthlyRate', 'NumCompaniesWorked',
       'Over18', 'OverTime', 'PercentSalaryHike', 'PerformanceRating',
       'RelationshipSatisfaction', 'StandardHours', 'StockOptionLevel',
       'TotalWorkingYears', 'TrainingTimesLastYear', 'WorkLifeBalance',
       'YearsAtCompany', 'YearsInCurrentRole', 'YearsSinceLastPromotion',
       'YearsWithCurrManager'],
      dtype='object')

In [40]:
# Columns excluded from the model, removed in a single drop call below
columns_to_drop = [
    "Over18", "EmployeeCount", "EmployeeNumber",
    "Department", "EducationField", "Education",
    "MaritalStatus", "OverTime", "JobRole",
    "StandardHours", "Gender", "HourlyRate",
    "MonthlyRate", "RelationshipSatisfaction", "DistanceFromHome",
    "PerformanceRating", "NumCompaniesWorked", "PercentSalaryHike",
]

# drop() returns a new frame; rebind df to the reduced version
df = df.drop(columns=columns_to_drop)

In [41]:
# One-hot encode the categorical "BusinessTravel" column: every category
# becomes its own integer 0/1 column.
from sklearn.preprocessing import OneHotEncoder

variables = ["BusinessTravel"]

# Dense output + pandas output mode, so the encoder returns a DataFrame
encoder = OneHotEncoder(sparse_output=False)
encoder.set_output(transform="pandas")
one_hot_encoded = encoder.fit_transform(df[variables]).astype(int)

# Append the indicator columns, then remove the original text column
df = pd.concat([df, one_hot_encoded], axis=1)
df = df.drop(columns=variables)

In [42]:
# One-hot columns of a single variable are mutually exclusive, so one of them
# is redundant: rows with 0 in all remaining BusinessTravel_* columns are the
# "Travel_Rarely" ones. Drop that indicator column.
df = df.drop(columns=["BusinessTravel_Travel_Rarely"])

In [43]:
from sklearn.preprocessing import LabelEncoder

# Text-label columns that should be replaced by integer codes
variables = ['Attrition']

# Fit the encoder on each listed column and swap the text for its codes
encoder = LabelEncoder()
df = df.assign(**{name: encoder.fit_transform(df[name]) for name in variables})

### X/y split + VIF test

In [None]:
 # Feature matrix: every column of df other than the target
X = df.drop(columns="Attrition")

# Target vector: the encoded "Attrition" column
y = df["Attrition"]

In [None]:
# Multicollinearity check using the Variance Inflation Factor (VIF)
# pip install statsmodels
from statsmodels.stats.outliers_influence import variance_inflation_factor

# One row per feature: its name and the VIF of that column within X.
# NOTE(review): X is passed as-is, without an added intercept column —
# confirm this is intended before relying on the absolute VIF values.
vif_data = pd.DataFrame(
    {
        "feature": X.columns,
        "VIF": [
            variance_inflation_factor(X.values, column_index)
            for column_index in range(X.shape[1])
        ],
    }
)

# A high VIF suggests the feature is close to a linear combination of the
# other features (multicollinearity), which makes it a candidate for removal.
print(vif_data)

In [None]:
# Features removed following the VIF check
columns_to_drop_after_vif = ["Age", "MonthlyIncome", "WorkLifeBalance", "YearsAtCompany"]

# Remove them from the working frame; X is rebuilt from df in a later cell
df = df.drop(columns=columns_to_drop_after_vif)

#### An explanation of how I cleaned the dataset is in Project_2. At this stage I have the same number of rows and columns, and the dataset is ready for the X/y and train/test split.

In [46]:
# use everything else except, the target
X = df.drop("Attrition", axis=1)

# our target variable is y
y = df['Attrition']

In [None]:
# 