In [9]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, r2_score, f1_score
from sklearn.linear_model import LogisticRegression
from tqdm import tqdm
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.pipeline import Pipeline

In [3]:
df = pd.read_csv('Kyphosis.csv')

In [5]:
df

Unnamed: 0,Kyphosis,Age,Number,Start
0,absent,71,3,5
1,absent,158,3,14
2,present,128,4,5
3,absent,2,5,1
4,absent,1,4,15
...,...,...,...,...
76,present,157,3,13
77,absent,26,7,13
78,absent,120,2,13
79,present,42,7,6


In [7]:
X , y = df.drop('Kyphosis', axis = 1) , df['Kyphosis']

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=24, stratify=y)

In [15]:
lr = LogisticRegression()
lr.fit(X_train,y_train)
y_pred = lr.predict(X_test)
print("Accuracy is " , accuracy_score(y_test,y_pred))

Accuracy is  0.76


# Optimization

In [40]:
from tqdm import tqdm

scores = []
penalties = ['l2','l1' , 'elasticnet' , None]
l1 = np.linspace(0.001,1,5)
for i in tqdm(range(len(penalties))):
    for l in l1:
        lr =LogisticRegression(penalty=penalties[i] , solver= 'saga' , l1_ratio=l)
        lr.fit(X_train,y_train)
        y_pred = lr.predict(X_test)
        scores.append([penalties[i] , l ,accuracy_score(y_test,y_pred)])
df1 = pd.DataFrame(scores , columns=['C', 'degree', 'F1'])
df1.sort_values('F1',ascending=False ).iloc[0]  

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<00:00, 70.32it/s]


C            l2
degree    0.001
F1          0.8
Name: 0, dtype: object

In [36]:
df1

Unnamed: 0,C,degree,F1
0,l2,0.001,0.8
1,l2,0.25075,0.8
2,l2,0.5005,0.8
3,l2,0.75025,0.8
4,l2,1.0,0.8
5,l1,0.001,0.8
6,l1,0.25075,0.8
7,l1,0.5005,0.8
8,l1,0.75025,0.8
9,l1,1.0,0.8


# Human Resources Dataset

In [53]:
df = pd.read_csv('HR_comma_sep.csv')
df

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,left,promotion_last_5years,Department,salary
0,0.38,0.53,2,157,3,0,1,0,sales,low
1,0.80,0.86,5,262,6,0,1,0,sales,medium
2,0.10,0.77,6,247,4,0,1,0,sales,low
3,0.92,0.85,5,259,5,0,1,0,sales,low
4,0.89,1.00,5,224,5,0,1,0,sales,low
...,...,...,...,...,...,...,...,...,...,...
14990,0.40,0.57,2,151,3,0,1,0,support,low
14991,0.37,0.48,2,160,3,0,1,0,support,low
14992,0.37,0.53,2,143,3,0,1,0,support,low
14993,0.11,0.96,6,280,4,0,1,0,support,low


In [55]:
X , y = df.drop('left', axis = 1) , df['left']

In [71]:
X_dumm = pd.get_dummies(X)
X_dumm

Unnamed: 0,satisfaction_level,last_evaluation,number_project,average_montly_hours,time_spend_company,Work_accident,promotion_last_5years,Department_IT,Department_RandD,Department_accounting,Department_hr,Department_management,Department_marketing,Department_product_mng,Department_sales,Department_support,Department_technical,salary_high,salary_low,salary_medium
0,0.38,0.53,2,157,3,0,0,False,False,False,False,False,False,False,True,False,False,False,True,False
1,0.80,0.86,5,262,6,0,0,False,False,False,False,False,False,False,True,False,False,False,False,True
2,0.10,0.77,6,247,4,0,0,False,False,False,False,False,False,False,True,False,False,False,True,False
3,0.92,0.85,5,259,5,0,0,False,False,False,False,False,False,False,True,False,False,False,True,False
4,0.89,1.00,5,224,5,0,0,False,False,False,False,False,False,False,True,False,False,False,True,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14990,0.40,0.57,2,151,3,0,0,False,False,False,False,False,False,False,False,True,False,False,True,False
14991,0.37,0.48,2,160,3,0,0,False,False,False,False,False,False,False,False,True,False,False,True,False
14992,0.37,0.53,2,143,3,0,0,False,False,False,False,False,False,False,False,True,False,False,True,False
14993,0.11,0.96,6,280,4,0,0,False,False,False,False,False,False,False,False,True,False,False,True,False


In [59]:
X_train, X_test, y_train, y_test = train_test_split(X_dumm, y, test_size=0.3, random_state=24, stratify=y)

In [69]:
from tqdm import tqdm
# scaler = StandardScaler()
scores = []
penalties = ['l2','l1' , 'elasticnet' , None]
l1 = np.linspace(0.001,1,5)
for i in tqdm(range(len(penalties))):
    for l in l1:
        lr =LogisticRegression(penalty=penalties[i] , solver= 'saga' , l1_ratio=l)
        # pipe = Pipeline([('scl', scaler) , ('LR' , lr)])
        # pipe.fit(X_train,y_train)
        lr.fit(X_train,y_train)
        y_pred = lr.predict(X_test)
        scores.append([penalties[i] , l ,accuracy_score(y_test,y_pred)])
df1 = pd.DataFrame(scores , columns=['penalties', 'l1', 'accuracy'])
df1.sort_values('accuracy',ascending=False ).iloc[0]  

100%|████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:06<00:00,  1.56s/it]


penalties          l2
l1              0.001
accuracy     0.759058
Name: 0, dtype: object