<a href="https://colab.research.google.com/github/Rok-sana/ML-course/blob/main/Homework_7.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.linear_model import (LogisticRegression,
                                  LinearRegression)
from sklearn.model_selection import (train_test_split,
                                     KFold)

from sklearn.metrics import (mean_absolute_error,
                             mean_squared_error,
                             r2_score,
                             accuracy_score,
                             f1_score)

from sklearn import preprocessing

from sklearn.preprocessing import(MinMaxScaler,
                                  StandardScaler,
                                  LabelEncoder,
                                  OneHotEncoder)


from category_encoders.target_encoder import TargetEncoder
from category_encoders.m_estimate import MEstimateEncoder
from category_encoders.ordinal import OrdinalEncoder
from category_encoders.cat_boost import CatBoostEncoder
from category_encoders.james_stein import JamesSteinEncoder
from category_encoders.leave_one_out import LeaveOneOutEncoder


In [None]:
#!pip install category_encoders

In [None]:
# Names of the six input columns; the remaining column, 'class', is the target.
features = [
    'buying',
    'maint',
    'doors',
    'persons',
    'lug_boot',
    'safety',
]

In [None]:
# car.data is read with header=None (no header row is expected), so the column
# names are assigned afterwards. They are derived from `features` so the list
# of input columns is spelled out in one place only; 'class' is the target.
df = pd.read_csv('car.data', header=None)
df.columns = features + ['class']

In [None]:
# Preview the first rows to check that the column names were applied.
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [None]:
# Check the column dtypes before any encoding is applied.
df.dtypes

buying      object
maint       object
doors       object
persons     object
lug_boot    object
safety      object
class       object
dtype: object

In [None]:
def one_hot_encoder(data, feature, keep_first=True):
    """Replace one categorical column with one-hot indicator columns.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``feature``. It is not modified.
    feature : hashable
        Label of the column to expand.
    keep_first : bool, default True
        If falsy, the indicator of the first category level is omitted.

    Returns
    -------
    pandas.DataFrame
        A new frame holding every column of ``data`` except ``feature``,
        followed by the indicator columns named ``<feature>_<level>``.
    """
    # prefix= formats each level into the column name, so non-string levels
    # (e.g. integers) are handled as well as strings.
    # drop_first= removes the first *indicator* only; slicing the combined
    # frame by position would instead discard an unrelated column of `data`
    # whenever `feature` is not its only column.
    indicators = pd.get_dummies(
        data[feature], prefix=feature, drop_first=not keep_first
    )
    return pd.concat([data.drop(columns=feature), indicators], axis=1)

In [None]:
# Build a copy of df in which only the 'class' labels are replaced by integer
# codes; the fitted encoder stays available as class_label_encod.
# NOTE(review): LabelEncoder numbers labels in sorted order, which need not
# match the quality ordering of the classes - confirm this is acceptable for a
# regression target.
class_label_encod = LabelEncoder()
target_encoded_df = df.assign(
    **{'class': class_label_encod.fit_transform(df['class'])}
)

In [None]:
# Label-encode every object column of a copy of df (features and target).
# A separate encoder is fitted per column and kept in label_encoders, so each
# column's mapping can still be inspected or inverted afterwards; re-fitting a
# single shared encoder would retain only the last column's mapping.
label_encoded_df = df.copy()
label_encoders = {}
for col in label_encoded_df.select_dtypes(include='O').columns:
    le = LabelEncoder()
    label_encoded_df[col] = le.fit_transform(label_encoded_df[col])
    label_encoders[col] = le

In [None]:
# Inspect the frame after label encoding.
label_encoded_df

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,3,3,0,0,2,1,2
1,3,3,0,0,2,2,2
2,3,3,0,0,2,0,2
3,3,3,0,0,1,1,2
4,3,3,0,0,1,2,2
...,...,...,...,...,...,...,...
1723,1,1,3,2,1,2,1
1724,1,1,3,2,1,0,3
1725,1,1,3,2,0,1,2
1726,1,1,3,2,0,2,1


In [None]:
def reg_model(data, target='class', test_size=0.25, random_state=14):
    """Fit a linear regression on a hold-out split and print its scores.

    Parameters
    ----------
    data : pandas.DataFrame
        Numeric frame that contains the ``target`` column; all other columns
        are used as predictors.
    target : str, default 'class'
        Name of the column to predict.
    test_size : float, default 0.25
        Share of rows held out for evaluation.
    random_state : int, default 14
        Seed for the split, so every encoding is scored on the same rows.

    Returns
    -------
    None
        The R-squared and the mean absolute error (labelled "Error") on the
        held-out rows are printed.
    """
    X = data.drop(columns=target)
    y = data[target]

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=random_state, test_size=test_size
    )

    linreg = LinearRegression()
    linreg.fit(X_train, y_train)
    predictions = linreg.predict(X_test)

    separator = '-' * 20
    print(f'R-squared: {r2_score(y_test, predictions)}')
    print(separator)
    print(f'Error: {mean_absolute_error(y_test, predictions)}')
    print(separator)
  

In [None]:
# Baseline: score the label-encoded frame.
reg_model(label_encoded_df)

R-squared: 0.1226020361551019
--------------------
Error: 0.6647011414780535
--------------------


In [None]:
# Target-encode with default settings. The whole frame is passed as X so the
# result still carries 'class', which reg_model reads as the target.
# The target is taken from target_encoded_df, matching the other encoder
# cells (label_encoded_df['class'] holds the same codes).
# NOTE(review): the encoder is fitted on all rows, before reg_model makes its
# train/test split, so the held-out rows' targets contribute to the encodings.
TE_encoder = TargetEncoder()
df_te = TE_encoder.fit_transform(target_encoded_df.copy(), target_encoded_df['class'])

In [None]:
# Preview the target-encoded frame.
df_te.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,class
0,1.666667,1.666667,1.613426,2.0,1.598958,2.0,2
1,1.666667,1.666667,1.613426,2.0,1.598958,1.307292,2
2,1.666667,1.666667,1.613426,2.0,1.598958,1.352431,2
3,1.666667,1.666667,1.613426,2.0,1.532986,2.0,2
4,1.666667,1.666667,1.613426,2.0,1.532986,1.307292,2


In [None]:
# Score the target-encoded frame.
reg_model(df_te)

R-squared: 0.295969821817889
--------------------
Error: 0.6049334378680982
--------------------


In [None]:
# Start from the raw feature columns only and expand them one at a time into
# indicator columns (all levels kept, since keep_first is left at its default).
df_one_hot_two = df[features].copy()
for col in features:
    df_one_hot_two = one_hot_encoder(df_one_hot_two, col)

In [None]:
# Attach the integer-coded target so the frame has the 'class' column that reg_model reads.
df_one_hot_two['class'] = target_encoded_df['class'].copy()

In [None]:
# Score the one-hot-encoded frame.
reg_model(df_one_hot_two)

R-squared: 0.27087219386754213
--------------------
Error: 0.6124855324074074
--------------------


In [None]:
# Encode with MEstimateEncoder at its default settings. X is the whole frame,
# so the result keeps the 'class' column that reg_model reads as the target.
# NOTE(review): the encoder is fitted on all rows, before reg_model makes its
# train/test split, so the held-out rows' targets contribute to the encodings.
MEE_encoder = MEstimateEncoder() 
df_mee = MEE_encoder.fit_transform(target_encoded_df, target_encoded_df['class'])

In [None]:
# Score the M-estimate-encoded frame.
reg_model(df_mee)

R-squared: 0.2959698218178889
--------------------
Error: 0.6049334378680977
--------------------


In [None]:
# Encode with JamesSteinEncoder at its default settings. X is the whole frame,
# so the result keeps the 'class' column that reg_model reads as the target.
# NOTE(review): the encoder is fitted on all rows, before reg_model makes its
# train/test split, so the held-out rows' targets contribute to the encodings.
JSE_encoder = JamesSteinEncoder()
df_jse = JSE_encoder.fit_transform(target_encoded_df, target_encoded_df['class'])

In [None]:
# Score the James-Stein-encoded frame.
reg_model(df_jse)

R-squared: 0.2959713057755148
--------------------
Error: 0.6049217585405423
--------------------


In [None]:
# Encode with LeaveOneOutEncoder at its default settings. X is the whole frame,
# so the result keeps the 'class' column that reg_model reads as the target.
# NOTE(review): the encoder is fitted on all rows, before reg_model makes its
# train/test split, so the held-out rows' targets contribute to the encodings.
LOOE_encoder = LeaveOneOutEncoder()
df_looe = LOOE_encoder.fit_transform(target_encoded_df, target_encoded_df['class'])

In [None]:
# Score the leave-one-out-encoded frame.
reg_model(df_looe)

R-squared: 0.2761764110284445
--------------------
Error: 0.6177394307749482
--------------------


In [None]:
# Encode with CatBoostEncoder at its default settings. X is the whole frame,
# so the result keeps the 'class' column that reg_model reads as the target.
# NOTE(review): the encoder is fitted on all rows, before reg_model makes its
# train/test split, so the held-out rows' targets contribute to the encodings.
# NOTE(review): presumably the encodings depend on row order - confirm against
# the category_encoders documentation.
CBE_encoder = CatBoostEncoder()
df_cbe = CBE_encoder.fit_transform(target_encoded_df, target_encoded_df['class'])

In [None]:
# Score the CatBoost-encoded frame.
reg_model(df_cbe)

R-squared: 0.29014093516556705
--------------------
Error: 0.5853782111347352
--------------------


Judged by mean absolute error, CatBoostEncoder gives the best result (MAE ≈ 0.585), but JamesSteinEncoder, MEstimateEncoder, and TargetEncoder achieve a slightly higher R-squared (≈ 0.296 versus ≈ 0.290 for CatBoostEncoder).