## Handling imbalanced data ---
### 1. Under sampling
### 2. Over sampling
### 3. Ensembling Technique
### 4. SMOTE (Synthetic Minority Over-sampling Technique)

In [1]:
import numpy as np
import pandas as pd

In [2]:
# Load the dataset from a CSV in the working directory and preview the first rows.
df = pd.read_csv('car_evaluation.csv')
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,vhigh,vhigh,2,2,small,low,unacc
1,vhigh,vhigh,2,2,small,med,unacc
2,vhigh,vhigh,2,2,small,high,unacc
3,vhigh,vhigh,2,2,med,low,unacc
4,vhigh,vhigh,2,2,med,med,unacc


In [3]:
# Dimensions of the loaded frame as (rows, columns).
df.shape

(1728, 7)

In [4]:
# Per-column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1728 entries, 0 to 1727
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   buying    1728 non-null   object
 1   maint     1728 non-null   object
 2   doors     1728 non-null   int64 
 3   persons   1728 non-null   int64 
 4   lug_boot  1728 non-null   object
 5   safety    1728 non-null   object
 6   outcome   1728 non-null   object
dtypes: int64(2), object(5)
memory usage: 94.6+ KB


In [5]:
# Flag every column that holds at least one missing value.
df.isna().any()

buying      False
maint       False
doors       False
persons     False
lug_boot    False
safety      False
outcome     False
dtype: bool

In [6]:
# Class distribution of the target column; shows how imbalanced the labels are.
df['outcome'].value_counts()

unacc    1210
acc       384
good       69
vgood      65
Name: outcome, dtype: int64

In [7]:
from sklearn.preprocessing import LabelEncoder

In [8]:
# Encoder that maps string categories to integer codes.
le = LabelEncoder()

In [9]:
# Label-encode each categorical feature with its own encoder and keep the
# fitted encoders, so every column's category <-> integer mapping can be
# looked up or inverted later. Refitting one shared encoder would only
# remember the last column it was fitted on.
# NOTE(review): the codes follow the sorted category labels, not the natural
# low < med < high ordering; a distance-based model sees that ordering —
# confirm this is intended.
encoders = {}
for col in ['buying', 'maint', 'lug_boot', 'safety']:
    le = LabelEncoder()
    df[col] = le.fit_transform(df[col])
    encoders[col] = le

In [10]:
# Preview the frame after encoding the categorical feature columns.
df.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety,outcome
0,3,3,2,2,2,1,unacc
1,3,3,2,2,2,2,unacc
2,3,3,2,2,2,0,unacc
3,3,3,2,2,1,1,unacc
4,3,3,2,2,1,2,unacc


In [11]:
# Feature matrix: every column except the last one (the 'outcome' target).
x = df.iloc[:,:-1]
x.head()

Unnamed: 0,buying,maint,doors,persons,lug_boot,safety
0,3,3,2,2,2,1
1,3,3,2,2,2,2
2,3,3,2,2,2,0
3,3,3,2,2,1,1
4,3,3,2,2,1,2


In [12]:
# Target vector: the 'outcome' class label, kept as strings.
y = df['outcome']
y.head()

0    unacc
1    unacc
2    unacc
3    unacc
4    unacc
Name: outcome, dtype: object

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# Hold out 30% of the rows for testing; random_state pins the shuffle so the
# split is repeatable.
# NOTE(review): the split is not stratified, so the rare classes may not be
# represented proportionally in train and test — consider stratify=y.
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.3,random_state=47)

In [15]:
# Shapes of the training features and the training labels.
(x_train.shape, y_train.shape)

((1209, 6), (1209,))

In [16]:
# Shapes of the held-out test features and test labels.
(x_test.shape, y_test.shape)

((519, 6), (519,))

# Handling imbalanced data

In [17]:
# Class counts in the training split only.
y_train.value_counts()

unacc    848
acc      264
good      50
vgood     47
Name: outcome, dtype: int64

# Before balancing ---

### Model Building

In [19]:
from sklearn.neighbors import KNeighborsClassifier

In [20]:
# Baseline k-nearest-neighbours classifier with default hyperparameters.
model = KNeighborsClassifier()

In [21]:
# Fit the baseline on the original (imbalanced) training split.
model.fit(x_train,y_train)

KNeighborsClassifier()

In [24]:
# Baseline predictions on the held-out test split.
y_pred = model.predict(x_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [25]:
# Baseline predictions on the training split, to compare train vs. test accuracy.
y_pred_train = model.predict(x_train)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


### Model evaluation

In [26]:
from sklearn.metrics import accuracy_score,classification_report

In [28]:
# Test accuracy of the baseline model.
print(accuracy_score(y_test,y_pred))

0.9017341040462428


In [31]:
# Training accuracy of the baseline model.
print(accuracy_score(y_train,y_pred_train))

0.9561621174524401


In [29]:
# Confusion table: rows are the actual labels (y_test), columns are the predicted labels.
pd.crosstab(y_test,y_pred)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,100,1,19,0
good,9,9,0,1
unacc,13,0,349,0
vgood,8,0,0,10


In [33]:
# Per-class precision, recall and F1 for the baseline model on the test split.
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         acc       0.77      0.83      0.80       120
        good       0.90      0.47      0.62        19
       unacc       0.95      0.96      0.96       362
       vgood       0.91      0.56      0.69        18

    accuracy                           0.90       519
   macro avg       0.88      0.71      0.77       519
weighted avg       0.90      0.90      0.90       519



In [34]:
# Recall of 'good' = correctly predicted 'good' rows / all actual 'good' rows.
# Computed from y_test and y_pred rather than from counts hand-copied out of
# the confusion table, so the value stays correct when the cells are re-run.
is_good = (y_test == 'good').to_numpy()
(y_pred[is_good] == 'good').sum() / is_good.sum()

0.47368421052631576

In [35]:
# Recall of 'vgood' = correctly predicted 'vgood' rows / all actual 'vgood' rows.
# Computed from y_test and y_pred rather than from counts hand-copied out of
# the confusion table, so the value stays correct when the cells are re-run.
is_vgood = (y_test == 'vgood').to_numpy()
(y_pred[is_vgood] == 'vgood').sum() / is_vgood.sum()

0.5555555555555556

In [57]:
# Random data testing
# Values are given in the training column order (buying, maint, doors,
# persons, lug_boot, safety) using the encoded integer codes. Wrapping the row
# in a DataFrame with the training column names keeps the feature names
# consistent with what the model was fitted on.
# NOTE(review): doors=1 looks outside the values seen in training — confirm.
sample_row = pd.DataFrame([[1,2,1,2,2,1]], columns=x_train.columns)
model.predict(sample_row)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array(['unacc'], dtype=object)

In [58]:
# Ad-hoc prediction for one hand-written row, in the training column order
# (buying, maint, doors, persons, lug_boot, safety). The row is wrapped in a
# DataFrame so its feature names match those the model was fitted on.
sample_row = pd.DataFrame([[1,1,5,6,0,2]], columns=x_train.columns)
model.predict(sample_row)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array(['good'], dtype=object)

In [59]:
# Ad-hoc prediction for one hand-written row, in the training column order
# (buying, maint, doors, persons, lug_boot, safety). The row is wrapped in a
# DataFrame so its feature names match those the model was fitted on.
# NOTE(review): persons=1 and lug_boot=4 look outside the encoded values seen
# in training — confirm these inputs are meaningful.
sample_row = pd.DataFrame([[1,3,3,1,4,0]], columns=x_train.columns)
model.predict(sample_row)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


array(['unacc'], dtype=object)

# After Balancing ---

In [37]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.11.0-py3-none-any.whl (235 kB)
     -------------------------------------- 235.6/235.6 kB 2.4 MB/s eta 0:00:00
Collecting joblib>=1.1.1
  Downloading joblib-1.3.2-py3-none-any.whl (302 kB)
     -------------------------------------- 302.2/302.2 kB 6.3 MB/s eta 0:00:00
Installing collected packages: joblib, imbalanced-learn, imblearn
  Attempting uninstall: joblib
    Found existing installation: joblib 1.1.0
    Uninstalling joblib-1.1.0:
      Successfully uninstalled joblib-1.1.0
Successfully installed imbalanced-learn-0.11.0 imblearn-0.0 joblib-1.3.2
Note: you may need to restart the kernel to use updated packages.


In [38]:
from imblearn.over_sampling import SMOTE

In [39]:
# SMOTE draws synthetic minority samples at random; fix the seed (same value as
# the train/test split) so the resampled data and every metric derived from it
# are reproducible across runs.
smote = SMOTE(random_state=47)

In [40]:
# Oversample only the training split; the test split is left untouched so it
# still reflects the original class distribution.
x_train_smote,y_train_smote = smote.fit_resample(x_train,y_train)

In [41]:
# Training class counts before resampling.
y_train.value_counts()

unacc    848
acc      264
good      50
vgood     47
Name: outcome, dtype: int64

In [42]:
# Training class counts after SMOTE resampling.
y_train_smote.value_counts()

unacc    848
acc      848
good     848
vgood    848
Name: outcome, dtype: int64

In [43]:
# Fresh k-nearest-neighbours classifier (default hyperparameters) for the rebalanced data.
model1 = KNeighborsClassifier()

In [44]:
# Fit on the SMOTE-resampled training data.
model1.fit(x_train_smote,y_train_smote)

KNeighborsClassifier()

In [46]:
# Predictions on the original, un-resampled test split.
y_pred1 = model1.predict(x_test)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [50]:
# Predictions on the resampled training data the model was fitted on.
y_pred1_train = model1.predict(x_train_smote)

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [47]:
# Test accuracy of the model trained on resampled data.
print(accuracy_score(y_test,y_pred1))

0.8439306358381503


In [51]:
# Training accuracy, measured on the resampled training data (not the original split).
print(accuracy_score(y_train_smote,y_pred1_train))

0.972877358490566


In [52]:
# Confusion table: rows are the actual labels (y_test), columns are the predicted labels.
pd.crosstab(y_test,y_pred1)

col_0,acc,good,unacc,vgood
outcome,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
acc,101,8,10,1
good,6,12,1,0
unacc,44,5,313,0
vgood,6,0,0,12


In [56]:
# Per-class precision, recall and F1 for the model trained on resampled data.
print(classification_report(y_test,y_pred1))

              precision    recall  f1-score   support

         acc       0.64      0.84      0.73       120
        good       0.48      0.63      0.55        19
       unacc       0.97      0.86      0.91       362
       vgood       0.92      0.67      0.77        18

    accuracy                           0.84       519
   macro avg       0.75      0.75      0.74       519
weighted avg       0.87      0.84      0.85       519



In [53]:
# Recall of 'good' after balancing = correctly predicted 'good' rows / all
# actual 'good' rows. Computed from y_test and y_pred1 rather than from counts
# hand-copied out of the confusion table, so the value stays correct when the
# resampling or model is re-run.
is_good = (y_test == 'good').to_numpy()
(y_pred1[is_good] == 'good').sum() / is_good.sum()

0.631578947368421

In [54]:
# Recall of 'vgood' after balancing = correctly predicted 'vgood' rows / all
# actual 'vgood' rows. Computed from y_test and y_pred1 rather than from counts
# hand-copied out of the confusion table, so the value stays correct when the
# resampling or model is re-run.
is_vgood = (y_test == 'vgood').to_numpy()
(y_pred1[is_vgood] == 'vgood').sum() / is_vgood.sum()

0.6666666666666666

# Hyper Parameter Tuning
## 1. Grid Search
## 2. Randomized Search