## using Adaboost classifier we will prepare a model on fraud data treating those who have taxable_income <= 30000 as "Risky" and others are "Good"

In [1]:
#libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split 

#Parameter tuning
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score

#adaboost library
from sklearn.ensemble import AdaBoostClassifier

In [2]:
df = pd.read_csv("G:\Github\DS-assignments-python\Adaboost algorithm\Fraud_check.csv")

In [3]:
df.head(10)

Unnamed: 0,Undergrad,Marital.Status,Taxable.Income,City.Population,Work.Experience,Urban
0,NO,Single,68833,50047,10,YES
1,YES,Divorced,33700,134075,18,YES
2,NO,Married,36925,160205,30,YES
3,YES,Single,50190,193264,15,YES
4,NO,Married,81002,27533,28,NO
5,NO,Divorced,33329,116382,0,NO
6,NO,Divorced,83357,80890,8,YES
7,YES,Single,62774,131253,3,YES
8,NO,Single,83519,102481,12,YES
9,YES,Divorced,98152,155482,4,YES


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 600 entries, 0 to 599
Data columns (total 6 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Undergrad        600 non-null    object
 1   Marital.Status   600 non-null    object
 2   Taxable.Income   600 non-null    int64 
 3   City.Population  600 non-null    int64 
 4   Work.Experience  600 non-null    int64 
 5   Urban            600 non-null    object
dtypes: int64(3), object(3)
memory usage: 28.2+ KB


In [5]:
df.nunique()

Undergrad            2
Marital.Status       3
Taxable.Income     599
City.Population    598
Work.Experience     31
Urban                2
dtype: int64

In [6]:
df1 = df.copy()

In [7]:
df1["Taxable.Income"] = pd.Series(np.where(df["Taxable.Income"]<= 30000 ,1,0))

In [8]:
df1.rename(columns = {"Taxable.Income":"Tax_Inc_Risky"}, inplace = True)

Treating 1 as Risky and 0 as Good

In [9]:
df1.head(10)

Unnamed: 0,Undergrad,Marital.Status,Tax_Inc_Risky,City.Population,Work.Experience,Urban
0,NO,Single,0,50047,10,YES
1,YES,Divorced,0,134075,18,YES
2,NO,Married,0,160205,30,YES
3,YES,Single,0,193264,15,YES
4,NO,Married,0,27533,28,NO
5,NO,Divorced,0,116382,0,NO
6,NO,Divorced,0,80890,8,YES
7,YES,Single,0,131253,3,YES
8,NO,Single,0,102481,12,YES
9,YES,Divorced,0,155482,4,YES


In [10]:
df1["Tax_Inc_Risky"].value_counts()

0    476
1    124
Name: Tax_Inc_Risky, dtype: int64

In [11]:
124/600

0.20666666666666667

In [12]:
#Creating dataframes with all numerical and dummies
dfd = pd.get_dummies(data = df1 , columns = ['Undergrad','Marital.Status','Urban'], drop_first = True)

In [13]:
dfd.head()

Unnamed: 0,Tax_Inc_Risky,City.Population,Work.Experience,Undergrad_YES,Marital.Status_Married,Marital.Status_Single,Urban_YES
0,0,50047,10,0,0,1,1
1,0,134075,18,1,0,0,1
2,0,160205,30,0,1,0,1
3,0,193264,15,1,0,1,1
4,0,27533,28,0,1,0,0


#### so the data is nicely unbalance..
now adaboost is a algorithm which counters the imbalanced data  by giving more weight to datapoints which are not classified correctly... in this case minority class datapoints
Lets see how it performs

In [14]:
dfd_x = dfd.iloc[:,1:]
dfd_y = dfd.iloc[:,0]

In [15]:
dfd_tr_x,dfd_ts_x,dfd_tr_y,dfd_ts_y = train_test_split(dfd_x,dfd_y, test_size = 0.25,shuffle = True , random_state = 13)

AdaBoostClassifier(base_estimator=None, *, n_estimators=50, learning_rate=1.0, algorithm='SAMME.R', random_state=None)

- base_estimator - model that u wnat to have as classifier , default is decision tree
- n_estimator - number of stumps/tress / classifiaction learners you want to build
- learning rate - Weight applied to each classifier at each boosting iteration. A higher learning rate increases the contribution of each classifier. There is a trade-off between the learning_rate and n_estimators parameters.


In [16]:
#Creating a adaboost model
mark1 = AdaBoostClassifier(n_estimators = 100)

In [17]:
mark1.fit(dfd_tr_x,dfd_tr_y)

AdaBoostClassifier(n_estimators=100)

In [18]:
pred_tr = mark1.predict(dfd_tr_x)
pred_ts = mark1.predict(dfd_ts_x)

In [19]:
np.mean(pred_tr == dfd_tr_y)

0.7844444444444445

In [20]:
np.mean(pred_ts == dfd_ts_y)

0.8333333333333334

In [21]:
pd.crosstab(dfd_tr_y,pred_tr )

col_0,0,1
Tax_Inc_Risky,Unnamed: 1_level_1,Unnamed: 2_level_1
0,348,2
1,95,5


In [22]:
pd.crosstab(dfd_ts_y,pred_ts)

col_0,0,1
Tax_Inc_Risky,Unnamed: 1_level_1,Unnamed: 2_level_1
0,124,2
1,23,1


The minority accuracy sucks even for adaboost.....why...lets try final with staified kfold sampling

In [23]:
markx = AdaBoostClassifier()

In [24]:
cv = StratifiedKFold(n_splits = 10,shuffle=True, random_state = 13)

In [25]:
scores = cross_val_score(markx,dfd_x,dfd_y, scoring = "accuracy" , cv = cv)

In [26]:
print("\n Accuracy is ", np.mean(scores),np.std(scores))


 Accuracy is  0.7816666666666667 0.01166666666666666


In [27]:
scores

array([0.78333333, 0.8       , 0.78333333, 0.76666667, 0.8       ,
       0.78333333, 0.78333333, 0.76666667, 0.76666667, 0.78333333])

even after mainitaining the propartion of minority - majority the accuracy has not increased


Lets try oversampling the minority class
here the ammout is data is less ..so undersampling the data will result in loss of info and also considerably harm prediction models


### oversampling minority class


In [28]:
from imblearn.over_sampling import SMOTE

In [29]:
os = SMOTE(sampling_strategy = 0.75)

In [30]:
dfd_x1,dfd_y1 = os.fit_resample(dfd_x,dfd_y)

In [31]:
print(dfd_x.shape,dfd_x1.shape)

(600, 6) (833, 6)


In [32]:
dfd_tr_x1,dfd_ts_x1,dfd_tr_y1,dfd_ts_y1 = train_test_split(dfd_x1,dfd_y1, test_size = 0.25,shuffle = True , random_state = 13)

In [33]:
print(dfd_y.value_counts(),"\n",dfd_y1.value_counts())

0    476
1    124
Name: Tax_Inc_Risky, dtype: int64 
 0    476
1    357
Name: Tax_Inc_Risky, dtype: int64


In [34]:
#Creating a adaboost model
mark4 = AdaBoostClassifier(n_estimators = 100)

In [35]:
mark4.fit(dfd_tr_x1,dfd_tr_y1)

AdaBoostClassifier(n_estimators=100)

In [36]:
pred_tr1 = mark4.predict(dfd_tr_x1)
pred_ts1 = mark4.predict(dfd_ts_x1)

In [37]:
np.mean(pred_tr1 == dfd_tr_y1)

0.75

In [38]:
np.mean(pred_ts1 == dfd_ts_y1)

0.631578947368421

In [39]:
pd.crosstab(dfd_tr_y1,pred_tr1 )

col_0,0,1
Tax_Inc_Risky,Unnamed: 1_level_1,Unnamed: 2_level_1
0,308,61
1,95,160


In [40]:
pd.crosstab(dfd_ts_y1,pred_ts1)

col_0,0,1
Tax_Inc_Risky,Unnamed: 1_level_1,Unnamed: 2_level_1
0,83,24
1,53,49


Startified sampling doesn't help and oversampling minority with smote doesn't help....the data is too bad..
600 bservation with 200 one type too less....i think need to try on larger dataset..to see if things changes