In [44]:
import warnings
warnings.filterwarnings('ignore')


**Import Repository from UCI**

In [45]:
! pip install  ucimlrepo



**Step 1: Import Libraries**

In [95]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [96]:
from ucimlrepo import fetch_ucirepo

In [97]:
# fetch dataset
adult = fetch_ucirepo(id=2)

In [98]:
adult.data.features

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48837,39,Private,215419,Bachelors,13,Divorced,Prof-specialty,Not-in-family,White,Female,0,0,36,United-States
48838,64,,321403,HS-grad,9,Widowed,,Other-relative,Black,Male,0,0,40,United-States
48839,38,Private,374983,Bachelors,13,Married-civ-spouse,Prof-specialty,Husband,White,Male,0,0,50,United-States
48840,44,Private,83891,Bachelors,13,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male,5455,0,40,United-States


In [99]:
# data (as pandas dataframes)
X = adult.data.features
y = adult.data.targets

In [100]:
X.shape,y.shape

((48842, 14), (48842, 1))

In [101]:
X.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country'],
      dtype='object')

In [102]:
X['native-country'].value_counts()

native-country
United-States                 43832
Mexico                          951
?                               583
Philippines                     295
Germany                         206
Puerto-Rico                     184
Canada                          182
El-Salvador                     155
India                           151
Cuba                            138
England                         127
China                           122
South                           115
Jamaica                         106
Italy                           105
Dominican-Republic              103
Japan                            92
Guatemala                        88
Poland                           87
Vietnam                          86
Columbia                         85
Haiti                            75
Portugal                         67
Taiwan                           65
Iran                             59
Greece                           49
Nicaragua                        49
Peru         

We will remove native- country

In [103]:
X= X.drop(['native-country'], axis=1)

In [104]:
X.shape

(48842, 13)

In [105]:
y.value_counts()

income
<=50K     24720
<=50K.    12435
>50K       7841
>50K.      3846
Name: count, dtype: int64

In [106]:
y.replace({'<=50K': 0, '<=50K.':0, '>50K':1, '>50K.':1}, inplace= True)

In [107]:
y.value_counts()

income
0         37155
1         11687
Name: count, dtype: int64

In [108]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48842 entries, 0 to 48841
Data columns (total 13 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             48842 non-null  int64 
 1   workclass       47879 non-null  object
 2   fnlwgt          48842 non-null  int64 
 3   education       48842 non-null  object
 4   education-num   48842 non-null  int64 
 5   marital-status  48842 non-null  object
 6   occupation      47876 non-null  object
 7   relationship    48842 non-null  object
 8   race            48842 non-null  object
 9   sex             48842 non-null  object
 10  capital-gain    48842 non-null  int64 
 11  capital-loss    48842 non-null  int64 
 12  hours-per-week  48842 non-null  int64 
dtypes: int64(6), object(7)
memory usage: 4.8+ MB


In [60]:
X.isna().sum()

age                 0
workclass         963
fnlwgt              0
education           0
education-num       0
marital-status      0
occupation        966
relationship        0
race                0
sex                 0
capital-gain        0
capital-loss        0
hours-per-week      0
dtype: int64

In [61]:
X.dropna(inplace= True)

In [62]:
X.isna().sum()

age               0
workclass         0
fnlwgt            0
education         0
education-num     0
marital-status    0
occupation        0
relationship      0
race              0
sex               0
capital-gain      0
capital-loss      0
hours-per-week    0
dtype: int64

In [109]:
X_train, X_test, y_train, y_test= train_test_split(X,y, random_state=2529)

In [110]:
X.select_dtypes(include=['object', 'bool'])

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female
...,...,...,...,...,...,...,...
48837,Private,Bachelors,Divorced,Prof-specialty,Not-in-family,White,Female
48838,,HS-grad,Widowed,,Other-relative,Black,Male
48839,Private,Bachelors,Married-civ-spouse,Prof-specialty,Husband,White,Male
48840,Private,Bachelors,Divorced,Adm-clerical,Own-child,Asian-Pac-Islander,Male


In [111]:
X.select_dtypes(include=['object', 'bool']).columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex'],
      dtype='object')

**Save in as Categorical Columns**

In [112]:
categorical_columns= X.select_dtypes(include=['object', 'bool']).columns

In [113]:
X.select_dtypes(include=['int64', 'float64']).columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')

**Save in a Numerical columns**

In [114]:
numerical_columns= X.select_dtypes(include=['int64', 'float64']).columns

**For numerical there is another way as well**

In [115]:
X.describe().columns

Index(['age', 'fnlwgt', 'education-num', 'capital-gain', 'capital-loss',
       'hours-per-week'],
      dtype='object')

In [116]:
ohe= OneHotEncoder(drop='first')

In [117]:
ss= StandardScaler()

**Making Transfromation Packet**

In [118]:
ColumnTransformer(transformers= [('num', ss, numerical_columns ) ,  ('cat', ohe, categorical_columns)])

In [119]:
preprocessor=ColumnTransformer(transformers= [('num', ss, numerical_columns ) ,  ('cat', ohe, categorical_columns)])

**Create Pipeline**

In [120]:
model= Pipeline(steps=[('data_prep',preprocessor), ('binary_class',LogisticRegression())])

In [121]:
model.fit(X_train, y_train)

In [122]:
y_pred= model.predict(X_test)

In [123]:
accuracy_score(y_test, y_pred)

0.8506264843174187

In [124]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.88      0.93      0.90      9303
           1       0.73      0.59      0.65      2908

    accuracy                           0.85     12211
   macro avg       0.80      0.76      0.78     12211
weighted avg       0.84      0.85      0.84     12211



In [125]:
import joblib

In [127]:
joblib.dump(model, 'adult_classifier.pkl')


['adult_classifier.pkl']

In [128]:
joblib.load('adult_classifier.pkl')