In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
%matplotlib inline

In [2]:
df=pd.read_csv("salary.csv",nrows=5000)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 15 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   age             5000 non-null   int64 
 1   workclass       5000 non-null   object
 2   fnlwgt          5000 non-null   int64 
 3   education       5000 non-null   object
 4   education-num   5000 non-null   int64 
 5   marital-status  5000 non-null   object
 6   occupation      5000 non-null   object
 7   relationship    5000 non-null   object
 8   race            5000 non-null   object
 9   sex             5000 non-null   object
 10  capital-gain    5000 non-null   int64 
 11  capital-loss    5000 non-null   int64 
 12  hours-per-week  5000 non-null   int64 
 13  native-country  5000 non-null   object
 14  salary          5000 non-null   object
dtypes: int64(6), object(9)
memory usage: 586.1+ KB


In [3]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,salary
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
df.corr()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week
age,1.0,-0.079396,0.018112,0.063787,0.056413,0.039966
fnlwgt,-0.079396,1.0,-0.060958,-0.000698,-0.00162,-0.00617
education-num,0.018112,-0.060958,1.0,0.112823,0.096051,0.165382
capital-gain,0.063787,-0.000698,0.112823,1.0,-0.033439,0.071881
capital-loss,0.056413,-0.00162,0.096051,-0.033439,1.0,0.079426
hours-per-week,0.039966,-0.00617,0.165382,0.071881,0.079426,1.0


In [5]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [6]:
df.salary.unique()

array([' <=50K', ' >50K'], dtype=object)

In [7]:
df['salary']

0        <=50K
1        <=50K
2        <=50K
3        <=50K
4        <=50K
         ...  
4995     <=50K
4996      >50K
4997      >50K
4998     <=50K
4999     <=50K
Name: salary, Length: 5000, dtype: object

In [8]:
df.salary.values

array([' <=50K', ' <=50K', ' <=50K', ..., ' >50K', ' <=50K', ' <=50K'],
      dtype=object)

In [9]:
df.salary=pd.get_dummies(df['salary'],drop_first=True)

In [10]:
df.salary[:5]

0    0
1    0
2    0
3    0
4    0
Name: salary, dtype: uint8

In [11]:
df['salary'].value_counts()

0    3779
1    1221
Name: salary, dtype: int64

In [12]:
type(df.salary)

pandas.core.series.Series

In [13]:
df.salary.dtype

dtype('uint8')

In [14]:
df['salary']=pd.to_numeric(df['salary'])

In [15]:
df.salary.dtype

dtype('uint8')

In [16]:
df.columns

Index(['age', 'workclass', 'fnlwgt', 'education', 'education-num',
       'marital-status', 'occupation', 'relationship', 'race', 'sex',
       'capital-gain', 'capital-loss', 'hours-per-week', 'native-country',
       'salary'],
      dtype='object')

In [17]:
df['education-num'].unique()

array([13,  9,  7, 14,  5, 10, 12, 11,  4, 16, 15,  3,  6,  2,  1,  8],
      dtype=int64)

In [18]:
#sns.jointplot(df.fnlwgt,df.salary)

In [19]:
#sns.pairplot(df)

In [20]:
#workclass,education,occupation,relationship,race,native-country

In [21]:
df1=df.drop(['workclass','education','marital-status','occupation','relationship','race','native-country'],axis='columns')

In [22]:
df1.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary
0,39,77516,13,Male,2174,0,40,0
1,50,83311,13,Male,0,0,13,0
2,38,215646,9,Male,0,0,40,0
3,53,234721,7,Male,0,0,40,0
4,28,338409,13,Female,0,0,40,0


In [23]:
df1.sex.unique()

array([' Male', ' Female'], dtype=object)

In [24]:
dummy=pd.get_dummies(df1.sex,drop_first=True)
dummy

Unnamed: 0,Male
0,1
1,1
2,1
3,1
4,0
...,...
4995,0
4996,1
4997,1
4998,1


In [25]:
df2=pd.concat([df1,dummy],axis='columns')

In [26]:
df2.head()

Unnamed: 0,age,fnlwgt,education-num,sex,capital-gain,capital-loss,hours-per-week,salary,Male
0,39,77516,13,Male,2174,0,40,0,1
1,50,83311,13,Male,0,0,13,0,1
2,38,215646,9,Male,0,0,40,0,1
3,53,234721,7,Male,0,0,40,0,1
4,28,338409,13,Female,0,0,40,0,0


In [27]:
df2=df2.drop(['sex'],axis=1)

In [28]:
df2.head()

Unnamed: 0,age,fnlwgt,education-num,capital-gain,capital-loss,hours-per-week,salary,Male
0,39,77516,13,2174,0,40,0,1
1,50,83311,13,0,0,13,0,1
2,38,215646,9,0,0,40,0,1
3,53,234721,7,0,0,40,0,1
4,28,338409,13,0,0,40,0,0


In [29]:
df2.isna().sum()

age               0
fnlwgt            0
education-num     0
capital-gain      0
capital-loss      0
hours-per-week    0
salary            0
 Male             0
dtype: int64

In [30]:
X=df2.drop(['salary'],axis=1)
X.ndim

2

In [31]:
y=df2.salary
y.ndim

1

In [32]:
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size=0.2, random_state=20)

In [33]:
X_train[:3].values

array([[    26, 290286,      9,      0,      0,     40,      1],
       [    19,  63574,     10,      0,      0,     50,      1],
       [    60,  24215,      6,      0,      0,     10,      0]],
      dtype=int64)

In [34]:
scaler=MinMaxScaler()
X_train=scaler.fit_transform(X_train)
X_test=scaler.transform(X_test)

In [35]:
X_train[:3]

array([[0.12328767, 0.26726369, 0.53333333, 0.        , 0.        ,
        0.39795918, 1.        ],
       [0.02739726, 0.04366419, 0.6       , 0.        , 0.        ,
        0.5       , 1.        ],
       [0.5890411 , 0.00484555, 0.33333333, 0.        , 0.        ,
        0.09183673, 0.        ]])

In [42]:
model_params={
    'svc':{
        'model':SVC(gamma='auto'),
        'params':{
            'C':[1,10,20],
            'kernel':['rbf','linear']
        }
    },
    'logistic regression':{
        'model':LogisticRegression(),
        'params':{
            'C':[1,5,10]
        }
    },
    
    'random forest':{
        'model':RandomForestClassifier(),
        'params':{
            'n_estimators':[1,5,10]
        }
    },
    'Naive Bayes':{
        'model':BernoulliNB(),
        'params':{
            'alpha':[1,5,10]
        }
        
    },
    'decision tree':{
        'model':DecisionTreeClassifier(),
        'params':{
            'criterion':['gini','entropr','logloss']
        }
    }
}

In [43]:
scores=[]

for model_name,model_para in model_params.items():
    clf=GridSearchCV(model_para['model'],model_para['params'],cv=5,return_train_score=False)
    clf.fit(X_train,y_train)
    scores.append({
        'model_name':model_name,
        'best_score':clf.best_score_,
        'best_parameter':clf.best_params_
    })

Traceback (most recent call last):
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 903, in fit
    super().fit(
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 348, in fit
    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
KeyError: 'entropr'

Traceback (most recent call last):
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 903, in fit
    super().fit(
  File "C:\Users\Admin\anaconda3\lib\site-packages\sklearn\tree\_classes.py", line 348, in fit
    criterion = CRITERIA_CLF[self.criterion](self.n_outputs_,
KeyError: 'entropr'

Traceback 

In [44]:
scores
df_s=pd.DataFrame(scores)
df_s

Unnamed: 0,model_name,best_score,best_parameter
0,svc,0.815,"{'C': 20, 'kernel': 'rbf'}"
1,logistic regression,0.813,{'C': 10}
2,random forest,0.81475,{'n_estimators': 10}
3,Naive Bayes,0.782,{'alpha': 1}
4,decision tree,0.761,{'criterion': 'gini'}
