In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score, confusion_matrix , classification_report

In [3]:
df = pd.read_csv("income_evaluation.csv")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32561 entries, 0 to 32560
Data columns (total 15 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32561 non-null  int64 
 1    workclass       32561 non-null  object
 2    fnlwgt          32561 non-null  int64 
 3    education       32561 non-null  object
 4    education-num   32561 non-null  int64 
 5    marital-status  32561 non-null  object
 6    occupation      32561 non-null  object
 7    relationship    32561 non-null  object
 8    race            32561 non-null  object
 9    sex             32561 non-null  object
 10   capital-gain    32561 non-null  int64 
 11   capital-loss    32561 non-null  int64 
 12   hours-per-week  32561 non-null  int64 
 13   native-country  32561 non-null  object
 14   income          32561 non-null  object
dtypes: int64(6), object(9)
memory usage: 3.7+ MB


In [4]:
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [5]:
df.duplicated().sum()

np.int64(24)

In [6]:
df.drop_duplicates(inplace=True)

In [7]:
df.duplicated().sum()

np.int64(0)

In [8]:
df.nunique()

age                   73
 workclass             9
 fnlwgt            21648
 education            16
 education-num        16
 marital-status        7
 occupation           15
 relationship          6
 race                  5
 sex                   2
 capital-gain        119
 capital-loss         92
 hours-per-week       94
 native-country       42
 income                2
dtype: int64

In [9]:
df.columns

Index(['age', ' workclass', ' fnlwgt', ' education', ' education-num',
       ' marital-status', ' occupation', ' relationship', ' race', ' sex',
       ' capital-gain', ' capital-loss', ' hours-per-week', ' native-country',
       ' income'],
      dtype='object')

In [10]:
np.unique(df[' income'])

array([' <=50K', ' >50K'], dtype=object)

In [11]:
df.drop(' fnlwgt', axis = 1,inplace = True)

In [12]:
df.columns = [i.replace(" ","")for i in df.columns]

In [13]:
df.columns

Index(['age', 'workclass', 'education', 'education-num', 'marital-status',
       'occupation', 'relationship', 'race', 'sex', 'capital-gain',
       'capital-loss', 'hours-per-week', 'native-country', 'income'],
      dtype='object')

In [14]:
df.loc[df["workclass"]==" ?","workclass"] = np.nan
df.loc[df["occupation"]==" ?","occupation"] = np.nan
df.loc[df["native-country"]==" ?","native-country"] = np.nan
df.loc[df["education"]==" ?","education"] = np.nan

In [15]:
df.isnull().sum()

age                  0
workclass         1836
education            0
education-num        0
marital-status       0
occupation        1843
relationship         0
race                 0
sex                  0
capital-gain         0
capital-loss         0
hours-per-week       0
native-country     582
income               0
dtype: int64

In [16]:
df.dropna(inplace=True)


In [17]:
## data type comversion
df["income"] = df["income"].map({" <=50K":0," >50K":1})
df.head()


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [18]:
df["income"].value_counts()

income
0    22633
1     7506
Name: count, dtype: int64

In [19]:
cat_col = df.select_dtypes(include="object")
cat_col

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba
...,...,...,...,...,...,...,...,...
32556,Private,Assoc-acdm,Married-civ-spouse,Tech-support,Wife,White,Female,United-States
32557,Private,HS-grad,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,United-States
32558,Private,HS-grad,Widowed,Adm-clerical,Unmarried,White,Female,United-States
32559,Private,HS-grad,Never-married,Adm-clerical,Own-child,White,Male,United-States


In [20]:
cat_col.columns

Index(['workclass', 'education', 'marital-status', 'occupation',
       'relationship', 'race', 'sex', 'native-country'],
      dtype='object')

In [21]:
num_col = df.select_dtypes(exclude="object")
num_col

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week,income
0,39,13,2174,0,40,0
1,50,13,0,0,13,0
2,38,9,0,0,40,0
3,53,7,0,0,40,0
4,28,13,0,0,40,0
...,...,...,...,...,...,...
32556,27,12,0,0,38,0
32557,40,9,0,0,40,1
32558,58,9,0,0,40,0
32559,22,9,0,0,20,0


In [22]:
# label encoding
from sklearn.preprocessing import LabelEncoder
# def label_encoder
le = LabelEncoder()
    # df[a] = le.fit_transform(df[a]) 
cat_col_encoded = cat_col.apply(le.fit_transform)
cat_col_encoded

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country
0,5,9,4,0,1,4,1,38
1,4,9,2,3,0,4,1,38
2,2,11,0,5,1,4,1,38
3,2,1,2,5,0,2,1,38
4,2,9,2,9,5,2,0,4
...,...,...,...,...,...,...,...,...
32556,2,7,2,12,5,4,0,38
32557,2,11,2,6,0,4,1,38
32558,2,11,6,0,4,4,0,38
32559,2,11,4,0,3,4,1,38


In [23]:
# cat_col.columns
# label_list =  ['workclass', 'education', 'marital-status', 'occupation',
#        'relationship', 'race', 'sex', 'native-country']
# for i in label_list:
#     label_encoder(i)



In [24]:
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [25]:
final_df = pd.concat([cat_col_encoded,num_col],axis=1)
final_df

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,age,education-num,capital-gain,capital-loss,hours-per-week,income
0,5,9,4,0,1,4,1,38,39,13,2174,0,40,0
1,4,9,2,3,0,4,1,38,50,13,0,0,13,0
2,2,11,0,5,1,4,1,38,38,9,0,0,40,0
3,2,1,2,5,0,2,1,38,53,7,0,0,40,0
4,2,9,2,9,5,2,0,4,28,13,0,0,40,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32556,2,7,2,12,5,4,0,38,27,12,0,0,38,0
32557,2,11,2,6,0,4,1,38,40,9,0,0,40,1
32558,2,11,6,0,4,4,0,38,58,9,0,0,40,0
32559,2,11,4,0,3,4,1,38,22,9,0,0,20,0


In [26]:
# x and y
x = final_df.drop("income",axis=1)
y = final_df["income"]

In [27]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=42)

In [28]:
x_test.shape

(6028, 13)

In [29]:
#standard scaling ---- normalize --- range
# from sklearn.preprocessing import StandardScaler
scaler =  StandardScaler()
x_train_scaled =scaler.fit_transform(x_train)
x_test_scaled = scaler.transform(x_test)



In [30]:
# mean = 0 , standard deviation ko 1 and standard deviation  ke kam aat aha

In [31]:
##SVC
from sklearn.svm import SVC
svc = SVC()
svc.fit(x_train_scaled,y_train)

0,1,2
,C,1.0
,kernel,'rbf'
,degree,3
,gamma,'scale'
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [32]:
svc.score(x_test_scaled,y_test)

0.840743198407432

In [33]:
#import grid search cv
from sklearn.model_selection import GridSearchCV
grid = {
    "C": [0.01,0.1,1,10],
    "kernel":["linear","rbf","play","sigmoid"],
    "degree" : [1,3,5,7],
    "gamma" : [0.01,1]
}
svm = SVC()
svm_cv = GridSearchCV(svm,grid,cv = 5) ##
svm_cv.fit(x_train_scaled,y_train)


160 fits failed out of a total of 640.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
160 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 859, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1356, in wrapper
    estimator._validate_params()
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 469, in _validate_params
    validate_parameter_constraints(
  File "c:\Users\user\AppData\Local\Programs\Python\Python312\Lib\s

0,1,2
,estimator,SVC()
,param_grid,"{'C': [0.01, 0.1, ...], 'degree': [1, 3, ...], 'gamma': [0.01, 1], 'kernel': ['linear', 'rbf', ...]}"
,scoring,
,n_jobs,
,refit,True
,cv,5
,verbose,0
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,C,10
,kernel,'rbf'
,degree,1
,gamma,0.01
,coef0,0.0
,shrinking,True
,probability,False
,tol,0.001
,cache_size,200
,class_weight,


In [34]:
svm.best_params_
svm_cv.best_score

AttributeError: 'SVC' object has no attribute 'best_params_'