In [111]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report


In [112]:
df = pd.read_csv('income_evaluation.csv')
df.head(5)
# df.info()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [113]:
# remove dublicate
df.isnull().sum()
plt.show()
df.drop_duplicates(inplace=True)


In [114]:
for col in df.columns:
    if 'fnlwgt' in col.lower():
        print(f"Dropping column: {col}")
        df.drop(col, axis=1, inplace=True)
df.head()

Dropping column:  fnlwgt


Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [115]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32537 entries, 0 to 32560
Data columns (total 14 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   age              32537 non-null  int64 
 1    workclass       32537 non-null  object
 2    education       32537 non-null  object
 3    education-num   32537 non-null  int64 
 4    marital-status  32537 non-null  object
 5    occupation      32537 non-null  object
 6    relationship    32537 non-null  object
 7    race            32537 non-null  object
 8    sex             32537 non-null  object
 9    capital-gain    32537 non-null  int64 
 10   capital-loss    32537 non-null  int64 
 11   hours-per-week  32537 non-null  int64 
 12   native-country  32537 non-null  object
 13   income          32537 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.7+ MB


In [116]:
df.loc[df["workclass"] == "?", "workclass"] = np.NaN
df.loc[df["occupation"] == "?", "occupation"] = np.NaN
df.loc[df["native-country"] == "?", "native-country"] = np.NaN


KeyError: 'workclass'

In [None]:
df.isnull().sum()
df.dropna(inplace=True)


In [None]:
df['income'] = df['income'].map({'<=50k': 0, '>50k': 1})
df.head()

Unnamed: 0,age,workclass,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,


In [None]:
# df.columns = df.columns.str.strip()
df['income'].value_counts()


Series([], Name: count, dtype: int64)

In [None]:
num_col = df.select_dtypes(exclude="object")
num_col.head()

Unnamed: 0,age,education-num,capital-gain,capital-loss,hours-per-week
0,39,13,2174,0,40
1,50,13,0,0,13
2,38,9,0,0,40
3,53,7,0,0,40
4,28,13,0,0,40


In [None]:
cat_col = df.select_dtypes(include="object")
cat_col.head()

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,State-gov,Bachelors,Never-married,Adm-clerical,Not-in-family,White,Male,United-States,<=50K
1,Self-emp-not-inc,Bachelors,Married-civ-spouse,Exec-managerial,Husband,White,Male,United-States,<=50K
2,Private,HS-grad,Divorced,Handlers-cleaners,Not-in-family,White,Male,United-States,<=50K
3,Private,11th,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,United-States,<=50K
4,Private,Bachelors,Married-civ-spouse,Prof-specialty,Wife,Black,Female,Cuba,<=50K


In [None]:
# #label encoding
from sklearn.preprocessing import LabelEncoder
# def lebel_encoder(a):
#     le = LabelEncoder()
#     df[a] = le.fit_transform(df[a])
le = LabelEncoder()
cat_col_encoded = cat_col.apply(le.fit_transform)
cat_col_encoded

Unnamed: 0,workclass,education,marital-status,occupation,relationship,race,sex,native-country,income
0,7,9,4,1,1,4,1,39,0
1,6,9,2,4,0,4,1,39,0
2,4,11,0,6,1,4,1,39,0
3,4,1,2,6,0,2,1,39,0
4,4,9,2,10,5,2,0,5,0
...,...,...,...,...,...,...,...,...,...
32556,4,7,2,13,5,4,0,39,0
32557,4,11,2,7,0,4,1,39,1
32558,4,11,6,1,4,4,0,39,0
32559,4,11,4,1,3,4,1,39,0


In [None]:
# cat_col.applymap(label_encoder)
cat_col.columns
lebel_list = {'workclass', 'education', 'marital-status', 'occupation', 'relationship','race','sex','native-country'}
for i in lebel_list:
    lebel_encoder(i)

SyntaxError: incomplete input (1981431050.py, line 5)

In [None]:
x = final_df.drop("income",axix=1)
y = final_df[" income"]

In [None]:
# from sklearn.model_selection import train_test_split
# x_train,x_test,y_train,y_test= train_test_split(x,y,test_size=0.2,random_state=42)

NameError: name 'x' is not defined

In [None]:
scaler = StandardScaler()
x_train_scaled = scaler.fit_tranform(x_train)
x_test_scaled = scaler.tranform(x_test)


In [None]:
from sklearn.svm
svc = 
svc.s

In [None]:
from sklearn.model_selection import GridSearchCV
grid = {
    "C": [0.01,0.2,1,10]
    "kernel": ["linear","rbf","play","sigmoid"],
    "degree": [1,3,5,7]
    "game" :  [0,0.1,1]
}
svm = Svc()
svm_cv = GridSearchCV(svm,grid)
svm_cv.fit(x_train_scaled,y_train)


In [None]:
svm_cv.best_params_
svm_cv.best_score_