In [1]:
#import libraries
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
#import dataset
salary_data = pd.read_csv("SalaryData_Train.csv")
salary_test = pd.read_csv("SalaryData_Test.csv")
salary_data.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


## EDA

In [3]:
#NA values
salary_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [4]:
#Duplicate rows
salary_data[salary_data.duplicated()]

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
326,33,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
364,27,Private,Bachelors,13,Never-married,Craft-repair,Not-in-family,White,Male,0,0,50,United-States,<=50K
518,24,Private,HS-grad,9,Never-married,Handlers-cleaners,Unmarried,Black,Female,0,0,40,United-States,<=50K
525,24,Private,HS-grad,9,Never-married,Craft-repair,Own-child,White,Male,0,0,40,United-States,<=50K
619,33,Private,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,40,United-States,>50K
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
30110,47,Private,HS-grad,9,Married-civ-spouse,Craft-repair,Husband,White,Male,0,0,40,United-States,<=50K
30118,20,Private,HS-grad,9,Never-married,Machine-op-inspct,Own-child,White,Male,0,0,40,United-States,<=50K
30132,22,Private,Some-college,10,Never-married,Adm-clerical,Own-child,White,Male,0,0,40,United-States,<=50K
30157,40,Private,HS-grad,9,Married-civ-spouse,Machine-op-inspct,Husband,White,Male,0,0,40,United-States,>50K


In [5]:
#Delete duplicate rows
salary_data.drop_duplicates(inplace=True, ignore_index=True)

In [6]:
#descriptive statistics
salary_data.describe()

Unnamed: 0,age,educationno,capitalgain,capitalloss,hoursperweek
count,26903.0,26903.0,26903.0,26903.0,26903.0
mean,39.04914,10.144296,1215.443259,98.427499,41.164591
std,13.188825,2.620154,7805.366384,425.493211,12.347051
min,17.0,1.0,0.0,0.0,1.0
25%,29.0,9.0,0.0,0.0,40.0
50%,38.0,10.0,0.0,0.0,40.0
75%,48.0,13.0,0.0,0.0,45.0
max,90.0,16.0,99999.0,4356.0,99.0


###  Feature engineering

In [7]:
x = pd.get_dummies(salary_data.iloc[:,0:-1])
y = LabelEncoder().fit_transform(salary_data.iloc[:,-1])

In [8]:
#Univariate selection - Top 30 features out of 102
from sklearn.feature_selection import SelectKBest,chi2
test = SelectKBest(score_func=chi2,k=30)
fit_us = test.fit(x,y)

#Score dataframe
df = pd.DataFrame(data={"col":x.columns, "scores":fit_us.scores_})
a = (df.sort_values(by="scores",ascending=False)[0:30]).index
a

Int64Index([ 2,  3,  0,  4, 30, 49,  1, 32, 52, 38, 60, 50, 44, 24, 42, 26, 21,
            53,  8, 22, 28, 61, 54, 23, 57, 40, 35, 51, 13, 33],
           dtype='int64')

In [9]:
#Decision tree method
from sklearn.tree import DecisionTreeClassifier
model_dt = DecisionTreeClassifier()
model_dt.fit(x,y)
model_dt.feature_importances_

df2 = pd.DataFrame(data={"col":x.columns, "importance":model_dt.feature_importances_})
b = (df2.sort_values(by="importance",ascending=False)[0:30]).index
b

Int64Index([30,  0,  2,  1,  4,  3,  7, 38, 46,  9, 44,  6, 37, 99,  8, 59, 35,
             5, 61, 60, 42, 10, 48, 41, 54, 47, 21, 45, 57, 50],
           dtype='int64')

### Decision tree and univariate based feature selection are almost simialar, for further calculation consider decision tree based features

In [10]:
#Training data split
x_new = x.iloc[:,b]
y_new = y

In [11]:
#Testing data split
x_test = pd.get_dummies(salary_test.iloc[:,0:-1])
y_test = LabelEncoder().fit_transform(salary_test.iloc[:,-1]) 

x_test_new = x_test.iloc[:,b]
y_test_new = y_test

## Model

In [None]:
# Find optimum parameters using Gridsearch
# clf = SVC()
# para_grid = {"kernel":["poly", "rbf"], "C":[50,15,10,5],"degree":[2,3,4], "gamma":[10,5,3,1,0.1]}
# gsv = GridSearchCV(estimator=clf, param_grid=para_grid,cv=10)
# gsv.fit(x_new,y_new)

# After running for 10 hours i didn't get the results, its taking time i ran it for complete one day

In [15]:
#Model and train data score
clf = SVC(kernel= "poly", degree=3, C=10,class_weight="balanced")
clf.fit(x_new,y_new)
clf.score(x_new,y_new)

0.8017321488309854

In [16]:
#model score on test data
clf.score(x_test_new,y_test_new)

0.80265604249668

In [17]:
#Confusion matrix
from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test_new, clf.predict(x_test_new))
matrix

array([[11121,   239],
       [ 2733,   967]], dtype=int64)