### Prepare a classification model using SVM for salary data 

#### Data Description:

#### age -- age of a person
#### workclass	-- A work class is a grouping of work 
#### education	-- Education level of an individual
#### maritalstatus -- Marital status of an individual
#### occupation	 -- Occupation of an individual
#### relationship -- Relationship of the individual within the household (e.g. Husband, Wife, Own-child)
#### race --  Race of an Individual
#### sex --  Gender of an Individual
#### capitalgain --  profit received from the sale of an investment	
#### capitalloss	-- A decrease in the value of a capital asset
#### hoursperweek -- Number of hours worked per week
#### native -- Native of an individual
#### Salary -- salary of an individual


In [1]:
# Importing libraries
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix


In [2]:
# Load the training data from CSV and preview its first rows
data_train = pd.read_csv('SalaryData_Train.csv')
data_train.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [3]:
# Column dtypes and non-null counts of the training data
data_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 30161 entries, 0 to 30160
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            30161 non-null  int64 
 1   workclass      30161 non-null  object
 2   education      30161 non-null  object
 3   educationno    30161 non-null  int64 
 4   maritalstatus  30161 non-null  object
 5   occupation     30161 non-null  object
 6   relationship   30161 non-null  object
 7   race           30161 non-null  object
 8   sex            30161 non-null  object
 9   capitalgain    30161 non-null  int64 
 10  capitalloss    30161 non-null  int64 
 11  hoursperweek   30161 non-null  int64 
 12  native         30161 non-null  object
 13  Salary         30161 non-null  object
dtypes: int64(5), object(9)
memory usage: 3.2+ MB


In [4]:
# Load the test data from CSV and preview its first rows
data_test = pd.read_csv('SalaryData_Test.csv')
data_test.head()

Unnamed: 0,age,workclass,education,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,11th,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,HS-grad,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,Assoc-acdm,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,Some-college,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,10th,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [5]:
# Column dtypes and non-null counts of the test data
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15060 entries, 0 to 15059
Data columns (total 14 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   age            15060 non-null  int64 
 1   workclass      15060 non-null  object
 2   education      15060 non-null  object
 3   educationno    15060 non-null  int64 
 4   maritalstatus  15060 non-null  object
 5   occupation     15060 non-null  object
 6   relationship   15060 non-null  object
 7   race           15060 non-null  object
 8   sex            15060 non-null  object
 9   capitalgain    15060 non-null  int64 
 10  capitalloss    15060 non-null  int64 
 11  hoursperweek   15060 non-null  int64 
 12  native         15060 non-null  object
 13  Salary         15060 non-null  object
dtypes: int64(5), object(9)
memory usage: 1.6+ MB


In [6]:
# Frequency of each workclass category in the training data
data_train['workclass'].value_counts()

 Private             22285
 Self-emp-not-inc     2499
 Local-gov            2067
 State-gov            1279
 Self-emp-inc         1074
 Federal-gov           943
 Without-pay            14
Name: workclass, dtype: int64

In [7]:
# Frequency of each workclass category in the test data
data_test['workclass'].value_counts()

 Private             11021
 Self-emp-not-inc     1297
 Local-gov            1033
 State-gov             667
 Self-emp-inc          572
 Federal-gov           463
 Without-pay             7
Name: workclass, dtype: int64

In [8]:
# Check that the training and test data have the same categorical (object-dtype)
# columns, and that the test data holds no category value unseen in training.
train_object_cols = [cname for cname in data_train.columns if data_train[cname].dtype == 'object']
test_object_cols =  [cname for cname in data_test.columns  if data_test[cname].dtype == 'object']

print(train_object_cols)
print(test_object_cols)

if train_object_cols == test_object_cols:
    print('Equal')

# Tried with other method of equality
if np.array_equal(train_object_cols, test_object_cols):
    print('Again equal')

# Matching column names say nothing about the values inside the columns, so also
# report every category that occurs in the test data but not in the training data.
# Nothing is printed when the test categories are a subset of the training ones.
for cname in train_object_cols:
    if cname in test_object_cols:
        unseen = set(data_test[cname].unique()) - set(data_train[cname].unique())
        if unseen:
            print(f"{cname}: categories only in test data -> {sorted(unseen)}")

['workclass', 'education', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'native', 'Salary']
['workclass', 'education', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex', 'native', 'Salary']
Equal
Again equal


## Since the training and test datasets share the same structure, we will analyse the training data first and then apply the same transformations to the test data.

In [9]:
# Row counts per (education, educationno) pair in the training data,
# to see how the text label and the numeric code relate to each other
data_train.groupby(['education', 'educationno']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
education,educationno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10th,6,820,820,820,820,820,820,820,820,820,820,820,820
11th,7,1048,1048,1048,1048,1048,1048,1048,1048,1048,1048,1048,1048
12th,8,377,377,377,377,377,377,377,377,377,377,377,377
1st-4th,2,151,151,151,151,151,151,151,151,151,151,151,151
5th-6th,3,288,288,288,288,288,288,288,288,288,288,288,288
7th-8th,4,557,557,557,557,557,557,557,557,557,557,557,557
9th,5,455,455,455,455,455,455,455,455,455,455,455,455
Assoc-acdm,12,1008,1008,1008,1008,1008,1008,1008,1008,1008,1008,1008,1008
Assoc-voc,11,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307,1307
Bachelors,13,5044,5044,5044,5044,5044,5044,5044,5044,5044,5044,5044,5044


In [10]:
# Drop the `education` column: `educationno` already carries the same information
# in numeric form, so the text column is redundant.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run (a second run would otherwise raise KeyError).
data_train = data_train.drop(columns=['education'], errors='ignore')
data_train.head()

Unnamed: 0,age,workclass,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,State-gov,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [11]:
# Keep the list of categorical columns in sync with the frame after dropping
# `education`. Filtering (instead of list.remove) makes the cell idempotent:
# list.remove raises ValueError when the cell is run a second time.
train_object_cols = [cname for cname in train_object_cols if cname != 'education']

In [12]:
# Integer-encode every remaining categorical column of the training data.
# One LabelEncoder instance is re-fitted for each column in turn, so once the
# loop finishes `le` only holds the mapping of the last column processed.
le = LabelEncoder()
for col_name in train_object_cols:
    encoded_values = le.fit_transform(data_train[col_name])
    data_train[col_name] = encoded_values

data_train.head()

Unnamed: 0,age,workclass,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,39,5,13,4,0,1,4,1,2174,0,40,37,0
1,50,4,13,2,3,0,4,1,0,0,13,37,0
2,38,2,9,0,5,1,4,1,0,0,40,37,0
3,53,2,7,2,5,0,2,1,0,0,40,37,0
4,28,2,13,2,9,5,2,0,0,0,40,4,0


In [13]:
# Split the training data into target (y) and features (X).
# The target is selected by name, matching the name-based drop below, so the
# split does not silently depend on `Salary` being the last column.
y_train = data_train['Salary']
X_train = data_train.drop(columns=['Salary'])

In [14]:
# Preview the encoded training features
X_train.head()

Unnamed: 0,age,workclass,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,39,5,13,4,0,1,4,1,2174,0,40,37
1,50,4,13,2,3,0,4,1,0,0,13,37
2,38,2,9,0,5,1,4,1,0,0,40,37
3,53,2,7,2,5,0,2,1,0,0,40,37
4,28,2,13,2,9,5,2,0,0,0,40,4


In [15]:
# Preview the encoded training target
y_train.head()

0    0
1    0
2    0
3    0
4    0
Name: Salary, dtype: int32

### Applying the same transformations to the test data

In [16]:
# Row counts per (education, educationno) pair in the test data,
# to see how the text label and the numeric code relate to each other
data_test.groupby(['education', 'educationno']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,age,workclass,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
education,educationno,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
10th,6,403,403,403,403,403,403,403,403,403,403,403,403
11th,7,571,571,571,571,571,571,571,571,571,571,571,571
12th,8,200,200,200,200,200,200,200,200,200,200,200,200
1st-4th,2,71,71,71,71,71,71,71,71,71,71,71,71
5th-6th,3,161,161,161,161,161,161,161,161,161,161,161,161
7th-8th,4,266,266,266,266,266,266,266,266,266,266,266,266
9th,5,221,221,221,221,221,221,221,221,221,221,221,221
Assoc-acdm,12,499,499,499,499,499,499,499,499,499,499,499,499
Assoc-voc,11,652,652,652,652,652,652,652,652,652,652,652,652
Bachelors,13,2526,2526,2526,2526,2526,2526,2526,2526,2526,2526,2526,2526


In [17]:
# Drop the `education` column: `educationno` already carries the same information
# in numeric form, so the text column is redundant.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell safe to re-run (a second run would otherwise raise KeyError).
data_test = data_test.drop(columns=['education'], errors='ignore')
data_test.head()

Unnamed: 0,age,workclass,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,Private,7,Never-married,Machine-op-inspct,Own-child,Black,Male,0,0,40,United-States,<=50K
1,38,Private,9,Married-civ-spouse,Farming-fishing,Husband,White,Male,0,0,50,United-States,<=50K
2,28,Local-gov,12,Married-civ-spouse,Protective-serv,Husband,White,Male,0,0,40,United-States,>50K
3,44,Private,10,Married-civ-spouse,Machine-op-inspct,Husband,Black,Male,7688,0,40,United-States,>50K
4,34,Private,6,Never-married,Other-service,Not-in-family,White,Male,0,0,30,United-States,<=50K


In [18]:
# Remove the dropped `education` column from the list of categorical columns.
# Filtering (instead of list.remove) makes the cell idempotent: list.remove
# raises ValueError when the cell is run a second time.
test_object_cols = [cname for cname in test_object_cols if cname != 'education']

In [19]:
# Encode the test data with the label mapping learned from the TRAINING categories.
# Re-fitting the encoder on the test data would assign integers from the test
# categories alone, so any difference in the category sets would shift the codes
# and the model would see differently-numbered features at prediction time.
# `data_train` has already been encoded in place, so the raw training categories
# are read again from the source file.
train_categories = pd.read_csv('SalaryData_Train.csv', usecols=test_object_cols)
for column in test_object_cols:
    # Skip columns that are already numeric so the cell can be re-run safely.
    if data_test[column].dtype == 'object':
        le.fit(train_categories[column])
        # transform() raises ValueError on a category never seen in training,
        # which surfaces a train/test mismatch instead of hiding it.
        data_test[column] = le.transform(data_test[column])

data_test.head()

Unnamed: 0,age,workclass,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native,Salary
0,25,2,7,4,6,3,2,1,0,0,40,37,0
1,38,2,9,2,4,0,4,1,0,0,50,37,0
2,28,1,12,2,10,0,4,1,0,0,40,37,1
3,44,2,10,2,6,0,2,1,7688,0,40,37,1
4,34,2,6,4,7,1,4,1,0,0,30,37,0


In [20]:
# Split the test data into target (y) and features (X).
# The target is selected by name, matching the name-based drop below, so the
# split does not silently depend on `Salary` being the last column.
y_test = data_test['Salary']
X_test = data_test.drop(columns=['Salary'])

In [21]:
# Preview the encoded test features
X_test.head()

Unnamed: 0,age,workclass,educationno,maritalstatus,occupation,relationship,race,sex,capitalgain,capitalloss,hoursperweek,native
0,25,2,7,4,6,3,2,1,0,0,40,37
1,38,2,9,2,4,0,4,1,0,0,50,37
2,28,1,12,2,10,0,4,1,0,0,40,37
3,44,2,10,2,6,0,2,1,7688,0,40,37
4,34,2,6,4,7,1,4,1,0,0,30,37


In [22]:
# Preview the encoded test target
y_test.head()

0    0
1    0
2    1
3    1
4    0
Name: Salary, dtype: int32

In [23]:
# Shapes of the feature/target splits for the training and test data
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((30161, 12), (30161,), (15060, 12), (15060,))

## Building the model 

In [26]:
# Randomized hyper-parameter search for an RBF-kernel SVC.
# The grid holds 6 combinations (3 gamma values x 2 C values); n_iter=5 samples
# five of them, and each candidate is scored with cross-validation.
clf = SVC()
param_grid = [{'kernel':['rbf'],'gamma':[0.5,0.1,0.01],'C':[0.01,0.001] }]
# random_state fixes which combinations get sampled, so re-running the cell
# evaluates the same candidates and reports the same best parameters.
# NOTE(review): the features are not scaled and both C values are very small;
# worth confirming the fitted models are not just predicting the majority class.
gsv = RandomizedSearchCV(clf,param_grid,n_iter=5,verbose=300,random_state=42)
gsv.fit(X_train,y_train)


Fitting 5 folds for each of 5 candidates, totalling 25 fits
[CV 1/5; 1/5] START C=0.001, gamma=0.1, kernel=rbf..............................
[CV 1/5; 1/5] END C=0.001, gamma=0.1, kernel=rbf;, score=0.751 total time= 1.0min
[CV 2/5; 1/5] START C=0.001, gamma=0.1, kernel=rbf..............................
[CV 2/5; 1/5] END C=0.001, gamma=0.1, kernel=rbf;, score=0.751 total time= 1.0min
[CV 3/5; 1/5] START C=0.001, gamma=0.1, kernel=rbf..............................
[CV 3/5; 1/5] END C=0.001, gamma=0.1, kernel=rbf;, score=0.751 total time= 1.1min
[CV 4/5; 1/5] START C=0.001, gamma=0.1, kernel=rbf..............................
[CV 4/5; 1/5] END C=0.001, gamma=0.1, kernel=rbf;, score=0.751 total time= 1.1min
[CV 5/5; 1/5] START C=0.001, gamma=0.1, kernel=rbf..............................
[CV 5/5; 1/5] END C=0.001, gamma=0.1, kernel=rbf;, score=0.751 total time= 1.0min
[CV 1/5; 2/5] START C=0.01, gamma=0.1, kernel=rbf...............................
[CV 1/5; 2/5] END C=0.01, gamma=0.1, kernel=

RandomizedSearchCV(estimator=SVC(), n_iter=5,
                   param_distributions=[{'C': [0.01, 0.001],
                                         'gamma': [0.5, 0.1, 0.01],
                                         'kernel': ['rbf']}],
                   verbose=300)

In [27]:
# Best sampled hyper-parameter combination and its cross-validation score
gsv.best_params_ , gsv.best_score_

({'kernel': 'rbf', 'gamma': 0.1, 'C': 0.001}, 0.7510692627331288)

In [35]:
# Train the final RBF model with the best parameters found by the randomized
# search and measure its accuracy on the test data.
# The parameters are taken from `gsv.best_params_` instead of being copied by
# hand, so this cell cannot drift out of sync with the search result.
final_model = SVC(**gsv.best_params_)
final_model.fit(X_train, y_train)
y_pred_with_rbf = final_model.predict(X_test)
# accuracy_score expects (y_true, y_pred); same order as the linear-kernel cell.
score_rbf = accuracy_score(y_test, y_pred_with_rbf)
print(score_rbf)


0.7543160690571049


### Build a model with a linear kernel, reusing the C value found above
The linear kernel ends up giving a better accuracy score than the RBF model (gamma has no effect on a linear kernel).

In [32]:
# Cross-validate a linear-kernel SVC on the training data to get an idea of its score.
from sklearn.model_selection import KFold, cross_val_score
# `gamma` is not passed: SVC only uses it as the kernel coefficient for the
# 'rbf', 'poly' and 'sigmoid' kernels, so it was a misleading no-op here.
model_withlinearkernel = SVC(kernel='linear', C=0.001)
result = cross_val_score(model_withlinearkernel, X_train, y_train, cv=KFold(n_splits=5))
result.mean()

0.8054441014297579

In [34]:
# Fit the linear-kernel model on the full training data, predict the test data
# and print the resulting test accuracy.
y_predict_with_linearkernel = model_withlinearkernel.fit(X_train, y_train).predict(X_test)
score_linear = accuracy_score(y_true=y_test, y_pred=y_predict_with_linearkernel)
print(score_linear)

0.804316069057105
0.804316069057105
