In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

# To partition the data
from sklearn.model_selection import train_test_split

# Importing library for Logistic Regression
from sklearn.linear_model import LogisticRegression

# Importing performance metrics: accuracy score and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

In [2]:
data_income = pd.read_csv("income.csv", na_values = [' ?'])
data = data_income.copy()
data2 = data.dropna(axis = 0)

In [3]:
print(data2.columns)

Index(['age', 'JobType', 'EdType', 'maritalstatus', 'occupation',
       'relationship', 'race', 'gender', 'capitalgain', 'capitalloss',
       'hoursperweek', 'nativecountry', 'SalStat'],
      dtype='object')


In [4]:
print(data2['SalStat'])

0         less than or equal to 50,000
1         less than or equal to 50,000
2                  greater than 50,000
3         less than or equal to 50,000
4         less than or equal to 50,000
                     ...              
31973     less than or equal to 50,000
31974     less than or equal to 50,000
31975     less than or equal to 50,000
31976     less than or equal to 50,000
31977     less than or equal to 50,000
Name: SalStat, Length: 30162, dtype: object


# Logistic Regression

### Reindexing the salary status to 0 and 1
Since M.L. algorithms cannot work with categorical data... they need to be converted to numbers

0 = less than 0r equal to 50000

1 = greater than 50000


#### Dataseries = Dataseries.map({1st value to be replaced: corres. replaced value ,  2nd value to be replaced: corres. replaced value})

In [5]:
data2['SalStat'] = data2['SalStat'].map({" less than or equal to 50,000":0," greater than 50,000":1})
print(data2['SalStat'])

0        0
1        0
2        1
3        0
4        0
        ..
31973    0
31974    0
31975    0
31976    0
31977    0
Name: SalStat, Length: 30162, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data2['SalStat'] = data2['SalStat'].map({" less than or equal to 50,000":0," greater than 50,000":1})


#### pd.get_dummies(dataframe, columns = [])
To convert categorical data into dummy variables (0= False, 1= True)

converts the categorical data into number of columns (depending on number of categories present in the column)... each category is split into a separate column

columns: list of columns to be encoded... if nothing is specified all columns are encoded

In [6]:
new_data = pd.get_dummies(data = data2, drop_first = True)
print(new_data)

       age  capitalgain  capitalloss  hoursperweek  SalStat  \
0       45            0            0            28        0   
1       24            0            0            40        0   
2       44            0            0            40        1   
3       27            0            0            40        0   
4       20            0            0            35        0   
...    ...          ...          ...           ...      ...   
31973   34          594            0            60        0   
31974   34            0            0            40        0   
31975   23            0            0            40        0   
31976   42            0            0            40        0   
31977   29            0            0            40        0   

       JobType_ Local-gov  JobType_ Private  JobType_ Self-emp-inc  \
0                       0                 1                      0   
1                       0                 0                      0   
2                       0        

### spliting the columns into independent (x) and dependent (y) variables

##### storing column names in a list

In [8]:
columns_list = list(new_data.columns)
print(len(columns_list))

95


##### separating input variables (i.e. excluding salary status which is a dependent variable)

In [9]:
features = list(set(columns_list) - set(['SalStat']))
print(features)
print(len(features))

['EdType_ Assoc-voc', 'race_ Other', 'race_ Asian-Pac-Islander', 'nativecountry_ Yugoslavia', 'occupation_ Handlers-cleaners', 'EdType_ 5th-6th', 'EdType_ Some-college', 'JobType_ Self-emp-not-inc', 'nativecountry_ Japan', 'nativecountry_ El-Salvador', 'nativecountry_ Holand-Netherlands', 'race_ White', 'relationship_ Not-in-family', 'EdType_ Bachelors', 'occupation_ Transport-moving', 'relationship_ Own-child', 'relationship_ Unmarried', 'EdType_ Doctorate', 'nativecountry_ Vietnam', 'nativecountry_ Taiwan', 'maritalstatus_ Never-married', 'occupation_ Other-service', 'nativecountry_ South', 'nativecountry_ China', 'nativecountry_ Trinadad&Tobago', 'maritalstatus_ Married-civ-spouse', 'EdType_ 1st-4th', 'gender_ Male', 'nativecountry_ Ecuador', 'EdType_ 12th', 'nativecountry_ Canada', 'nativecountry_ Puerto-Rico', 'occupation_ Craft-repair', 'nativecountry_ Columbia', 'occupation_ Protective-serv', 'nativecountry_ Iran', 'nativecountry_ Dominican-Republic', 'nativecountry_ Outlying-US

##### Storing output values (values of salary status) in y

In [10]:
y = new_data['SalStat'].values
print(y)

[0 0 1 ... 0 0 0]


##### Storing input values (from input parameters i.e. variables) in x

In [11]:
x = new_data[features].values
print(x)

[[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]]


### Splitting the data into train data and test data

###### train_x, test_x, train_y, test_y = train_test_split(input values, output values, test_size = ratio of the test size, random_state = 0 or 1)
random_state = 0 ... randomizer off, same set of data will be selected everytime the code is run

random_state = 1 ... randomizer on, different set of data will be selected everytime yje code is run 

In [12]:
train_x, test_x, train_y, test_y = train_test_split(x,y, test_size = 0.3, random_state = 0)

#### Making an instance of the model

In [13]:
logistic = LogisticRegression()

## Logistic regression
### METHODS:

##### .fit(training vector (X), target vector relative to X)
to train the model

##### .decision_function(X)
predict confidence scores for samples

##### .get_params ([T / F])
gets the parameters for this estimator

##### .set_params({dictionary of parameters})
to set the parameters

##### .predict(test_data)
to predict the outcome of test data from the trained model

### ATTRIBUTES:

##### .classes_
gives a list of class labels
##### .coef_
coefficient of features in decision function
##### .intercept_
intercept (bias or constant) added to the decision function

#### fitting the values for x and y (training the model)

In [14]:
logistic.fit(train_x, train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

#### to get the coefficients of the logistic regression model

In [15]:
logistic.coef_

array([[-3.85831278e-02, -8.40304337e-02, -1.18074801e-01,
        -5.24380980e-04, -3.39215406e-01, -1.40397600e-01,
        -3.95405787e-01, -5.83640449e-01, -4.81322328e-04,
        -3.96858551e-02, -8.91765543e-04, -3.25199236e-01,
        -4.98501276e-01,  7.22087431e-01, -1.95435632e-01,
        -9.40900020e-01, -5.75498692e-01,  2.48523487e-01,
        -3.37144069e-02,  1.94604386e-03, -1.34633441e+00,
        -7.40559284e-01, -2.65746119e-02, -2.24666699e-02,
        -4.49147804e-03,  1.13173089e+00, -7.17085491e-02,
         1.64048270e-01, -7.24589727e-03, -9.58289553e-02,
         3.59284054e-03, -4.13963548e-02, -3.21882539e-01,
        -2.02082302e-02, -1.42454586e-02, -2.71203971e-03,
        -2.57188921e-02, -7.41996225e-03, -3.90458009e-01,
        -1.20861477e-02,  3.16323050e-04,  9.06103599e-03,
        -3.88280066e-03,  6.83525647e-01, -2.03234141e-02,
        -3.27458558e-03, -4.06336296e-02, -3.29849156e-03,
        -8.13386519e-03, -2.44535773e-01,  2.46340208e-0

#### to get intercept value

In [16]:
logistic.intercept_

array([-1.04072231])

#### Prediction from test data

In [17]:
prediction = logistic.predict(test_x)
print(prediction)

[0 0 0 ... 0 0 0]


### Confusion matrix
1. it is a table used to evaluate the performance of the classification model
2. confusion matrix output gives the number of correct and incorrect predictions and sums up all the values classwise

#### confusion_matrix(actual data, predicted data)
1. diagonal values gives the total number of correctly predicted (classified) samples
2. off diagonal values gives the total number of incorrectly predicted (classified) samples

In [18]:
confusion_matrix = confusion_matrix(test_y, prediction)
print(confusion_matrix)

[[6292  531]
 [ 947 1279]]


1. 6291 (correct)... actually <= 50k and classified as <= 50k
2. 532 (incorrect)... actually <= 50k but classified as >50k
3. 947 (incorrect)... actually > 50k but classified as <= 50k
4. 1279 (correct)... actually > 50k and classified as >50k

### calculating the accuracy score
#### accuracy_score(actual, predicted)

In [19]:
accuracy_score = accuracy_score(test_y, prediction)
print(accuracy_score)

0.8366670350314952


#### getting the total no. of misclassified values

In [20]:
print(f"Misclassified Values: {(test_y != prediction).sum()}")

Misclassified Values: 1478


# Logistic Regression By removing Insignificant Variables

#### Reindexing the salary status to 0 and 1

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns

# To partition the data
from sklearn.model_selection import train_test_split

# Importing library for Logistic Regression
from sklearn.linear_model import LogisticRegression

# Importing performance metrics: accuracy score and confusion matrix
from sklearn.metrics import accuracy_score, confusion_matrix

In [137]:
#data2['SalStat'] = data2['SalStat'].map({" less than or equal to 50,000":0," greater than 50,000":1})
#print(data2['SalStat'])

In [22]:
data_income = pd.read_csv("income.csv", na_values = [' ?'])
data = data_income.copy()
data3 = data.dropna(axis = 0)

In [23]:
data3['SalStat'] = data3['SalStat'].map({" less than or equal to 50,000":0," greater than 50,000":1})
print(data3['SalStat'])

0        0
1        0
2        1
3        0
4        0
        ..
31973    0
31974    0
31975    0
31976    0
31977    0
Name: SalStat, Length: 30162, dtype: int64


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data3['SalStat'] = data3['SalStat'].map({" less than or equal to 50,000":0," greater than 50,000":1})


##### removing insignificant columns:

In [24]:
cols = ['gender','nativecountry', 'race', 'JobType']

In [25]:
data3 = data3.drop(cols, axis = 1)
new_data = pd.get_dummies(data = data3, drop_first = True)
print(new_data)

       age  capitalgain  capitalloss  hoursperweek  SalStat  EdType_ 11th  \
0       45            0            0            28        0             0   
1       24            0            0            40        0             0   
2       44            0            0            40        1             0   
3       27            0            0            40        0             0   
4       20            0            0            35        0             0   
...    ...          ...          ...           ...      ...           ...   
31973   34          594            0            60        0             0   
31974   34            0            0            40        0             0   
31975   23            0            0            40        0             0   
31976   42            0            0            40        0             0   
31977   29            0            0            40        0             0   

       EdType_ 12th  EdType_ 1st-4th  EdType_ 5th-6th  EdType_ 7th-8th  ...

### Following the same steps again

In [26]:
new_columns_list = list(new_data.columns)
print(len(new_columns_list))

44


In [28]:
new_features = list(set(new_columns_list) - set(['SalStat']))
print(new_features)
print(len(new_features))

['EdType_ Assoc-voc', 'EdType_ 5th-6th', 'occupation_ Handlers-cleaners', 'occupation_ Priv-house-serv', 'EdType_ Some-college', 'relationship_ Other-relative', 'relationship_ Wife', 'maritalstatus_ Married-spouse-absent', 'occupation_ Machine-op-inspct', 'relationship_ Not-in-family', 'EdType_ Bachelors', 'occupation_ Transport-moving', 'occupation_ Sales', 'relationship_ Own-child', 'EdType_ Doctorate', 'relationship_ Unmarried', 'occupation_ Other-service', 'maritalstatus_ Never-married', 'occupation_ Armed-Forces', 'EdType_ 7th-8th', 'maritalstatus_ Married-civ-spouse', 'EdType_ 11th', 'EdType_ 1st-4th', 'EdType_ 12th', 'maritalstatus_ Married-AF-spouse', 'age', 'EdType_ Preschool', 'occupation_ Craft-repair', 'occupation_ Tech-support', 'EdType_ HS-grad', 'EdType_ Masters', 'occupation_ Protective-serv', 'occupation_ Farming-fishing', 'maritalstatus_ Widowed', 'maritalstatus_ Separated', 'EdType_ Prof-school', 'EdType_ Assoc-acdm', 'occupation_ Exec-managerial', 'capitalgain', 'ca

In [29]:
new_y = new_data['SalStat'].values
print(new_y)

[0 0 1 ... 0 0 0]


In [30]:
new_x = new_data[new_features].values
print(new_x)

[[ 0  0  0 ... 28  0  0]
 [ 0  0  0 ... 40  0  0]
 [ 0  0  0 ... 40  1  0]
 ...
 [ 0  0  0 ... 40  0  0]
 [ 0  0  0 ... 40  0  0]
 [ 0  0  0 ... 40  1  0]]


In [31]:
new_train_x, new_test_x, new_train_y, new_test_y = train_test_split(new_x,new_y, test_size = 0.3, random_state = 0)

In [32]:
logistic = LogisticRegression()

In [33]:
logistic.fit(new_train_x, new_train_y)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [34]:
logistic.coef_

array([[-8.68279290e-02, -1.39086302e-01, -4.06342993e-01,
        -4.80907010e-02, -5.05365528e-01, -2.66725608e-01,
         2.82456334e-02, -5.27891789e-02, -4.57401118e-01,
        -5.18520186e-01,  6.49495298e-01, -2.16945225e-01,
         4.93135077e-02, -1.08710779e+00,  2.86119637e-01,
        -6.45615212e-01, -8.29389553e-01, -1.54563636e+00,
        -1.51841179e-03, -2.56772050e-01,  9.96722926e-01,
        -3.58670017e-01, -7.03811644e-02, -1.06081705e-01,
         1.70775019e-02, -2.56138675e-03, -2.63933083e-02,
        -3.87932274e-01,  8.60428910e-02, -1.06844049e+00,
         5.87500645e-01,  4.09546756e-02, -3.16295625e-01,
        -9.86143883e-02, -1.95738967e-01,  2.76531260e-01,
        -8.79748803e-02,  9.14928933e-01,  3.17475958e-04,
         6.52633501e-04,  5.50551806e-03,  7.25619845e-01,
        -1.90556424e-01]])

In [35]:
logistic.intercept_

array([-1.42174435])

In [36]:
new_prediction = logistic.predict(new_test_x)
print(prediction)

[0 0 0 ... 0 0 0]


In [37]:
new_confusion_matrix = confusion_matrix(new_test_y, new_prediction)
print(new_confusion_matrix)

[[6305  518]
 [ 983 1243]]


In [38]:
new_accuracy_score = accuracy_score(new_test_y, new_prediction)
print(new_accuracy_score)

0.8341253177146646


In [39]:
print(f"Misclassified Values: {(new_test_y != new_prediction).sum()}")

Misclassified Values: 1501


1. it is observed that there is a marginal dip in accuracy, this is because we are also dropping some of the data
2. since the accuracy is not affected significantly, and the data collection in be reduced significantly...we can keep this as the final model

# KNN Classifier Model
(To classify the records into any one of the salary status categories)

In [40]:
from sklearn.neighbors import KNeighborsClassifier

In [41]:
import matplotlib.pyplot as plt

#### creating an instance for K nearest neighbors classifier

n_neighbors = 5

specifying the k value... so that it will take 5 neighbouring records and it will take the majority of 5 neighbors and clasify the new data

In [42]:
KNN_classifier = KNeighborsClassifier(n_neighbors = 5)

#### fitting the values for X and Y

In [43]:
KNN_classifier.fit(new_train_x, new_train_y)

KNeighborsClassifier()

#### Predicting the test values with the model

In [44]:
KNN_prediction = KNN_classifier.predict(new_test_x)

#### Performance metric check

In [45]:
KNN_confusion_matrix = confusion_matrix(new_test_y, KNN_prediction)
print( KNN_confusion_matrix)

[[6183  640]
 [ 861 1365]]


In [46]:
KNN_accuracy_score = accuracy_score(new_test_y, KNN_prediction)
print(KNN_accuracy_score)

0.8341253177146646


In [47]:
print(f"Misclassified Values: {(new_test_y != KNN_prediction).sum()}")

Misclassified Values: 1501


#### Effect of k value on classifier

##### Calculating error for k values between 1 and 20

In [48]:
Misclassified_samples = []
for i in range(1,20):
    knn = KNeighborsClassifier(n_neighbors = i)
    knn.fit(new_train_x, new_train_y)
    pred_i = knn.predict(new_test_x)
    acc_i = accuracy_score(new_test_y, pred_i)
    print(f"for k = {i}, accuracy is: {acc_i}")
    Misclassified_samples.append((new_test_y != pred_i).sum())
print(Misclassified_samples)

for k = 1, accuracy is: 0.8104762957232844
for k = 2, accuracy is: 0.8335727704718754
for k = 3, accuracy is: 0.8273842413526357
for k = 4, accuracy is: 0.8369985633771687
for k = 5, accuracy is: 0.8341253177146646
for k = 6, accuracy is: 0.8414189413194828
for k = 7, accuracy is: 0.8376616200685159
for k = 8, accuracy is: 0.8403138468339043
for k = 9, accuracy is: 0.8387667145540944
for k = 10, accuracy is: 0.8430765830478506
for k = 11, accuracy is: 0.8378826389656315
for k = 12, accuracy is: 0.8394297712454415
for k = 13, accuracy is: 0.8407558846281357
for k = 14, accuracy is: 0.84208199801083
for k = 15, accuracy is: 0.8394297712454415
for k = 16, accuracy is: 0.841308431870925
for k = 17, accuracy is: 0.8404243562824621
for k = 18, accuracy is: 0.8428555641507349
for k = 19, accuracy is: 0.8409769035252515
[1715, 1506, 1562, 1475, 1501, 1435, 1469, 1445, 1459, 1420, 1467, 1453, 1441, 1429, 1453, 1436, 1444, 1422, 1439]


From Trial and error, it is observed that for k = 10,18 , the accuracy is the highest

In [49]:
KNN_classifier = KNeighborsClassifier(n_neighbors = 18)
KNN_classifier.fit(new_train_x, new_train_y)
KNN_prediction = KNN_classifier.predict(new_test_x)
KNN_confusion_matrix = confusion_matrix(new_test_y, KNN_prediction)
print( KNN_confusion_matrix)
KNN_accuracy_score = accuracy_score(new_test_y, KNN_prediction)
print(KNN_accuracy_score)
print(f"Misclassified Values: {(new_test_y != KNN_prediction).sum()}")

[[6414  409]
 [1013 1213]]
0.8428555641507349
Misclassified Values: 1422
