Project: Machine Learning of Salary and Demographic Factors
Name: Shaohua Feng
Supervisor:

In [None]:
from google.colab import drive

# Mount Google Drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
from sklearn.metrics import confusion_matrix
#from sklearn.metrics import plot_confusion_matrix

In [None]:
# from sklearn.metrics import plot_confusion_matrix doesn't work, so
!pip install plot_confusion_matrix

Collecting plot_confusion_matrix
  Downloading plot_confusion_matrix-0.0.2-py3-none-any.whl (3.6 kB)
Installing collected packages: plot_confusion_matrix
Successfully installed plot_confusion_matrix-0.0.2


In [None]:
#######################################
## Read data and data wrangling
#######################################
# read in data loaded in google drive
file_path_1 = '/content/drive/My Drive/adult.data'
adult_1= pd.read_csv(file_path_1,header=None)
file_path_2 = '/content/drive/My Drive/adult.test.txt'
adult_2= pd.read_csv(file_path_2,header=None)
adult=pd.concat([adult_1, adult_2], ignore_index=True)

In [None]:
# add column names
cols=['age','workclass','fnlwgt','education','education-num','marital-status','occupation','relationship','race','sex','capital-gain','capital-loss','hours-per-week','native-country','label']
adult.columns=cols

# add y column to data frame. target=1 for label '>50k' and y=0 for label '<=50k'
adult['target']=np.where(adult['label']==' >50K',1,0)
#adult['target'] = adult['target'].astype(bool)
#
print(adult.describe())
adult.dtypes
adult.info()

                age        fnlwgt  education-num  capital-gain  capital-loss  \
count  48842.000000  4.884200e+04   48842.000000  48842.000000  48842.000000   
mean      38.643585  1.896641e+05      10.078089   1079.067626     87.502314   
std       13.710510  1.056040e+05       2.570973   7452.019058    403.004552   
min       17.000000  1.228500e+04       1.000000      0.000000      0.000000   
25%       28.000000  1.175505e+05       9.000000      0.000000      0.000000   
50%       37.000000  1.781445e+05      10.000000      0.000000      0.000000   
75%       48.000000  2.376420e+05      12.000000      0.000000      0.000000   
max       90.000000  1.490400e+06      16.000000  99999.000000   4356.000000   

       hours-per-week        target  
count    48842.000000  48842.000000  
mean        40.422382      0.160538  
std         12.391444      0.367108  
min          1.000000      0.000000  
25%         40.000000      0.000000  
50%         40.000000      0.000000  
75%         4

In [None]:
# Data Manipulation: replace '?' with None
adult['workclass']=adult['workclass'].replace(' ?',None)
adult['occupation']=adult['occupation'].replace(' ?',None)
adult['native-country']=adult['native-country'].replace(' ?',None)

In [None]:
# charactegorical columns
cols_cat=['workclass','education','marital-status','occupation','relationship','race','sex','native-country']

for x in cols_cat:
  adult[x] = adult[x].astype('category')
  #print(x)

adult.dtypes

age                  int64
workclass         category
fnlwgt               int64
education         category
education-num        int64
marital-status    category
occupation        category
relationship      category
race              category
sex               category
capital-gain         int64
capital-loss         int64
hours-per-week       int64
native-country    category
label               object
target               int64
dtype: object

In [None]:
# delete missing value
adult_cleaned=adult.dropna()
print(len(adult_cleaned))

45222


In [None]:
# drop original label
del adult_cleaned['label']
adult_cleaned.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,target
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,0
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,0
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,0
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,0
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,0


In [None]:
############################
# Functions used by the machine learning algorithms
#Normalizing numeric data
def normalize(x):
  if x.dtype == 'int' or x.dtype == 'float':
    return ((x - min(x)) / (max(x) - min(x)))
  else:
    return x

In [None]:
#Converting categorical data to dummmy/one-hot variables
def dummy(x):
  cat_col=['workclass','education','marital-status','occupation','relationship','race','sex','native-country']
  x = pd.get_dummies(x, columns=cat_col, prefix = cat_col)
  return x
#print the dataset
#adult_new.head(5)


In [None]:
# Print out Accuracy
def printAcc(y_test,y_pred):
  from sklearn.metrics import accuracy_score

  accuracy = accuracy_score(y_test, y_pred)
  print(f"Accuracy: {accuracy}")

In [None]:
# print out confusion matrix
def printConfusion(y_test, y_pred):

  from sklearn.metrics import confusion_matrix

  cf=confusion_matrix(y_test, y_pred)
  print(cf)
  tn, fp, fn, tp=cf.ravel()
  print ("TP: ", tp,", FP: ", fp,", TN: ", tn,", FN:", fn)

In [None]:
#print precision, recall, and accuracy from the perspective of each of the class (0 and 1 for German dataset)
def printReport(y_test,y_pred):

  from sklearn.metrics import classification_report
  from sklearn import metrics

  print(classification_report(y_test, y_pred))

In [None]:
#################################################
# Decision tree
#################################################

# Decision tree without oversampling and normalization
# I want to use it as baseline to proof that oversampling and normalization improves decision tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Create a deep copy of the data frame adult_cleaned and name it adult_new
# adult_new is not normalized or oversampled
# I will use adult_new as the base line
adult_new=adult_cleaned.copy(deep=True)

#X = list(set(list(adult_cleaned)) - set(['target']))
X = adult_new.drop('target', axis=1)
y = adult_new['target']

# dummy variable
X=dummy(X)

# split train and test. test size is 0.35
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=42)

# create decision tree classifier
dtc = DecisionTreeClassifier(random_state=52)
# Train the model on the training data
dtc.fit(X_train, y_train)
# Make predictions on the test data
y_pred = dtc.predict(X_test)

print("Decision Tree for data without normalization and oversampling")
# Print Accuracy
printAcc(y_test,y_pred)

# Print Confusion Matrix
print("")
print("Confustion Matrix")
printConfusion(y_test, y_pred)

# Print Diagnosis
print("")
printReport(y_test,y_pred)



Decision Tree for data without normalization and oversampling
Accuracy: 0.7998483699772555

Confustion Matrix
[[11507  1697]
 [ 1471  1153]]
TP:  1153 , FP:  1697 , TN:  11507 , FN: 1471

              precision    recall  f1-score   support

           0       0.89      0.87      0.88     13204
           1       0.40      0.44      0.42      2624

    accuracy                           0.80     15828
   macro avg       0.65      0.66      0.65     15828
weighted avg       0.81      0.80      0.80     15828



In [None]:
# Decision tree with normalization but no oversampling

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split

# Create a deep copy of the data frame adult_cleaned and name it adult_new
# adult_new is not normalized or oversampled
# I will use adult_new as the base line
adult_dt_norm = adult_cleaned.copy(deep=True)
X = adult_dt_norm.drop('target', axis=1)
y = adult_dt_norm['target']

# Normalization the numerical columns
num_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
# why this code doesn't work?
#X[num_cols] = X[num_cols].apply(lambda x:normalize(x), axis=0)
X_num = X[num_cols]
X_num_normalized = X_num.apply(normalize, axis=0)
# combine the normalized numerical columns with the categorical columns
X = pd.concat([X_num_normalized, X.drop(num_cols, axis=1)], axis=1)

#print(X.head(5))
#print(X.tail(5))
# dummy variable
X=dummy(X)
print(X.head(5))
# split train and test. test size is 0.35
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.35, random_state=52)

# create decision tree classifier
dtc_norm = DecisionTreeClassifier(random_state=52)
# Train the model on the training data
dtc_norm.fit(X_train, y_train)
# Make predictions on the test data
y_pred = dtc_norm.predict(X_test)

print("Decision Tree for data with normalization but no oversampling")
# Print Accuracy
printAcc(y_test,y_pred)

# Print Confusion Matrix
print("")
print("Confustion Matrix")
printConfusion(y_test, y_pred)

# Print Diagnosis
print("")
printReport(y_test,y_pred)






        age    fnlwgt  education-num  capital-gain  capital-loss  \
0  0.301370  0.043350       0.800000       0.02174           0.0   
1  0.452055  0.047274       0.800000       0.00000           0.0   
2  0.287671  0.136877       0.533333       0.00000           0.0   
3  0.493151  0.149792       0.400000       0.00000           0.0   
4  0.150685  0.219998       0.800000       0.00000           0.0   

   hours-per-week  workclass_ Federal-gov  workclass_ Local-gov  \
0        0.397959                       0                     0   
1        0.122449                       0                     0   
2        0.397959                       0                     0   
3        0.397959                       0                     0   
4        0.397959                       0                     0   

   workclass_ Never-worked  workclass_ Private  ...  native-country_ Portugal  \
0                        0                   0  ...                         0   
1                        0

In [None]:
# Decision tree with normalization and random oversampling
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import RandomOverSampler

# Create a deep copy of the data frame adult_dt_norm_ros
# X is normalized and random oversampled
adult_dt_norm_ros = adult_cleaned.copy(deep=True)
X = adult_dt_norm_ros.drop('target', axis=1)
y = adult_dt_norm_ros['target']

# Normalization the numerical columns
num_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
X_num = X[num_cols]
X_num_normalized = X_num.apply(normalize, axis=0)
# combine the normalized numerical columns with the categorical columns
X = pd.concat([X_num_normalized, X.drop(num_cols, axis=1)], axis=1)

# dummy variable
X=dummy(X)

# split train and test. test size is 0.35
X_train_norm, X_test_norm, y_train_norm, y_test_norm = train_test_split(X, y, test_size=0.35, random_state=52)

#Random Oversampling
ros = RandomOverSampler(sampling_strategy='auto', random_state=52)
X_train_ros, y_train_ros = ros.fit_resample(X_train_norm, y_train_norm)

# create decision tree classifier for normalized data with random oversampling
dtc_norm_ros = DecisionTreeClassifier(random_state=52)
# Train the model on the training data
dtc_norm.fit(X_train_ros, y_train_ros)
# Make predictions on the test data
y_pred_norm_ros = dtc_norm.predict(X_test_norm)

print("Decision Tree for data with normalization and random oversampling")
# Print Accuracy
printAcc(y_test_norm,y_pred_norm_ros)

# Print Confusion Matrix
print("")
print("Confustion Matrix")
printConfusion(y_test_norm, y_pred_norm_ros)

# Print Diagnosis
print("")
printReport(y_test_norm,y_pred_norm_ros)

Decision Tree for data with normalization and random oversampling
Accuracy: 0.8037022997220116

Confustion Matrix
[[11656  1570]
 [ 1537  1065]]
TP:  1065 , FP:  1570 , TN:  11656 , FN: 1537

              precision    recall  f1-score   support

           0       0.88      0.88      0.88     13226
           1       0.40      0.41      0.41      2602

    accuracy                           0.80     15828
   macro avg       0.64      0.65      0.64     15828
weighted avg       0.80      0.80      0.80     15828



In [None]:
# Cross validated decision tree normalized no over sampling
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
# StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Create a deep copy of the data frame adult_cleaned and name it adult_c_dt_norm_ros
# X is normalized and random oversampled
adult_c_dt = adult_cleaned.copy(deep=True)
X = adult_c_dt.drop('target', axis=1)
y = adult_c_dt['target']

# Normalization the numerical columns
num_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
X_num = X[num_cols]
X_num_normalized = X_num.apply(normalize, axis=0)
# combine the normalized numerical columns with the categorical columns
X = pd.concat([X_num_normalized, X.drop(num_cols, axis=1)], axis=1)

# dummy variable
X=dummy(X)

# Create a Decision Tree classifier
c_dt= DecisionTreeClassifier()

# Create a pipeline with oversampling and the decision tree classifier
model = Pipeline([('ros', ros), ('dt', c_dt_rom)])

# Set up cross-validation using StratifiedKFold
cv = KFold(n_splits=10, shuffle=True, random_state=52)

# Perform cross-validation and obtain predicted labels
y_pred = cross_val_predict(c_dt,X,y, cv=cv)

# Calculate and print classification report
print("Classification Report:\n", classification_report(y, y_pred))

# Perform cross-validation
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)

# Print the mean accuracy across all folds
print("Mean Accuracy:", scores.mean())


Classification Report:
               precision    recall  f1-score   support

           0       0.89      0.88      0.88     37714
           1       0.41      0.43      0.42      7508

    accuracy                           0.80     45222
   macro avg       0.65      0.65      0.65     45222
weighted avg       0.81      0.80      0.81     45222

Mean Accuracy: 0.8046967325976435


In [None]:
# cross validated decision tree normalized random oversampling
# I use pipeline!
from sklearn.model_selection import cross_val_score, cross_val_predict, KFold
# StratifiedKFold
from sklearn.tree import DecisionTreeClassifier
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report
import pandas as pd

# Create a deep copy of the data frame adult_cleaned and name it adult_c_dt_norm_ros
# X is normalized and random oversampled
adult_c_dt_norm_ros = adult_cleaned.copy(deep=True)
X = adult_c_dt_norm_ros.drop('target', axis=1)
y = adult_c_dt_norm_ros['target']

# Normalization the numerical columns
num_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
X_num = X[num_cols]
X_num_normalized = X_num.apply(normalize, axis=0)
# combine the normalized numerical columns with the categorical columns
X = pd.concat([X_num_normalized, X.drop(num_cols, axis=1)], axis=1)

# dummy variable
X=dummy(X)

# Create a RandomOverSampler
ros = RandomOverSampler()

# Create a Decision Tree classifier
c_dt_rom = DecisionTreeClassifier()

# Apply oversampling to X and y
X_resampled, y_resampled = ros.fit_resample(X, y)


# Create a pipeline with oversampling and the decision tree classifier
model = Pipeline([('ros', ros), ('dt', c_dt_rom)])

# Set up cross-validation using StratifiedKFold
cv = KFold(n_splits=10, shuffle=True, random_state=52)

# Perform cross-validation and obtain predicted labels
y_pred = cross_val_predict(model, X_resampled,y_resampled, cv=cv)

# Calculate and print classification report
print("Classification Report:\n", classification_report(y_resampled, y_pred))

# Perform cross-validation
scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv)

# Print the mean accuracy across all folds
print("Mean Accuracy:", scores.mean())

#################### comments: how to print out all those stuff?

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.88      0.93     37714
           1       0.89      1.00      0.94     37714

    accuracy                           0.94     75428
   macro avg       0.94      0.94      0.94     75428
weighted avg       0.94      0.94      0.94     75428

Mean Accuracy: 0.8071071215644292


In [None]:
##########################################
# Logistic regression
##########################################

# Logistic regression with cross validation and no oversampling
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold

# Create a deep copy of the data frame adult_cleaned and name it adult_c_dt_norm_ros
# X is normalized and random oversampled
adult_c_log = adult_cleaned.copy(deep=True)
X = adult_c_log.drop('target', axis=1)
y = adult_c_log['target']

# Normalization the numerical columns
num_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
X_num = X[num_cols]
X_num_normalized = X_num.apply(normalize, axis=0)
# combine the normalized numerical columns with the categorical columns
X = pd.concat([X_num_normalized, X.drop(num_cols, axis=1)], axis=1)

# dummy variable
X=dummy(X)

# create a logistic model
model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed for convergence

# Set up 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=52)

# Perform cross-validation and get the accuracy scores for each fold
scores = cross_val_score(model, X, y, cv=kfold)

# Print the accuracy for each fold and the mean accuracy
for i, score in enumerate(scores, 1):
    print(f'Fold {i}: {score}')

print(f'Mean Accuracy: {scores.mean()}')

Fold 1: 0.8425823568428035
Fold 2: 0.853415874419633
Fold 3: 0.8423264042459089
Fold 4: 0.8491817779743477
Fold 5: 0.8524988942945599
Fold 6: 0.8429898275099513
Fold 7: 0.8516143299425033
Fold 8: 0.8398938522777533
Fold 9: 0.8403361344537815
Fold 10: 0.8423264042459089
Mean Accuracy: 0.8457165856207152


In [None]:
# Logistic regression with cross validation and random oversampling

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, KFold
from imblearn.over_sampling import RandomOverSampler

# Create a deep copy of the data frame adult_cleaned and name it adult_c_dt_norm_ros
# X is normalized and random oversampled
adult_c_log_ros = adult_cleaned.copy(deep=True)
X = adult_c_log_ros.drop('target', axis=1)
y = adult_c_log_ros['target']

# Normalization the numerical columns
num_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
X_num = X[num_cols]
X_num_normalized = X_num.apply(normalize, axis=0)
# combine the normalized numerical columns with the categorical columns
X = pd.concat([X_num_normalized, X.drop(num_cols, axis=1)], axis=1)

# dummy variable
X=dummy(X)

# create a logistic model
model = LogisticRegression(max_iter=1000)  # Increase max_iter if needed for convergence

# Set up 10-fold cross-validation
kfold = KFold(n_splits=10, shuffle=True, random_state=52)

# Create a RandomOverSampler
ros = RandomOverSampler()

# Apply oversampling to X and y
X_resampled, y_resampled = ros.fit_resample(X, y)

# Perform cross-validation and get the accuracy scores for each fold
scores = cross_val_score(model, X_resampled, y_resampled, cv=kfold)

# Print the accuracy for each fold and the mean accuracy
for i, score in enumerate(scores, 1):
    print(f'Fold {i}: {score}')

print(f'Mean Accuracy: {scores.mean()}')

Fold 1: 0.7816518626541163
Fold 2: 0.7850987670688055
Fold 3: 0.785496486808962
Fold 4: 0.7923902956383402
Fold 5: 0.7866896460294313
Fold 6: 0.7839056078483362
Fold 7: 0.779132970966459
Fold 8: 0.783375314861461
Fold 9: 0.7859984089101034
Fold 10: 0.7817555025192257
Mean Accuracy: 0.784549486330524


In [None]:
################################################
# KNN
################################################
# KNN with cross validation no oversampling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold

# Create a deep copy of the data frame adult_cleaned and name it adult_c_knn
adult_c_knn = adult_cleaned.copy(deep=True)
X = adult_c_knn.drop('target', axis=1)
y = adult_c_knn['target']

# Normalization the numerical columns
num_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
X_num = X[num_cols]
X_num_normalized = X_num.apply(normalize, axis=0)
# combine the normalized numerical columns with the categorical columns
X = pd.concat([X_num_normalized, X.drop(num_cols, axis=1)], axis=1)

# dummy
X=dummy(X)

# Create a KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)

# Set up 10 folds crossvalidation
kfold = KFold(n_splits=10, shuffle=True, random_state=52)

scores = cross_val_score(classifier, X, y, cv=kfold)

# Print the accuracy for each fold and the mean accuracy
for i, score in enumerate(scores, 1):
    print(f'Fold {i}: {score}')

print(f'Mean Accuracy: {scores.mean()}')



Fold 1: 0.8248949812071634
Fold 2: 0.8299801017024099
Fold 3: 0.8239716939407342
Fold 4: 0.8261831048208758
Fold 5: 0.8343653250773994
Fold 6: 0.8210968597965502
Fold 7: 0.8292790800530738
Fold 8: 0.82375055285272
Fold 9: 0.8290579389650597
Fold 10: 0.8250773993808049
Mean Accuracy: 0.8267657037796792


In [None]:
# KNN model with cross validation and random oversampling
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score, KFold
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline
from sklearn.metrics import classification_report

# Create a deep copy of the data frame adult_cleaned and name it adult_c_knn_ros
adult_c_knn_ros = adult_cleaned.copy(deep=True)
X = adult_c_knn_ros.drop('target', axis=1)
y = adult_c_knn_ros['target']

# Normalization the numerical columns
num_cols = ['age','fnlwgt','education-num','capital-gain','capital-loss','hours-per-week']
X_num = X[num_cols]
X_num_normalized = X_num.apply(normalize, axis=0)
# combine the normalized numerical columns with the categorical columns
X = pd.concat([X_num_normalized, X.drop(num_cols, axis=1)], axis=1)

# dummy
X=dummy(X)

# Create a KNeighborsClassifier
classifier = KNeighborsClassifier(n_neighbors=5)

# Set up 10 folds crossvalidation
kfold = KFold(n_splits=10, shuffle=True, random_state=52)

# Create a RandomOverSampler
ros = RandomOverSampler()

# Apply oversampling to X and y
X_resampled, y_resampled = ros.fit_resample(X, y)

scores = cross_val_score(classifier, X_resampled, y_resampled, cv=kfold)

# Print the accuracy for each fold and the mean accuracy
for i, score in enumerate(scores, 1):
    print(f'Fold {i}: {score}')

print(f'Mean Accuracy: {scores.mean()}')



Fold 1: 0.8459498873127402
Fold 2: 0.8515179636749304
Fold 3: 0.8462150338061779
Fold 4: 0.8508550974413364
Fold 5: 0.8471430465332096
Fold 6: 0.843298422378364
Fold 7: 0.8467453267930531
Fold 8: 0.8451544478324274
Fold 9: 0.8475205515778308
Fold 10: 0.8473879607531158
Mean Accuracy: 0.8471787738103187


Questions:

To do list:
1. pracitcs lambda. don't understand it


