# DS7331 Mini-Lab
#### Group 2: Hollie Gardner, Cleveland Johnson, Shelby Provost
[Dataset Source](https://archive-beta.ics.uci.edu/ml/datasets/census+income)<br/>
[Github Repo](https://github.com/ShelbyP27/DS7331-Project)

In [76]:
#import libraries
import pandas as pd
import numpy as np
import os

# data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# data preprocessing 
from sklearn.preprocessing import StandardScaler
from sklearn import preprocessing

#prediction models
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn import metrics as mt

### Loading and Prepping Data 


In [92]:
# Importing the census dataset using pandas
# Reading the CSV file after converting file to csv and removing superfluous spaces via Excel.
df = pd.read_csv('https://raw.githubusercontent.com/ShelbyP27/DS7331-Project/main/adult-data.csv')

# Getting a first look at the dataset
df.head()

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [93]:
#Cleaning up data set
df = df.replace(to_replace='?',value=np.nan) # replace '?' with NaN (not a number)
df.dropna(inplace=True) # Removing na values
df.duplicated(subset=None, keep='first')
df['income'] = df['income'].map({'<=50K': 0, '>50K': 1}).astype(int)

In [94]:
# Removing attributes not considered usefull
del df['workclass']
del df['fnlwgt']
del df['education']
del df['occupation']
del df['capital-gain']
del df['capital-loss']

#Replace Native Country with Immigrant atribute
if 'native-country' in df:
    df['immigrant'] = np.where(df['native-country']!= 'United-States', 1, 0)
    del df['native-country']
df.head()

Unnamed: 0,age,education-num,marital-status,relationship,race,sex,hours-per-week,income,immigrant
0,39,13,Never-married,Not-in-family,White,Male,40,0,0
1,50,13,Married-civ-spouse,Husband,White,Male,13,0,0
2,38,9,Divorced,Not-in-family,White,Male,40,0,0
3,53,7,Married-civ-spouse,Husband,Black,Male,40,0,0
4,28,13,Married-civ-spouse,Wife,Black,Female,40,0,1


In [95]:

# Standardize continous features
# df['age'] = sc.fit(df['age'])
# df['hours-per-week'] = sc.fit(df['hours-per-week'])
# df['education-num'] = sc.fit(df['hours-per-week'])

# One-hot encode Categorical 
tmp_df = pd.get_dummies(df['marital-status'], prefix = 'Marital')
df = pd.concat((df, tmp_df), axis =1)

tmp_df = pd.get_dummies(df['relationship'], prefix = 'Rel')
df = pd.concat((df, tmp_df), axis =1)

tmp_df = pd.get_dummies(df['race'], prefix = 'Race')
df = pd.concat((df, tmp_df), axis =1)


df['IsMale'] = df.sex == 'Male'
df.IsMale = df.IsMale.astype(np.int)

if 'sex' in df:
    del df['sex']
    
if 'marital-status' in df:
    del df['marital-status']
    
if'relationship' in df:
    del df['relationship']

if 'race' in df:
    del df['race']
    

In [96]:
# Separate features from the response
if 'income' in df:
    y = df['income'].values
    del df['income']
    X = df.values
# Train / Test split
sc = StandardScaler()
sc.fit(X)
x_train, x_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state=1)


In [102]:
#Logistic Regression

lr = LogisticRegression(C=1.0, random_state=1, solver='lbfgs')

lr.fit(x_train, y_train)
y_pred = lr.predict(x_test)

print('accuracy', mt.accuracy_score(y_test, y_pred))
print('confusion matrix\n', mt.confusion_matrix(y_test, y_pred))

accuracy 0.8198242996850654
confusion matrix
 [[4157  326]
 [ 761  789]]


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [103]:
#Linear SVM

train_svm = SVC(kernel = 'linear', C=1.0)

train_svm.fit(x_train, y_train)

y_pred = train_svm.predict(x_test)

print('Accuracy: %.3f' % accuracy_score(y_test, y_pred))
print('Classification Error: %.3f' % (1 - (accuracy_score(y_test, y_pred))))

Accuracy: 0.820
Classification Error: 0.180
