In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import missingno as msno
%matplotlib inline
#from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

In [2]:
# Read the raw income data; row 0 of the CSV supplies the column names.
df = pd.read_csv('income_evaluation.csv', header=0)
# Preview the first five records.
df.head(5)

Unnamed: 0,age,workclass,fnlwgt,education,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male,2174,0,40,United-States,<=50K
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male,0,0,13,United-States,<=50K
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male,0,0,40,United-States,<=50K
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0,0,40,United-States,<=50K
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0,0,40,Cuba,<=50K


In [4]:
# (rows, columns) of the frame as loaded from the CSV
df.shape

(32561, 15)

In [None]:
# stripping spaces from the column names
df.columns = df.columns.str.replace(' ', '')

In [None]:
# droppig unwanted columns
df.drop(columns=['fnlwgt', 'capital-gain', 'capital-loss', 'relationship'], inplace=True)
df.head()

In [None]:
# renaming columns
df = df.rename(columns={'education-num': 'edu_years',
                        'marital-status': 'marital_status',
                        'hours-per-week': 'hours_per_week',
                        'native-country': 'native_country'})
df.head()

In [None]:
# looking at valibles' dt
df.info()

In [None]:
# # stripping whitespaces
df = df.apply(lambda x: x.str.strip() if x.dtype == "object" else x)

In [None]:
# checking dataframe's shape
df.shape

In [None]:
# looking at missing data
msno.matrix(df);

In [None]:
# checking value one
ones = df.hours_per_week.value_counts()
ones.loc[ones.index == 1]
# looks fine

In [None]:
# transforming the target variable into 1 and 0
df['income_code'] = 0
df.loc[(df.income == '<=50K') , 'income_code'] = 0 
df.loc[(df.income == '>50K') , 'income_code'] = 1 
df.head(10)

In [None]:
# transforming the sex variable into 1 and 0
df['sex_code'] = 0
df.loc[(df.sex == 'Male') , 'sex_code'] = 0 
df.loc[(df.sex == 'Female') , 'sex_code'] = 1 
df.head(10)

In [None]:
# dropping not coded sex and income 
df.drop(columns=['sex', 'income'], inplace=True)
df.head()

# First baseline LogReg

In [None]:
#split dataset in features and target variable
#feature_cols = ['age', 'workclass', 'education', 'edu_years','marital_status','occupation','race', 'sex', 'hours_per_week','native_country']
feature_cols = ['age', 'edu_years','hours_per_week', 'sex_code']

X = df[feature_cols] # Features
y = df.income_code # Target variable

In [None]:
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=0)

In [None]:
# import the class
from sklearn.linear_model import LogisticRegression

# instantiate the model (using the default parameters)
logreg = LogisticRegression()

# fit the model with data
logreg.fit(X_train,y_train)

#
y_pred=logreg.predict(X_test)

In [None]:
# import the metrics class
from sklearn import metrics
cnf_matrix = metrics.confusion_matrix(y_test, y_pred)
cnf_matrix

In [None]:
class_names=[0,1] # name  of classes
fig, ax = plt.subplots()
tick_marks = np.arange(len(class_names))
plt.xticks(tick_marks, class_names)
plt.yticks(tick_marks, class_names)
# create heatmap
sns.heatmap(pd.DataFrame(cnf_matrix), annot=True, cmap="YlGnBu" ,fmt='g')
ax.xaxis.set_label_position("top")
plt.tight_layout()
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual label')
plt.xlabel('Predicted label')

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))
print("Precision:",metrics.precision_score(y_test, y_pred))
print("Recall:",metrics.recall_score(y_test, y_pred))

In [None]:
y_pred_proba = logreg.predict_proba(X_test)[::,1]
fpr, tpr, _ = metrics.roc_curve(y_test,  y_pred_proba)
auc = metrics.roc_auc_score(y_test, y_pred_proba)
plt.plot(fpr,tpr,label="data 1, auc="+str(auc))
plt.legend(loc=4)
plt.show()

# First baseline DT

In [None]:
# Load libraries
import pandas as pd
from sklearn.tree import DecisionTreeClassifier # Import Decision Tree Classifier

In [None]:
#split dataset in features and target variable
#feature_cols = ['age', 'workclass', 'education', 'edu_years','marital_status','occupation','race', 'sex', 'hours_per_week','native_country']
feature_cols = ['age', 'edu_years','hours_per_week', 'sex_code']
X = df[feature_cols] # Features
y = df.income_code # Target variable

In [None]:
# Split dataset into training set and test set
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

In [None]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier()

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

In [None]:
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Create Decision Tree classifer object
clf = DecisionTreeClassifier(criterion="entropy", max_depth=5)

# Train Decision Tree Classifer
clf = clf.fit(X_train,y_train)

#Predict the response for test dataset
y_pred = clf.predict(X_test)

# Model Accuracy, how often is the classifier correct?
print("Accuracy:",metrics.accuracy_score(y_test, y_pred))

In [None]:
# Render the fitted decision tree `clf` as a PNG, save it to disk and show it inline.
# sklearn.externals.six is no longer shipped with scikit-learn, so the in-memory
# text buffer comes from the standard library instead.
from io import StringIO
from sklearn.tree import export_graphviz
from IPython.display import Image
import pydotplus

# export_graphviz writes the DOT description of the tree into this buffer.
dot_data = StringIO()
export_graphviz(clf, out_file=dot_data,
                filled=True, rounded=True,
                special_characters=True,
                feature_names=feature_cols,
                class_names=['0', '1'])
graph = pydotplus.graph_from_dot_data(dot_data.getvalue())
graph.write_png('income.png')
Image(graph.create_png())

# Fixing botched values

## Exploring 'workclass'

In [None]:
print(df.workclass.value_counts())
sns.countplot(y=df['workclass'], data=df, orient="h");
# need to deal with ? values

In [None]:
df.shape

In [None]:
# Drop useless values
df.drop(df.loc[df['workclass']==' Without-pay'].index, inplace=True)
df.shape

In [None]:
# Drop useless values
df.drop(df.loc[df['workclass']==' Never-worked'].index, inplace=True)
df.shape

In [None]:
print(df.workclass.value_counts())
sns.countplot(y=df['workclass'], data=df, orient="h");

In [None]:
df.drop(df.loc[df['workclass']=='Without-pay'].index, inplace=True)

## Exploring 'education'

In [None]:
print(df.education.value_counts())
sns.countplot(y=df['education'], data=df, orient="h");


In [None]:
# remove preschool values
df.drop(df.loc[df['education']=='Preschool'].index, inplace=True)
df.shape

In [None]:
print(df.education.value_counts())
sns.countplot(y=df['education'], data=df, orient="h");

## Exploring 'marital_status'

In [None]:
print(df.marital_status.value_counts())
sns.countplot(y=df['marital_status'], data=df, orient="h");
# combine divorced and seaprated
# combine widowed and married spouse absent

In [None]:
# remove married-af-spouse
df.drop(df.loc[df['marital_status']=='Married-AF-spouse'].index, inplace=True)
df.shape

In [None]:
print(df.marital_status.value_counts())
sns.countplot(y=df['marital_status'], data=df, orient="h");

## Exploring 'occupation'

In [None]:
print(df.occupation.value_counts())
sns.countplot(y=df['occupation'], data=df, orient="h");
# need to deal with ? values
# combine Priv-house-serv and Handlers-cleaners
# combine Other-service, Protective-serv 

In [None]:
# remove armed forces
df.drop(df.loc[df['occupation']=='Armed-Forces'].index, inplace=True)
df.shape

In [None]:
print(df.occupation.value_counts())
sns.countplot(y=df['occupation'], data=df, orient="h");

## Exploring 'race'

In [None]:
print(df.race.value_counts())
sns.countplot(y=df['race'], data=df);
# very imbalanced
# combine Other and Asian-Pac-Islander, Amer-Indian-Eskimo? or reduce white ???

## Exploring 'native_country'

In [None]:
df.native_country.value_counts()
#sns.countplot(y=df['native_country'], data=df)
# might need to create a different category (regions)

## Exploring 'income'

In [None]:
print(df.income_code.value_counts())
sns.countplot(y=df['income_code'], data=df);
# inbalanced

In [None]:
print(df.sex_code.value_counts())
sns.countplot(y=df['sex_code'], data=df);

In [None]:
questions_and = df.loc[(df['workclass']=='?')
                       #&(df['education']=='?')
                       #&(df['edu_years']=='?')
                       #&(df['marital_status']=='?')
                       &(df['occupation']=='?')
                       #&(df['race']=='?')
                       #&(df['sex']=='?')
                       #&(df['hours_per_week']=='?')
                       &(df['native_country']=='?')]
                       #&(df['income']=='?')]
questions_and.shape
# remove these 27 values

In [None]:
df.shape

In [None]:
# Drop useless values
df.drop(df.loc[(df['workclass']=='?')&(df['occupation']=='?')&(df['native_country']=='?')].index, inplace=True)
df.shape

In [None]:
questions_and = df.loc[(df['workclass']=='?')
                       #&(df['education']==' ?')
                       #&(df['edu_years']==' ?')
                       #&(df['marital_status']==' ?')
                       &(df['occupation']=='?')]
                       #&(df['race']==' ?')
                       #&(df['sex']==' ?')
                       #&(df['hours_per_week']==' ?')
                       #&(df['native_country']==' ?')]
                       #&(df['income']==' ?')]
questions_and.shape
# nohting to remove here

In [None]:
questions_or = df.loc[(df['workclass']=='?')
                       |(df['education']=='?')
                       #&(df['edu_years']=='?')
                       |(df['marital_status']=='?')
                       |(df['occupation']=='?')
                       |(df['race']=='?')
                       #|(df['sex_code']=='?')
                       #&(df['hours_per_week']=='?')
                       |(df['native_country']=='?')]
                       #|(df['income_code']=='?')]
questions_or.shape

In [None]:
questions_class_job = df.loc[#(df['age']==' ?')
                       (df['workclass']=='?')
                       #&(df['education']==' ?')
                       #&(df['edu_years']==' ?')
                       #&(df['marital_status']==' ?')
                       | (df['occupation']=='?')]
                       #&(df['race']==' ?')
                       #&(df['sex']==' ?')
                       #&(df['hours_per_week']==' ?')
                       #|(df['native_country']==' ?')]
                       #&(df['income']==' ?')]
questions_class_job.shape
# try and fill in the missing values ?

In [None]:
questions_country = df.loc[(df['native_country']=='?')]
questions_country.shape
# remove if country is missing

In [None]:
# Drop no country values
df.drop(df.loc[df['native_country']=='?'].index, inplace=True)
df.shape

In [None]:
# Drop useless values from both workclass and occupation columns
df.drop(df.loc[(df['workclass']=='?')&(df['occupation']=='?')].index, inplace=True)
df.shape

In [None]:
print(df.shape)
print(df.income_code.value_counts())
sns.countplot(y=df['income_code'], data=df);

# done with ? signs

In [None]:
df.drop(df.loc[(df['workclass']=='?')
                       |(df['education']=='?')
                       #&(df['edu_years']=='?')
                       |(df['marital_status']=='?')
                       |(df['occupation']=='?')
                       |(df['race']=='?')
                       #|(df['sex_code']=='?')
                       #&(df['hours_per_week']=='?')
                       |(df['native_country']=='?')].index, inplace=True)
                       #|(df['income_code']=='?')]
df.shape

## Dealing with strange values

In [None]:
# reforming native country values into US vs regions
df.native_country.value_counts()

In [None]:
# Grouping the country variable
df.loc[(df.native_country == 'United-States') 
       | (df.native_country == 'Puerto-Rico')
       | (df.native_country == 'Outlying-US(Guam-USVI-etc)'),
       'native_country'] = 'US'
df.loc[(df.native_country == "Germany")
       | (df.native_country == "England")
       | (df.native_country == "Italy")
       | (df.native_country == "Poland")
       | (df.native_country == "Portugal")
       | (df.native_country == "France")
       | (df.native_country == "Greece")
       | (df.native_country == "Ireland")
       | (df.native_country == "Yugoslavia")
       | (df.native_country == "Hungary")
       | (df.native_country == "Scotland")
       | (df.native_country == "Holand-Netherlands"), 'native_country'] = 'Europe'
df.loc[(df.native_country == 'Canada') 
       | (df.native_country == 'Mexico')
       | (df.native_country == 'El-Salvador')
       | (df.native_country == 'Nicaragua')
       | (df.native_country == 'Dominican-Republic')
       | (df.native_country == 'Dominican-Republic'), 'native_country'] = 'North_AM'
df.loc[(df.native_country == 'Mexico')
       | (df.native_country == 'El-Salvador')
       | (df.native_country == 'Nicaragua')
       | (df.native_country == 'Dominican-Republic')
       | (df.native_country == 'Dominican-Republic')
       | (df.native_country == 'Cuba')
       | (df.native_country == 'Jamaica')
       | (df.native_country == 'Guatemala')
       | (df.native_country == 'Haiti')
       | (df.native_country == 'Honduras'), 'native_country'] = 'Central_AM'
df.loc[(df.native_country == 'Columbia') 
       | (df.native_country == 'Peru')
       | (df.native_country == 'Ecuador')
       | (df.native_country == 'Trinadad&Tobago')
       | (df.native_country == 'Cambodia'), 'native_country'] = 'South_AM'
df.loc[(df.native_country == 'China')
       | (df.native_country == 'Japan')
       | (df.native_country == 'Taiwan')
       | (df.native_country == 'Hong')
       | (df.native_country == 'India'), 'native_country'] = 'Asia'
df.loc[(df.native_country == 'Philippines')
       | (df.native_country == 'Vietnam')
       | (df.native_country == 'Laos')
       | (df.native_country == 'Thailand'), 'native_country'] = 'SEA'
df.loc[df.native_country == 'Iran', 'native_country'] = 'Middle_East'

In [None]:
# dropping useless values
df.drop(df.loc[df['native_country']=='South'].index, inplace=True)
df.drop(df.loc[df['native_country']=='?'].index, inplace=True)

In [None]:
print(df.native_country.value_counts())
sns.countplot(y=df['native_country'], data=df);

In [None]:
# reforming race values into white, black vs other
df.loc[(df.race == 'Asian-Pac-Islander') 
       | (df.race == 'Amer-Indian-Eskimo'), 'race'] = 'Other'

In [None]:
print(df.race.value_counts())
sns.countplot(y=df['race'], data=df);

In [None]:
# reforming marital status values
df.loc[(df.marital_status == 'Separated') , 'marital_status'] = 'Divorced'
df.loc[(df.marital_status == 'Married-spouse-absent') , 'marital_status'] = 'Widowed'

In [None]:
df.marital_status.value_counts()

In [None]:
print(df.marital_status.value_counts())
sns.countplot(y=df['marital_status'], data=df, orient="h");

In [None]:
# reforming occupation values ( combine Other-service, Protective-serv)
df.loc[(df.occupation == 'Priv-house-serv') , 'occupation'] = 'Handlers-cleaners'
df.loc[(df.occupation == 'Protective-serv') , 'occupation'] = 'Other-service'

In [None]:
df.occupation.value_counts()

In [None]:
print(df.occupation.value_counts())
sns.countplot(y=df['occupation'], data=df, orient="h");

# Final variable visualisations

In [None]:
print(df.workclass.value_counts())
sns.countplot(y=df['workclass'], data=df, orient="h");

In [None]:
df.loc[(df.education == '12th') , 'education'] = 'HS-grad'
df.loc[(df.education == '9th')|(df.education == '10th')|(df.education == '11th') , 'education'] = '9th-11th'
print(df.education.value_counts())
sns.countplot(y=df['education'], data=df, orient="h");

In [None]:
df.loc[(df.marital_status == 'Divorced')|(df.marital_status == 'Widowed') , 'marital_status'] = 'Divorced/Widowed'
print(df.marital_status.value_counts())
sns.countplot(y=df['marital_status'], data=df, orient="h");

In [None]:
print(df.occupation.value_counts())
sns.countplot(y=df['occupation'], data=df, orient="h");

In [None]:
print(df.race.value_counts())
sns.countplot(y=df['race'], data=df);

In [None]:
print(df.native_country.value_counts())
sns.countplot(y=df['native_country'], data=df);

In [None]:
print(df.sex_code.value_counts())
sns.countplot(y=df['sex_code'], data=df);

In [None]:
print(df.shape)
print(df.income_code.value_counts())
sns.countplot(y=df['income_code'], data=df);

In [None]:
df.to_csv('data_cleaned.csv')

# Outlier detection

In [None]:
from sklearn.ensemble import IsolationForest
from sklearn.covariance import EllipticEnvelope
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler
from sklearn.svm import OneClassSVM
from sklearn.neighbors import LocalOutlierFactor

In [None]:
sns.boxplot(x=df.age);

In [None]:
sns.boxplot(x=df.edu_years);

In [None]:
sns.boxplot(x=df.hours_per_week);

In [None]:
# isolation forrest
cols = ['age', 'edu_years', 'hours_per_week']

In [None]:
fig, axs = plt.subplots(1, 3, figsize=(52, 22), facecolor='w', edgecolor='k')
axs = axs.ravel()

for i, column in enumerate(cols):
    isolation_forest = IsolationForest(behaviour="new", contamination='auto')
    isolation_forest.fit(df[column].values.reshape(-1,1))
    
    xx = np.linspace(df[column].min(), df[column].max(), len(df)).reshape(-1,1)
    anomaly_score = isolation_forest.decision_function(xx)
    outlier = isolation_forest.predict(xx)
    
    axs[i].plot(xx, anomaly_score, label='anomaly score')
    axs[i].fill_between(xx.T[0], np.min(anomaly_score), np.max(anomaly_score), 
                     where=outlier==-1, color='r', 
                     alpha=.4, label='outlier region')
    axs[i].legend()
    axs[i].set_title(column)

In [None]:
# Outlier detection using SD

In [None]:
def out_std(s, nstd=3.0, return_thresholds=False):
    """Flag values lying more than ``nstd`` standard deviations from the mean.

    Parameters
    ----------
    s : array-like with ``.mean()`` / ``.std()`` and element-wise comparison
        (e.g. a pandas Series). The values to screen.
    nstd : float, default 3.0
        Half-width of the accepted band, in standard deviations.
    return_thresholds : bool, default False
        If True, return the band limits instead of the outlier flags.

    Returns
    -------
    (lower, upper) when ``return_thresholds`` is True; otherwise a list of
    booleans, one per element of ``s``, True where the value is strictly
    below ``lower`` or strictly above ``upper``.
    """
    data_mean, data_std = s.mean(), s.std()
    cut_off = data_std * nstd
    lower, upper = data_mean - cut_off, data_mean + cut_off
    if return_thresholds:
        return lower, upper
    # Vectorised comparison; converted back to a plain list so callers keep
    # receiving the same type as before.
    return ((s < lower) | (s > upper)).tolist()

In [None]:
# outlier_mask is a boolean list identifies the indices of the outliers
outlier_mask = out_std(df['age'], nstd=3.0)
# first 10 elements

In [None]:
outliers = df['age'][outlier_mask]
outliers.head()

In [None]:
outliers.sort_values()

In [None]:
df.shape

In [None]:
df.drop(df.loc[df['age']>= 78].index, inplace=True)
df.shape

In [None]:
plt.figure(figsize=(8,6))
sns.distplot(df['age'], kde=False);
plt.vlines(df['age'][outlier_mask], ymin=0, ymax=110, linestyles='dashed');

In [None]:
# removing outliers in edu_years
# outlier_mask is a boolean list identifies the indices of the outliers
outlier_mask2 = out_std(df['edu_years'], nstd=3.0)
# first 10 elements

In [None]:
outliers_edu = df['edu_years'][outlier_mask2]
outliers_edu.shape

In [None]:
plt.figure(figsize=(8,6))
sns.distplot(df['edu_years'], kde=False);
plt.vlines(df['edu_years'][outlier_mask2], ymin=0, ymax=110, linestyles='dashed');

In [None]:
outliers_edu.sort_values()

In [None]:
df.loc[df.edu_years==2].shape

In [None]:
df.shape

In [None]:
df.drop(df.loc[df['edu_years']<= 2].index, inplace=True)
df.shape

In [None]:
# removing outliers in hrs
# outlier_mask is a boolean list identifies the indices of the outliers
outlier_mask3 = out_std(df['hours_per_week'], nstd=3.0)
# first 10 elements

In [None]:
outliers_hrs = df['hours_per_week'][outlier_mask3]
outliers_hrs.shape

In [None]:
plt.figure(figsize=(8,6))
sns.distplot(df['hours_per_week'], kde=False);
plt.vlines(df['hours_per_week'][outlier_mask3], ymin=0, ymax=110, linestyles='dashed');

In [None]:
outliers_hrs.value_counts()

In [None]:
df.drop(df.loc[df['hours_per_week']<= 5].index, inplace=True)
df.drop(df.loc[df['hours_per_week']>= 77].index, inplace=True)
df.shape

In [None]:
df.to_csv('data_cleaned_no_outliers.csv')

In [5]:
# NOTE(review): the saved output of this cell equals the shape printed right after
# loading the CSV, even though the cell sits after all cleaning steps; presumably
# it was executed out of order. Re-run the notebook top to bottom to confirm.
df.shape

(32561, 15)