


### Table of Contents 
1. [Standard Imports](#import)
2. [Get the Data](#data)
3. [Explore the Data](#explore)
4. [Prepare the Data](#prepare)
5. [Model Selection & Evaluation](#model_selection)
6. [Classification Metrics](#metrics)
7. [Final Model Evaluation](#final_model)
 

## Standard Imports<a name="import"></a>


In [None]:
# standard imports
import pandas as pd
import numpy as np



## Get the Data<a name="data"></a>


In [None]:
# Load the census data; every "?" entry is read in as a missing value (NA/NaN).
census_income = pd.read_csv(
    'census_income.csv',
    na_values=['?'],
)
# Preview the first rows of the loaded data.
census_income.head()

In [None]:
# Print a summary of the dataset (columns, dtypes, non-null counts);
# the non-null counts show which columns contain missing values.
census_income.info()

In [None]:

# Target variable: the income column of the dataset.
income_label = census_income['income']



In [None]:

# Everything except the target column forms the feature set.
income_features = census_income.drop(columns='income')
# Preview the first five rows of the features.
income_features.head(5)

In [None]:

from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split
# reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    income_features,
    income_label,
    test_size=0.2,
    random_state=42,
)


## Explore the Data<a name="explore"></a>


## Prepare the Data<a name="prepare"></a>


### Custom Transformer
  

The next cell defines a custom transformer that takes the numerical columns and performs the following transformations:

It accepts an input parameter called `create_new_column`, with a default value of `True`. When the value is `True`, the transformer performs the two data preparation steps below; when it is passed `False`, it skips these steps and returns the data as is.
   - Adds an attribute to the end of the numerical data (i.e. a new last column) that is the result of the `days_per_week` column multiplied by the `hours_per_day` column. We create this column to better compare the number of hours worked between individuals.
   - Deletes the `days_per_week` and `hours_per_day` columns, since they are no longer needed once the new column exists.


In [None]:

# Numerical columns of the training set, listed in the positional order
# that the index constants below refer to.
num_features = X_train[[
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'days_per_week',
    'hours_per_day',
]]

from sklearn.base import BaseEstimator, TransformerMixin

# Positional index (0-6) of each numerical column, in the order listed above.
(
    age_1,
    fnlwgt_1,
    education_num_1,
    capital_gain_1,
    capital_loss_1,
    days_per_week_1,
    hours_per_day_1,
) = range(7)

class CensusIncomeTransformer(BaseEstimator, TransformerMixin):
    """Optionally merge the two work-time columns of the numerical data.

    Columns are addressed by position through the module-level index
    constants (``age_1`` ... ``hours_per_day_1``), so the input columns must
    be laid out in that order.

    Parameters
    ----------
    create_new_column : bool, default=True
        When True, ``transform`` appends the product of the ``days_per_week``
        and ``hours_per_day`` columns as a new last column and leaves those
        two source columns out of the result. When False, the seven indexed
        columns are returned without modification.
    """

    def __init__(self, create_new_column=True):
        self.create_new_column = create_new_column

    def fit(self, X, y=None):
        """Return ``self``; nothing is learned from the data."""
        return self

    def transform(self, X):
        """Return the numerical columns, optionally with the combined column.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_columns)
            Numerical data whose columns follow the order of the index
            constants. NumPy arrays and pandas DataFrames are both accepted.

        Returns
        -------
        numpy.ndarray
            The five base columns followed by either the single product
            column (``create_new_column=True``) or the original
            ``days_per_week`` and ``hours_per_day`` columns
            (``create_new_column=False``).
        """
        # The positional slicing below is ndarray syntax and raises on a
        # DataFrame, so coerce first. An ndarray input is returned as-is by
        # np.asarray, which keeps the in-pipeline behaviour unchanged.
        X = np.asarray(X)

        base_columns = X[
            :,
            [age_1, fnlwgt_1, education_num_1, capital_gain_1, capital_loss_1],
        ]

        if self.create_new_column:
            combined_column = X[:, days_per_week_1] * X[:, hours_per_day_1]
            return np.c_[base_columns, combined_column]

        return np.c_[base_columns, X[:, [days_per_week_1, hours_per_day_1]]]




### Pipelines



In [None]:

from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Numerical pipeline, applied in this order:
#   1. fill missing values with the column mean,
#   2. run the custom census transformer (default settings),
#   3. standardize the resulting columns.
num_pipeline = make_pipeline(
    SimpleImputer(strategy='mean'),
    CensusIncomeTransformer(),
    StandardScaler(),
)

In [None]:

# Categorical pipeline: fill missing values with the most frequent category,
# then one-hot encode, dropping the first level of each feature.
# handle_unknown='ignore' stops transform() from raising when it meets a
# category that was absent during fit (e.g. a rare value that only landed in
# the test split); such values are encoded as all zeros instead.
cat_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    OneHotEncoder(drop='first', sparse_output=False, handle_unknown='ignore'),
)

### Column Transformer



In [None]:

from sklearn.compose import ColumnTransformer

# Columns routed to the numerical pipeline.
num_attributes = [
    'age',
    'fnlwgt',
    'education_num',
    'capital_gain',
    'capital_loss',
    'days_per_week',
    'hours_per_day',
]
# Columns routed to the categorical pipeline.
cat_attributes = [
    'workclass',
    'education',
    'marital_status',
    'occupation',
    'relationship',
    'race',
    'sex',
    'native_country',
]

# Apply each pipeline to its own columns; any column not listed above is
# passed through untouched.
preprocessing = ColumnTransformer(
    transformers=[
        ('num_pipeline', num_pipeline, num_attributes),
        ('cat_pipeline', cat_pipeline, cat_attributes),
    ],
    remainder='passthrough',
)

# Fit the preprocessing on the training features and transform them.
X_train_prepared = preprocessing.fit_transform(X_train)
X_train_prepared

## Model Selection<a name="model_selection"></a>
We train and compare three classifiers: a Logistic Regression model, a Stochastic Gradient Descent classifier, and a Random Forest classifier.

In [None]:
### Logistic Regression Classifier ###

from sklearn.linear_model import LogisticRegression

# Build the classifier with a fixed seed; the iteration cap is raised to
# 1000 so the solver has room to converge.
log_clf = LogisticRegression(max_iter=1000, random_state=42)

# Train on the prepared training data.
log_clf.fit(X=X_train_prepared, y=y_train)

In [None]:
from sklearn.model_selection import cross_val_score

# Accuracy of the logistic regression model on each of 3 cross-validation folds.
cross_val_score(
    estimator=log_clf,
    X=X_train_prepared,
    y=y_train,
    cv=3,
    scoring="accuracy",
)

In [None]:
### Stochastic Gradient Descent Classifier ###

from sklearn.linear_model import SGDClassifier

# Build the SGD classifier with a fixed seed and otherwise default settings.
sgd_clf = SGDClassifier(random_state=42)

# Train on the prepared training data.
sgd_clf.fit(X=X_train_prepared, y=y_train)

In [None]:
# Accuracy of the SGD classifier on each of 3 cross-validation folds.
cross_val_score(
    estimator=sgd_clf,
    X=X_train_prepared,
    y=y_train,
    cv=3,
    scoring="accuracy",
)

In [None]:
### Random Forest Classifier ###

from sklearn.ensemble import RandomForestClassifier

# Build the random forest with default hyperparameters and a fixed seed.
# A grid search for better hyperparameters would normally come next, but it
# is skipped in this assignment.
rnd_for_clf = RandomForestClassifier(random_state=0)

# Train on the prepared training data.
rnd_for_clf.fit(X=X_train_prepared, y=y_train)

In [None]:
# Accuracy of the random forest on each of 3 cross-validation folds.
cross_val_score(
    estimator=rnd_for_clf,
    X=X_train_prepared,
    y=y_train,
    cv=3,
    scoring="accuracy",
)

## Classification Metrics<a name="metrics"></a>



In [None]:

from sklearn.model_selection import cross_val_predict

# Out-of-fold predictions of the logistic regression model (3 folds),
# used for the classification metrics below.
y_train_pred = cross_val_predict(
    estimator=log_clf,
    X=X_train_prepared,
    y=y_train,
    cv=3,
)
y_train_pred

In [None]:

from sklearn.metrics import precision_score,recall_score,f1_score

# Precision of the cross-validated predictions for the ">50K" class,
# rounded to two decimals.
precision = round(
    precision_score(y_true=y_train, y_pred=y_train_pred, pos_label=">50K"),
    2,
)
precision

In [None]:


# Recall of the cross-validated predictions for the ">50K" class,
# rounded to two decimals.
recall = round(
    recall_score(y_true=y_train, y_pred=y_train_pred, pos_label=">50K"),
    2,
)
recall

In [None]:

# F1 score of the cross-validated predictions for the ">50K" class,
# rounded to two decimals.
f1 = round(
    f1_score(y_true=y_train, y_pred=y_train_pred, pos_label=">50K"),
    2,
)
f1

## Final Model Evaluation<a name="final_model"></a>



In [None]:
### Final evaluation of the logistic regression model on the test set ###
from sklearn.metrics import accuracy_score

# Reuse the preprocessing fitted on the training data (transform only, no refit).
X_test_prepared = preprocessing.transform(X_test)

# Predict the held-out labels and score them, rounded to two decimals.
final_predictions = log_clf.predict(X_test_prepared)
final_accuracy = round(
    accuracy_score(y_true=y_test, y_pred=final_predictions),
    2,
)

final_accuracy

