## Lab 3: Data Preparation

In [1]:
import pandas as pd
import scipy as sp
import numpy as np

## 0. Data Loading

In [2]:
#load data and extract data
names = ['age', 'workclass', 'fnlwgt', 'edu', 'edu-num', 'maritalstatus', 'occupation', 'relationship', 'race', 'sex','capital-gain','capital-loss','hours-per-week','native-country','income']
df = pd.read_csv('adult.data', names=names)
print( "Total columns: ", len(df.columns))
#df.head()                 # display all columns
df[df.columns[:10]].head() # display the first 10 columns

Total columns:  15


Unnamed: 0,age,workclass,fnlwgt,edu,edu-num,maritalstatus,occupation,relationship,race,sex
0,39,State-gov,77516,Bachelors,13,Never-married,Adm-clerical,Not-in-family,White,Male
1,50,Self-emp-not-inc,83311,Bachelors,13,Married-civ-spouse,Exec-managerial,Husband,White,Male
2,38,Private,215646,HS-grad,9,Divorced,Handlers-cleaners,Not-in-family,White,Male
3,53,Private,234721,11th,7,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male
4,28,Private,338409,Bachelors,13,Married-civ-spouse,Prof-specialty,Wife,Black,Female


## 1. Standardization 

In [6]:
################################################
# Standardize data (0 mean, 1 stdev)
################################################
from sklearn.preprocessing import StandardScaler # Standard Scaler is a class in sklearn.preprocessing
# that helps standardize data. It has two methods: fit and transform. Fit computes the mean and std to be used,
# and transform applies the standardization to a given dataset.

# your code below
# create the scaler
scaler = StandardScaler() # scaler is an object of StandardScaler class

# standardize the 'fnlwgt' column
df['fnlwgt'] = scaler.fit_transform(df[['fnlwgt']]) # fit_transform is a method of StandardScaler class
# that computes the mean and std to be used, and applies the standardization to a given dataset.
# df is a dataframe, so df[['fnlwgt']] is a dataframe with only one column 'fnlwgt'

# print the first five rows to verify changes
df[df.columns[:5]].head() # the changes are in the 'fnlwgt' column is now standardized



Unnamed: 0,age,workclass,fnlwgt,edu,edu-num
0,1.0,State-gov,-1.063611,Bachelors,13
1,2.0,Self-emp-not-inc,-1.008707,Bachelors,13
2,1.0,Private,0.245079,HS-grad,9
3,2.0,Private,0.425801,11th,7
4,0.0,Private,1.408176,Bachelors,13


## 2. Discretization (uniform transformation)


In [11]:
#Import the class
from sklearn.preprocessing import KBinsDiscretizer
 
# your code below
# create the discretizer 
kbins = KBinsDiscretizer(n_bins=5, encode='ordinal', strategy='uniform') # kbins is an object of KBinsDiscretizer class,
# n_bins is the number of bins to produce, encode is the method used to encode the transformed result, 
# strategy is the strategy for the discretization

# now discretize the 'age' column
df['age'] = kbins.fit_transform(df[['age']]) # df is a dataframe, so df[['age']] is a dataframe with only one column 'age'
# kbins.fit_transform is a method of KBinsDiscretizer class that computes the mean and std to be used,
# and applies the discretization to a given dataset.


# print the first five rows to verify changes
df[df.columns[:5]].head() # the changes we see is that the 'age' column is now discretized into 5 bins,
# and the 'fnlwgt' column is now standardized





Unnamed: 0,age,workclass,fnlwgt,edu,edu-num
0,1.0,State-gov,-1.063611,Bachelors,13
1,2.0,Self-emp-not-inc,-1.008707,Bachelors,13
2,1.0,Private,0.245079,HS-grad,9
3,2.0,Private,0.425801,11th,7
4,0.0,Private,1.408176,Bachelors,13


## 3. One-hot encoding


In [9]:
from sklearn.preprocessing import OneHotEncoder
 
# your code below
# create the one-hot encoder
encoder = OneHotEncoder() # encoder is an object of OneHotEncoder class, 
# OneHotEncoder is a class in sklearn.preprocessing that helps one-hot encode data.
# It has two methods: fit and transform. Fit computes the mean and std to be used,
# and transform applies the one-hot encoding to a given dataset.

# one-hot encode the 'race' column
race_encoded = encoder.fit_transform(df[['race']]) # race_encoded is a sparse matrix,
# encoder is an object of OneHotEncoder class, fit_transform is a method of OneHotEncoder class that computes the mean and std to be used,
# and applies the one-hot encoding to a given dataset.  

# transform the encoded values to a dataframe
race_df = pd.DataFrame(race_encoded.toarray(), columns=encoder.get_feature_names_out(['race'])) # race_df is a dataframe,
# pd.DataFrame is a method of pandas library that converts a numpy array to a dataframe,
# race_encoded is a sparse matrix, toarray() is a method of sparse matrix that converts a sparse matrix to a numpy array,
# columns is a parameter of pd.DataFrame that specifies the column names of the dataframe,
# encoder is an object of OneHotEncoder class, get_feature_names_out is a method of OneHotEncoder class that returns the feature names of the encoded data.

# print the rows of the new dataframe
race_df.head() # since its head it will default to five




Unnamed: 0,race_ Amer-Indian-Eskimo,race_ Asian-Pac-Islander,race_ Black,race_ Other,race_ White
0,0.0,0.0,0.0,0.0,1.0
1,0.0,0.0,0.0,0.0,1.0
2,0.0,0.0,0.0,0.0,1.0
3,0.0,0.0,1.0,0.0,0.0
4,0.0,0.0,1.0,0.0,0.0
