In [1]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings("ignore")

### One Hot Encoding

In [2]:
# Build the demo frame: one row per phone, with the column labels supplied
# directly to the constructor.

phone_df = pd.DataFrame(
    data=[
        ["Phone 1", "Blue", 2018, 500],
        ["Phone 2", "Black", 2016, 450],
        ["Phone 3", "Blue", 2017, 600],
        ["Phone 4", "Grey", 2018, 800],
    ],
    columns=["Phone", "Color", "Year", "Price"],
)

In [3]:
# Display the freshly created DataFrame (Phone, Color, Year, Price).
phone_df

Unnamed: 0,Phone,Color,Year,Price
0,Phone 1,Blue,2018,500
1,Phone 2,Black,2016,450
2,Phone 3,Blue,2017,600
3,Phone 4,Grey,2018,800


In [4]:
# LabelEncoder maps each distinct label of a column to an integer code.

from sklearn.preprocessing import LabelEncoder

# One encoder per categorical column, so each keeps its own fitted label set.
le_phone, le_color = LabelEncoder(), LabelEncoder()

In [5]:
# Label-encode the categorical features: each (source column, target column,
# encoder) triple adds one integer-coded column to phone_df.

for source_col, target_col, encoder in (
    ("Phone", "phone_enc", le_phone),
    ("Color", "color_enc", le_color),
):
    phone_df[target_col] = encoder.fit_transform(phone_df[source_col])

In [6]:
# Inspect the frame now that the phone_enc / color_enc columns have been added.
phone_df

Unnamed: 0,Phone,Color,Year,Price,phone_enc,color_enc
0,Phone 1,Blue,2018,500,0,1
1,Phone 2,Black,2016,450,1,0
2,Phone 3,Blue,2017,600,2,1
3,Phone 4,Grey,2018,800,3,2


In [7]:
from sklearn.preprocessing import OneHotEncoder

In [8]:
# One-hot encode the integer label codes, using a separate encoder per column.

phone_ohe, color_ohe = OneHotEncoder(), OneHotEncoder()

# The encoder is fed 2-D input, so each 1-D code column is turned into an
# (n_rows, 1) array first.
phone_codes_2d = phone_df["phone_enc"].values[:, np.newaxis]
color_codes_2d = phone_df["color_enc"].values[:, np.newaxis]

# .toarray() converts the encoder's output into a dense NumPy array.
phone_ohe_array = phone_ohe.fit_transform(phone_codes_2d).toarray()
color_ohe_array = color_ohe.fit_transform(color_codes_2d).toarray()

In [9]:
# Dense one-hot matrix built from the phone_enc codes.
phone_ohe_array

array([[1., 0., 0., 0.],
       [0., 1., 0., 0.],
       [0., 0., 1., 0.],
       [0., 0., 0., 1.]])

In [11]:
# Add the one-hot encoded phone features back to the original dataset.
#
# Columns are named by position in the one-hot matrix (Phone_0, Phone_1, ...).
phone_ohe_cols = ["Phone_" + str(i) for i in range(phone_ohe_array.shape[1])]

# Reuse phone_df's index so the axis=1 concat lines rows up one-to-one even if
# phone_df is not indexed 0..n-1.
phone_df_ohe = pd.DataFrame(phone_ohe_array, columns=phone_ohe_cols, index=phone_df.index)

# phone_df is rebuilt from itself, so first drop any Phone_* columns left over
# from a previous execution; otherwise re-running this cell duplicates them.
phone_df = pd.concat(
    [phone_df.drop(phone_ohe_cols, axis=1, errors="ignore"), phone_df_ohe],
    axis=1,
)

In [12]:
# Inspect the frame after the Phone_* one-hot columns have been appended.
phone_df

Unnamed: 0,Phone,Color,Year,Price,phone_enc,color_enc,Phone_0,Phone_1,Phone_2,Phone_3
0,Phone 1,Blue,2018,500,0,1,1.0,0.0,0.0,0.0
1,Phone 2,Black,2016,450,1,0,0.0,1.0,0.0,0.0
2,Phone 3,Blue,2017,600,2,1,0.0,0.0,1.0,0.0
3,Phone 4,Grey,2018,800,3,2,0.0,0.0,0.0,1.0


In [13]:
# Append the one-hot encoded colour features to phone_df.
#
# Columns are named by position in the one-hot matrix (Color_0, Color_1, ...).
color_ohe_cols = ["Color_" + str(i) for i in range(color_ohe_array.shape[1])]

# Reuse phone_df's index so the axis=1 concat lines rows up one-to-one even if
# phone_df is not indexed 0..n-1.
color_df_ohe = pd.DataFrame(color_ohe_array, columns=color_ohe_cols, index=phone_df.index)

# phone_df is rebuilt from itself, so first drop any Color_* columns left over
# from a previous execution; otherwise re-running this cell duplicates them.
phone_df = pd.concat(
    [phone_df.drop(color_ohe_cols, axis=1, errors="ignore"), color_df_ohe],
    axis=1,
)

phone_df

Unnamed: 0,Phone,Color,Year,Price,phone_enc,color_enc,Phone_0,Phone_1,Phone_2,Phone_3,Color_0,Color_1,Color_2
0,Phone 1,Blue,2018,500,0,1,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,Phone 2,Black,2016,450,1,0,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,Phone 3,Blue,2017,600,2,1,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,Phone 4,Grey,2018,800,3,2,0.0,0.0,0.0,1.0,0.0,0.0,1.0


In [14]:
# The raw categorical columns and their integer label codes are the columns to hide here.
redundant_cols = ["Phone", "Color", "phone_enc", "color_enc"]
# drop() returns a new frame; the result is only displayed, phone_df is unchanged.
phone_df.drop(labels=redundant_cols, axis=1)

Unnamed: 0,Year,Price,Phone_0,Phone_1,Phone_2,Phone_3,Color_0,Color_1,Color_2
0,2018,500,1.0,0.0,0.0,0.0,0.0,1.0,0.0
1,2016,450,0.0,1.0,0.0,0.0,1.0,0.0,0.0
2,2017,600,0.0,0.0,1.0,0.0,0.0,1.0,0.0
3,2018,800,0.0,0.0,0.0,1.0,0.0,0.0,1.0


### Get_Dummies

In [3]:
# Show only the four original columns. After a top-to-bottom run phone_df also
# carries the label-encoded and one-hot columns added earlier in the notebook,
# so selecting explicitly keeps this view independent of execution order.
phone_df[["Phone", "Color", "Year", "Price"]]

Unnamed: 0,Phone,Color,Year,Price
0,Phone 1,Blue,2018,500
1,Phone 2,Black,2016,450
2,Phone 3,Blue,2017,600
3,Phone 4,Grey,2018,800


In [4]:
# Feed get_dummies only the four raw columns. By this point in a top-to-bottom
# run phone_df also holds phone_enc / color_enc and the Phone_* / Color_*
# one-hot columns, which would otherwise be carried into the result.
phone_df_dummies = pd.get_dummies(
    phone_df[["Phone", "Color", "Year", "Price"]],
    columns=["Phone", "Color"],  # expand these; Year and Price pass through as-is
)
phone_df_dummies

Unnamed: 0,Year,Price,Phone_Phone 1,Phone_Phone 2,Phone_Phone 3,Phone_Phone 4,Color_Black,Color_Blue,Color_Grey
0,2018,500,1,0,0,0,0,1,0
1,2016,450,0,1,0,0,1,0,0
2,2017,600,0,0,1,0,0,1,0
3,2018,800,0,0,0,1,0,0,1


### Crosstab

In [5]:
# Load the adult dataset and preview its first ten rows.

ADULT_CSV_PATH = "adult.csv"  # relative path: resolved against the current working directory
adult_df = pd.read_csv(filepath_or_buffer=ADULT_CSV_PATH)
adult_df.head(n=10)

Unnamed: 0,age,workclass,education_level,education-num,marital-status,occupation,relationship,race,sex,capital-gain,capital-loss,hours-per-week,native-country,income
0,39,State-gov,Bachelors,13.0,Never-married,Adm-clerical,Not-in-family,White,Male,2174.0,0.0,40.0,United-States,<=50K
1,50,Self-emp-not-inc,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,13.0,United-States,<=50K
2,38,Private,HS-grad,9.0,Divorced,Handlers-cleaners,Not-in-family,White,Male,0.0,0.0,40.0,United-States,<=50K
3,53,Private,11th,7.0,Married-civ-spouse,Handlers-cleaners,Husband,Black,Male,0.0,0.0,40.0,United-States,<=50K
4,28,Private,Bachelors,13.0,Married-civ-spouse,Prof-specialty,Wife,Black,Female,0.0,0.0,40.0,Cuba,<=50K
5,37,Private,Masters,14.0,Married-civ-spouse,Exec-managerial,Wife,White,Female,0.0,0.0,40.0,United-States,<=50K
6,49,Private,9th,5.0,Married-spouse-absent,Other-service,Not-in-family,Black,Female,0.0,0.0,16.0,Jamaica,<=50K
7,52,Self-emp-not-inc,HS-grad,9.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,0.0,0.0,45.0,United-States,>50K
8,31,Private,Masters,14.0,Never-married,Prof-specialty,Not-in-family,White,Female,14084.0,0.0,50.0,United-States,>50K
9,42,Private,Bachelors,13.0,Married-civ-spouse,Exec-managerial,Husband,White,Male,5178.0,0.0,40.0,United-States,>50K


In [6]:
# Frequency table: number of rows for each (occupation, education_level) pair.
pd.crosstab(
    index=adult_df["occupation"],
    columns=adult_df["education_level"],
)

education_level,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
Adm-clerical,59,100,49,5,8,20,20,278,267,752,5,2028,102,3,11,1833
Armed-Forces,0,0,1,0,0,0,0,0,0,1,0,5,2,0,1,4
Craft-repair,232,266,89,28,68,166,140,166,370,323,4,2882,33,6,9,1238
Exec-managerial,42,50,18,5,6,27,22,237,232,1977,83,1182,762,1,63,1277
Farming-fishing,70,67,29,33,52,105,44,25,85,112,1,567,14,17,7,252
Handlers-cleaners,108,176,54,25,58,64,72,32,43,77,0,934,5,5,0,393
Machine-op-inspct,149,153,60,36,87,128,101,51,93,87,1,1515,12,12,0,485
Other-service,279,366,124,53,94,141,139,110,155,243,0,1892,34,21,7,1150
Priv-house-serv,8,18,8,14,19,17,16,2,5,11,1,86,0,2,0,25
Prof-specialty,13,34,12,4,2,11,4,203,245,2178,424,336,1260,1,651,630


In [7]:
# Same occupation x education_level counts, with an extra "Total" row and
# column holding the marginal sums.
pd.crosstab(
    index=adult_df["occupation"],
    columns=adult_df["education_level"],
    margins=True,
    margins_name="Total",
)

education_level,10th,11th,12th,1st-4th,5th-6th,7th-8th,9th,Assoc-acdm,Assoc-voc,Bachelors,Doctorate,HS-grad,Masters,Preschool,Prof-school,Some-college,Total
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1
Adm-clerical,59,100,49,5,8,20,20,278,267,752,5,2028,102,3,11,1833,5540
Armed-Forces,0,0,1,0,0,0,0,0,0,1,0,5,2,0,1,4,14
Craft-repair,232,266,89,28,68,166,140,166,370,323,4,2882,33,6,9,1238,6020
Exec-managerial,42,50,18,5,6,27,22,237,232,1977,83,1182,762,1,63,1277,5984
Farming-fishing,70,67,29,33,52,105,44,25,85,112,1,567,14,17,7,252,1480
Handlers-cleaners,108,176,54,25,58,64,72,32,43,77,0,934,5,5,0,393,2046
Machine-op-inspct,149,153,60,36,87,128,101,51,93,87,1,1515,12,12,0,485,2970
Other-service,279,366,124,53,94,141,139,110,155,243,0,1892,34,21,7,1150,4808
Priv-house-serv,8,18,8,14,19,17,16,2,5,11,1,86,0,2,0,25,232
Prof-specialty,13,34,12,4,2,11,4,203,245,2178,424,336,1260,1,651,630,6008


In [8]:
# Instead of counting rows, aggregate the age values: each cell is the mean
# age for that (occupation, sex) combination.
pd.crosstab(
    index=adult_df["occupation"],
    columns=adult_df["sex"],
    values=adult_df["age"],
    aggfunc="mean",
)

sex,Female,Male
occupation,Unnamed: 1_level_1,Unnamed: 2_level_1
Adm-clerical,36.995442,37.650829
Armed-Forces,,31.785714
Craft-repair,39.746032,38.922524
Exec-managerial,39.657773,43.238498
Farming-fishing,39.578947,41.446209
Handlers-cleaners,35.011858,32.262131
Machine-op-inspct,38.520807,37.401929
Other-service,35.602952,34.397045
Priv-house-serv,44.137615,33.928571
Prof-specialty,38.656378,41.603417


In [9]:
# Grouping with crosstab: passing a list of Series as the row keys yields a
# two-level row index (marital-status, then workclass), counted against sex.

row_keys = [adult_df["marital-status"], adult_df["workclass"]]
pd.crosstab(index=row_keys, columns=adult_df["sex"])

Unnamed: 0_level_0,sex,Female,Male
marital-status,workclass,Unnamed: 2_level_1,Unnamed: 3_level_1
Divorced,Federal-gov,141,94
Divorced,Local-gov,385,138
Divorced,Private,2846,1814
Divorced,Self-emp-inc,45,98
Divorced,Self-emp-not-inc,156,267
Divorced,State-gov,212,101
Married-AF-spouse,Federal-gov,2,1
Married-AF-spouse,Private,16,8
Married-AF-spouse,Self-emp-not-inc,1,2
Married-AF-spouse,State-gov,2,0
