# One Hot Encoding - variables with many categories

In [1]:
import pandas as pd
import numpy as np
# Show every column when a DataFrame is displayed (no column truncation).
pd.set_option('display.max_columns', None)
# Load only the five categorical columns used in this walkthrough.
df = pd.read_csv('taxis.csv', usecols=['color', 'payment', 'pickup_zone', 'dropoff_zone', 'dropoff_borough'])
# Preview the first rows.
df.head()

Unnamed: 0,color,payment,pickup_zone,dropoff_zone,dropoff_borough
0,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan
1,yellow,cash,Upper West Side South,Upper West Side South,Manhattan
2,yellow,credit card,Alphabet City,West Village,Manhattan
3,yellow,credit card,Hudson Sq,Yorkville West,Manhattan
4,yellow,credit card,Midtown East,Yorkville West,Manhattan


In [2]:
# Report how many distinct labels each column holds. dropna=False keeps NaN
# counted as a label of its own, exactly as len(Series.unique()) would.
for col in df.columns:
    n_labels = df[col].nunique(dropna=False)
    print(col, ': \t', n_labels, 'labels')

color : 	 2 labels
payment : 	 3 labels
pickup_zone : 	 195 labels
dropoff_zone : 	 204 labels
dropoff_borough : 	 6 labels


In [3]:
# Shape of the frame obtained by one-hot encoding every column
# (drop_first=True removes the first dummy level of each variable).
pd.get_dummies(df, drop_first=True).shape

(6433, 401)

#### From 5 initial categorical variables we ended up with 401 variables

In [4]:
# Ten most frequent pickup zones with their row counts, highest count first.
df.pickup_zone.value_counts().sort_values(ascending=False).head(10)

Midtown Center                  230
Upper East Side South           211
Penn Station/Madison Sq West    210
Clinton East                    208
Midtown East                    198
Upper East Side North           186
Times Sq/Theatre District       184
Union Sq                        180
Lincoln Square East             177
Murray Hill                     162
Name: pickup_zone, dtype: int64

In [5]:
# Collect the 10 most frequent pickup zones (by row count) as a plain list,
# most frequent first.
top_10 = list(df.pickup_zone.value_counts().sort_values(ascending=False).head(10).index)
top_10

['Midtown Center',
 'Upper East Side South',
 'Penn Station/Madison Sq West',
 'Clinton East',
 'Midtown East',
 'Upper East Side North',
 'Times Sq/Theatre District',
 'Union Sq',
 'Lincoln Square East',
 'Murray Hill']

In [6]:
# Add one 0/1 indicator column per top-10 pickup zone, named after the zone.
pickup = df['pickup_zone']
for zone in top_10:
    df[zone] = np.where(pickup == zone, 1, 0)

# Show the original column side by side with its indicator columns.
df[['pickup_zone', *top_10]].head(20)

Unnamed: 0,pickup_zone,Midtown Center,Upper East Side South,Penn Station/Madison Sq West,Clinton East,Midtown East,Upper East Side North,Times Sq/Theatre District,Union Sq,Lincoln Square East,Murray Hill
0,Lenox Hill West,0,0,0,0,0,0,0,0,0,0
1,Upper West Side South,0,0,0,0,0,0,0,0,0,0
2,Alphabet City,0,0,0,0,0,0,0,0,0,0
3,Hudson Sq,0,0,0,0,0,0,0,0,0,0
4,Midtown East,0,0,0,0,1,0,0,0,0,0
5,Times Sq/Theatre District,0,0,0,0,0,0,1,0,0,0
6,Battery Park City,0,0,0,0,0,0,0,0,0,0
7,Murray Hill,0,0,0,0,0,0,0,0,0,1
8,East Harlem South,0,0,0,0,0,0,0,0,0,0
9,Lincoln Square East,0,0,0,0,0,0,0,0,1,0


In [12]:
# get whole set of dummy variables, for all the categorical variables

def one_hot_top_10(df, feature, top_10_variable):
    """Add a 0/1 indicator column to ``df`` for each label in ``top_10_variable``.

    ``df`` is modified in place and nothing is returned. Each new column is
    named ``feature``, two spaces, then the label; it holds 1 on the rows
    where ``df[feature]`` equals that label and 0 everywhere else. The caller
    controls how many labels get encoded through the list that is passed in.
    """
    for category in top_10_variable:
        # Column name keeps the two-space separator between feature and label.
        column = '  '.join((feature, category))
        df[column] = np.where(df[feature] == category, 1, 0)



In [13]:
# Read the data again to start from a frame without the indicator columns
# added earlier. read_csv already returns a brand-new DataFrame, so no extra
# .copy() is needed; the order of names in usecols does not affect the
# resulting column order.
df = pd.read_csv('taxis.csv', usecols=['color', 'payment', 'pickup_zone', 'dropoff_zone', 'dropoff_borough'])

# Encode the most frequent pickup zones in place, then preview the result.
one_hot_top_10(df, 'pickup_zone', top_10)
df.head()

Unnamed: 0,color,payment,pickup_zone,dropoff_zone,dropoff_borough,pickup_zone Midtown Center,pickup_zone Upper East Side South,pickup_zone Penn Station/Madison Sq West,pickup_zone Clinton East,pickup_zone Midtown East,pickup_zone Upper East Side North,pickup_zone Times Sq/Theatre District,pickup_zone Union Sq,pickup_zone Lincoln Square East,pickup_zone Murray Hill
0,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,0,0,0,0,0,0,0,0,0,0
1,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,0,0,0,0,0,0,0,0,0,0
2,yellow,credit card,Alphabet City,West Village,Manhattan,0,0,0,0,0,0,0,0,0,0
3,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,0,0,0,0,0,0,0,0,0,0
4,yellow,credit card,Midtown East,Yorkville West,Manhattan,0,0,0,0,1,0,0,0,0,0


In [15]:
# Collect the 10 most frequent drop-off zones (by row count) as a plain list,
# most frequent first.
d_top_10 = list(df.dropoff_zone.value_counts().sort_values(ascending=False).head(10).index)
d_top_10

['Upper East Side North',
 'Murray Hill',
 'Midtown Center',
 'Upper East Side South',
 'Midtown East',
 'Times Sq/Theatre District',
 'Lincoln Square East',
 'Clinton East',
 'East Village',
 'Penn Station/Madison Sq West']

In [16]:
# Reload the raw columns so the frame carries no indicator columns from earlier cells.
df = pd.read_csv('taxis.csv', usecols=['color', 'payment', 'pickup_zone', 'dropoff_zone', 'dropoff_borough'])

# Drop-off zone: add 0/1 indicator columns for the labels listed in d_top_10.
one_hot_top_10(df,'dropoff_zone',d_top_10)
df.head()

Unnamed: 0,color,payment,pickup_zone,dropoff_zone,dropoff_borough,dropoff_zone Upper East Side North,dropoff_zone Murray Hill,dropoff_zone Midtown Center,dropoff_zone Upper East Side South,dropoff_zone Midtown East,dropoff_zone Times Sq/Theatre District,dropoff_zone Lincoln Square East,dropoff_zone Clinton East,dropoff_zone East Village,dropoff_zone Penn Station/Madison Sq West
0,yellow,credit card,Lenox Hill West,UN/Turtle Bay South,Manhattan,0,0,0,0,0,0,0,0,0,0
1,yellow,cash,Upper West Side South,Upper West Side South,Manhattan,0,0,0,0,0,0,0,0,0,0
2,yellow,credit card,Alphabet City,West Village,Manhattan,0,0,0,0,0,0,0,0,0,0
3,yellow,credit card,Hudson Sq,Yorkville West,Manhattan,0,0,0,0,0,0,0,0,0,0
4,yellow,credit card,Midtown East,Yorkville West,Manhattan,0,0,0,0,0,0,0,0,0,0


# advantages
1. Straightforward to implement.
2. Does not require hours of variable exploration.
3. Does not massively expand the feature space (number of columns in the dataset).

# disadvantages
1. Does not add any information that may make the variable more predictive.
2. Does not keep the info of the ignored labels