In [23]:
import pandas as pd
import numpy as np
import string 

In [2]:
# Load only the columns this notebook works with.
path = '../../pandas-workout-data/data/nyc-parking-violations-2020.csv'
columns = [
    'Plate ID',
    'Registration State',
    'Vehicle Make',
    'Vehicle Color',
    'Street Name',
]
df = pd.read_csv(path, usecols=columns)

In [3]:
# Quick sanity check that the five requested columns were loaded.
df.head(2)

Unnamed: 0,Plate ID,Registration State,Vehicle Make,Street Name,Vehicle Color
0,J58JKX,NJ,HONDA,43 ST,BK
1,KRE6058,PA,ME/BE,UNION ST,BLK


Determine how many different vehicle colors (the ``Vehicle Color`` column) there are.

In [4]:
# value_counts() returns one entry per distinct non-null value, so the length
# of the result is the number of different colors.
unique_colors = df['Vehicle Color'].value_counts()
unique_colors # There are 1896 different colors recorded for parking tickets

Vehicle Color
WH       2344858
GY       2307704
BK       2066374
WHITE    1061234
BL        775124
          ...   
GR;            1
GYMVH          1
BK6            1
GYF            1
GRYW           1
Name: count, Length: 1896, dtype: int64

Look at the 30 most common colors, and identify colors that appear multiple times but are written differently. For example, the color ``WHITE`` is also written ``WT``, ``WT.``, and ``WHT``.

In [5]:
# value_counts() sorts by descending frequency, so head(30) is the 30 most common colors.
unique_colors.head(30)

Vehicle Color
WH       2344858
GY       2307704
BK       2066374
WHITE    1061234
BL        775124
RD        483298
BLACK     465110
GREY      306787
BROWN     292348
SILVE     191477
GR        182929
BLUE      178298
RED       161693
TN        120576
BR        102204
YW         98700
BLK        91539
OTHER      60245
GREEN      58765
GL         54851
GRY        46527
MR         42812
GRAY       40854
WHT        35433
YELLO      32792
WHI        29760
OR         28100
BK.        27830
WT         25583
WT.        24593
Name: count, dtype: int64

Prepare a Python dict in which the keys represent the various color-name inputs and the values represent the values you want them to have in the end. I suggest using longer names, such as WHITE, rather than shorter ones.

In [6]:
# Maps each variant spelling to the name it should be folded into.
# Entries are grouped by target color, one per line.
color_map = {
    # white
    'WH': 'WHITE',
    'WHT': 'WHITE',
    'WHI': 'WHITE',
    'WT': 'WHITE',
    'WT.': 'WHITE',
    # gray
    'GY': 'GRAY',
    'GR': 'GRAY',
    'GRY': 'GRAY',
    # black
    'BK': 'BLACK',
    'BLK': 'BLACK',
    'BK.': 'BLACK',
    # other colors
    'BL': 'BLUE',
    'RD': 'RED',
    'SILVE': 'SILVER',
    'TN': 'TAN',
    'BR': 'BROWN',
    # NOTE(review): the target is the truncated spelling 'YELLO', presumably so
    # that 'YW' merges with the 'YELLO' value already present in the data.
    'YW': 'YELLO',
    'OR': 'ORANGE',
}

Replace the existing (old) colors with your translations. How many colors are there now?

In [7]:
# Spellings listed in color_map are swapped for their mapped name; every
# other value is left as it was.
df['Vehicle Color'] = df['Vehicle Color'].replace(color_map)

In [8]:
# Recount after the replacement to see how far the mapping reduced the number of distinct colors.
unique_colors = df['Vehicle Color'].value_counts()
unique_colors # There are 1880 different colors recorded for parking tickets

Vehicle Color
WHITE    3521461
BLACK    2650853
GRAY     2578014
BLUE      953422
RED       644991
          ...   
GRYW           1
LB             1
WHI`           1
SO VE          1
BL/BK          1
Name: count, Length: 1880, dtype: int64

In [9]:
# Top 30 again, now with the mapped spellings merged into their target names.
unique_colors.head(30)

Vehicle Color
WHITE     3521461
BLACK     2650853
GRAY      2578014
BLUE       953422
RED        644991
BROWN      394552
GREY       306787
SILVER     191477
TAN        141667
YELLO      131492
OTHER       60245
GREEN       58765
GL          54851
MR          42812
ORANGE      28100
GY.         22460
GOLD        21687
SIL         20116
BLU         15240
SL.         13145
LTGY        13055
ORANG       11506
SL          10343
LTG         10093
BL.          9649
LT/          8976
PR           7518
DK/          7498
W            7367
RD.          7128
Name: count, dtype: int64

Look through the top 50 colors now that you have removed a bunch of them. Are there any you could still clean up? Are there any you cannot figure out? Can you identify some consistent typos and errors in the colors?

In [12]:
# Widen the view to the top 50 to spot remaining variants (e.g. trailing dots such as 'GY.' and 'BL.').
unique_colors.head(50)

Vehicle Color
WHITE     3521461
BLACK     2650853
GRAY      2578014
BLUE       953422
RED        644991
BROWN      394552
GREY       306787
SILVER     191477
TAN        141667
YELLO      131492
OTHER       60245
GREEN       58765
GL          54851
MR          42812
ORANGE      28100
GY.         22460
GOLD        21687
SIL         20116
BLU         15240
SL.         13145
LTGY        13055
ORANG       11506
SL          10343
LTG         10093
BL.          9649
LT/          8976
PR           7518
DK/          7498
W            7367
RD.          7128
DKGY         6004
GYGY         5039
BLK.         4853
GRN          4829
B            4145
WH.          3811
BRO          3802
DKG          3702
PURPL        3635
BRN          3582
BKGY         3504
WHBL         3489
DKBL         2912
GN           2883
WHT.         2796
BN           2787
BLUE.        2638
WHGY         2381
UNKNO        2205
RED.         2141
Name: count, dtype: int64

# Beyond the exercise

Run value_counts on the Vehicle Make column, and look at some of the vehicle names. (There are more than 5,200 distinct makes, which almost certainly indicates that there is a lot of inconsistency in this data.) What problems do you see? Write a function that, given a value, cleans it up -- putting the name in all caps, removing punctuation, and standardizing whatever names you can, and then use the apply method to fix up the column. How many distinct vehicle makes are there when you're done?

In [22]:
# Counts per make BEFORE any cleaning; kept so the distinct-make total can be
# compared with the cleaned column later on.
unique_vehicle_make = df['Vehicle Make'].value_counts()
unique_vehicle_make

Vehicle Make
TOYOT    1395273
HONDA    1343265
FORD     1328063
NISSA    1119587
CHEVR     711464
          ...   
CORSE          1
KANI           1
KAWAL          1
ME/IN          1
JIHDO          1
Name: count, Length: 5210, dtype: int64

In [24]:
# The author could have used regular expressions, but decided to make it a bit easier to follow

In [25]:
def clean_name(one_string):
    """Return ``one_string`` upper-cased with every character other than A-Z removed.

    Anything that is not a ``str`` (for example a NaN standing in for a
    missing value) is returned unchanged.
    """
    if isinstance(one_string, str):
        # Whitespace is not in A-Z, so this filter drops leading and trailing
        # blanks together with punctuation, digits and inner spaces.
        return ''.join(
            letter
            for letter in one_string.upper()
            if letter in string.ascii_uppercase
        )

    return one_string

In [26]:
# Clean every make element-wise; non-string entries come back from clean_name unchanged.
df['Vehicle Make'] = df['Vehicle Make'].map(clean_name)

In [28]:
# Number of distinct makes eliminated by cleaning:
# distinct count before cleaning minus distinct count after.
unique_vehicle_make.size - df['Vehicle Make'].nunique()

295

How standardized are the street names in the system? What changes could you apply to improve things?

In [31]:
# Distinct street-name spellings and how often each occurs.
df['Street Name'].value_counts()

Street Name
Broadway        180225
3rd Ave         133003
5th Ave          78211
2nd Ave          75533
Madison Ave      75419
                 ...  
MICKEL AVE           1
SHORE ROAD 3         1
FAYETTE              1
EAST 151             1
SHIEL AVE            1
Name: count, Length: 57757, dtype: int64

In [34]:
# Let's do some experiments to see how standardized things are.
# For example, it sometimes says E 110th St and sometimes says E 110 ST.

# Start by counting the rows whose street name is missing.
null_values = df['Street Name'].isnull().sum()
null_values

np.int64(1417)

In [35]:
# Drop the missing street names so the .str methods used below operate on strings only.
s = df['Street Name'].dropna()
s

0                      43 ST
1                   UNION ST
2            CLERMONT AVENUE
3               DIVISION AVE
4                   GRAND ST
                  ...       
12495729             3RD AVE
12495730      PELHAM PARK DR
12495731           LYDIG AVE
12495732         E 68 STREET
12495733    W/S/O 182 STREET
Name: Street Name, Length: 12494317, dtype: object

In [39]:
# contains() yields booleans; summing them counts the street names that include '110'.
s.str.contains('110').sum()

np.int64(9767)

In [40]:
# Keep only the street names in which '110' appears.
s.loc[s.str.contains('110')]

148               E 110 ST
26151       E 110TH STREET
33597             E 110 ST
40927              110 AVE
41200             W 110 ST
                 ...      
12487914         110th Ave
12489163         110th Ave
12489989        E 110th St
12490718          110th St
12492593        W 110th St
Name: Street Name, Length: 9767, dtype: object

In [41]:
# Tally each distinct spelling. This is a substring match, so house numbers
# such as 1100 or 1101 are caught as well as 110th St / 110 Ave.
s[s.str.contains('110')].value_counts()

Street Name
W 110th St              2970
110th St                2388
E 110th St              2048
WB 110TH AVE/BRINKER     922
110th Ave                704
                        ... 
I/O 110TH STREET           1
S/O 1100 BARBEY ST         1
S/W C/O 110 ST             1
OPP 1101 2ND AVENUE        1
E 110  ST                  1
Name: count, Length: 73, dtype: int64

In [None]:
# Sometimes it says BWAY and sometimes BROADWAY ...

# So to clean things up, we would need to standardize whether we use st/nd/rd/th, and if/when
# we abbreviate street names, and HOW we do that. Also, there is a separate column for the
# cross street, so it shouldn't be in the "Street Name" column.  A mess!  (Or an opportunity...)

# contains() treats its pattern as a regular expression by default, so one
# alternation matches either spelling.
s.str.contains('BWAY|BROADWAY')

0           False
1           False
2           False
3           False
4           False
            ...  
12495729    False
12495730    False
12495731    False
12495732    False
12495733    False
Name: Street Name, Length: 12494317, dtype: bool

In [43]:
# Peek at the first row of the frame after the color and make clean-up above.
df.head(1)

Unnamed: 0,Plate ID,Registration State,Vehicle Make,Street Name,Vehicle Color
0,J58JKX,NJ,HONDA,43 ST,BLACK


In [44]:
# Street names that mention Broadway under either spelling (regex alternation).
s[s.str.contains('BWAY|BROADWAY')]

59          BROADWAY
122         BROADWAY
182         BROADWAY
2625        BROADWAY
2799        BROADWAY
              ...   
12495299    BROADWAY
12495377        BWAY
12495426    BROADWAY
12495635    BROADWAY
12495675    BROADWAY
Name: Street Name, Length: 67924, dtype: object

In [53]:
# Is the label 59 (the first Broadway match listed above) present in the frame's index?
59 in df.index

True

In [54]:
# NOTE(review): iloc selects by position, not by label. It returns the row
# labelled 59 here (see "Name: 59" in the output), presumably because the frame
# still has its default 0..n-1 index; df.loc[59] would be the label-based lookup.
df.iloc[59]

Plate ID              T782902C
Registration State          NY
Vehicle Make             CHEVR
Street Name           BROADWAY
Vehicle Color            BLACK
Name: 59, dtype: object

In [55]:
# Frequency of every distinct street name that mentions Broadway under either spelling.
s[s.str.contains('BWAY|BROADWAY')].value_counts()

Street Name
SB BROADWAY @ 252ND     21939
NB BROADWAY @ W 228T    13367
BROADWAY                10771
SB BROADWAY @ W 196T     6623
NB BROADWAY @ W 120T     5691
                        ...  
S/O BROADWAY                1
S/O 6601 BROADWAY           1
R/O 1785 BROADWAY           1
S/O 5825 BROADWAY           1
F/O 5141 BROADWAY           1
Name: count, Length: 181, dtype: int64

Would you need to clean up the Registration State column? Why or why not?

In [56]:

# We have 68 "states," which includes Canadian provinces and some other countries.
# So this seems pretty reasonable, although perhaps some additional cleanup is needed.
df['Registration State'].value_counts()

Registration State
NY    9753643
NJ    1096110
PA     338779
FL     174056
CT     165205
       ...   
PE         18
SK          8
MX          7
NT          3
YT          2
Name: count, Length: 68, dtype: int64