In [1]:
import pandas as pd
import numpy as np

In [2]:
path_file = '../../pandas-workout-data/data/nyc-parking-violations-2020.csv'
columns = ['Date First Observed',
            'Registration State', 'Plate ID',
            'Issue Date', 'Vehicle Make',
            'Street Name', 'Vehicle Color']

In [3]:
df = pd.read_csv(filepath_or_buffer=path_file, usecols=columns)

In [4]:
df.index

RangeIndex(start=0, stop=12495734, step=1)

### Set the data frame’s index to the Issue Date column.

In [5]:
df = df.set_index('Issue Date') # Notice that set_index returns a new data frame based on the original one, which we assign back to df.

In [6]:
df.index

Index(['05/08/1972 12:00:00 AM', '08/29/1977 12:00:00 AM',
       '10/03/1988 12:00:00 AM', '01/03/1990 12:00:00 AM',
       '02/14/1990 12:00:00 AM', '07/21/1990 12:00:00 AM',
       '09/19/1990 12:00:00 AM', '10/14/1990 12:00:00 AM',
       '07/25/1991 12:00:00 AM', '01/01/2000 12:00:00 AM',
       ...
       '10/01/2030 12:00:00 AM', '10/29/2030 12:00:00 AM',
       '03/03/2031 12:00:00 AM', '12/30/2031 12:00:00 AM',
       '06/25/2033 12:00:00 AM', '01/03/2040 12:00:00 AM',
       '04/19/2045 12:00:00 AM', '01/17/2049 12:00:00 AM',
       '12/19/2063 12:00:00 AM', '06/04/2064 12:00:00 AM'],
      dtype='object', name='Issue Date', length=12495734)

### Determine what vehicle makes were most frequently ticketed on January 2, 2020.

In [7]:
df.loc['01/02/2020 12:00:00 AM', 'Vehicle Make']

Issue Date
01/02/2020 12:00:00 AM    MAZDA
01/02/2020 12:00:00 AM    TOYOT
01/02/2020 12:00:00 AM    NISSA
01/02/2020 12:00:00 AM     FORD
01/02/2020 12:00:00 AM      HIN
                          ...  
01/02/2020 12:00:00 AM    JAGUA
01/02/2020 12:00:00 AM    HONDA
01/02/2020 12:00:00 AM    BUICK
01/02/2020 12:00:00 AM    ME/BE
01/02/2020 12:00:00 AM    FRUEH
Name: Vehicle Make, Length: 31520, dtype: object

In [8]:
df.loc['01/02/2020 12:00:00 AM', 'Vehicle Make'].value_counts().head(3)

Vehicle Make
TOYOT    3829
HONDA    3593
FORD     3164
Name: count, dtype: int64

### Determine the five streets on which cars got the most tickets on June 1, 2020.

In [9]:
df.loc['01/02/2020 12:00:00 AM', 'Street Name'].value_counts().head(5)

Street Name
EB HORACE HARDING EX    345
WB BRUCKNER BLVD @ B    193
WB ATLANTIC AVE @ CL    175
EB E 233RD ST @ KATO    173
SB MAIN ST @ 82ND DR    171
Name: count, dtype: int64

### Set the index to Vehicle Color.

In [10]:
df

Unnamed: 0_level_0,Plate ID,Registration State,Vehicle Make,Street Name,Date First Observed,Vehicle Color
Issue Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
05/08/1972 12:00:00 AM,J58JKX,NJ,HONDA,43 ST,0,BK
08/29/1977 12:00:00 AM,KRE6058,PA,ME/BE,UNION ST,0,BLK
10/03/1988 12:00:00 AM,444326R,NJ,LEXUS,CLERMONT AVENUE,0,BLACK
01/03/1990 12:00:00 AM,F728330,OH,CHEVR,DIVISION AVE,0,
02/14/1990 12:00:00 AM,FMY9090,NY,JEEP,GRAND ST,0,GREY
...,...,...,...,...,...,...
01/03/2040 12:00:00 AM,62161MM,NY,FORD,3RD AVE,0,BR
04/19/2045 12:00:00 AM,GYE7330,NY,HONDA,PELHAM PARK DR,0,BLK
01/17/2049 12:00:00 AM,HNY4802,NY,FORD,LYDIG AVE,0,GY
12/19/2063 12:00:00 AM,T687081C,NY,TOYOT,E 68 STREET,0,BLK


In [11]:
#df['Issue Date'] = df.index
#df = df.set_index('Vehicle Color')

In [12]:
df = df.reset_index() 
df = df.set_index('Vehicle Color')

In [13]:
df

Unnamed: 0_level_0,Issue Date,Plate ID,Registration State,Vehicle Make,Street Name,Date First Observed
Vehicle Color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BK,05/08/1972 12:00:00 AM,J58JKX,NJ,HONDA,43 ST,0
BLK,08/29/1977 12:00:00 AM,KRE6058,PA,ME/BE,UNION ST,0
BLACK,10/03/1988 12:00:00 AM,444326R,NJ,LEXUS,CLERMONT AVENUE,0
,01/03/1990 12:00:00 AM,F728330,OH,CHEVR,DIVISION AVE,0
GREY,02/14/1990 12:00:00 AM,FMY9090,NY,JEEP,GRAND ST,0
...,...,...,...,...,...,...
BR,01/03/2040 12:00:00 AM,62161MM,NY,FORD,3RD AVE,0
BLK,04/19/2045 12:00:00 AM,GYE7330,NY,HONDA,PELHAM PARK DR,0
GY,01/17/2049 12:00:00 AM,HNY4802,NY,FORD,LYDIG AVE,0
BLK,12/19/2063 12:00:00 AM,T687081C,NY,TOYOT,E 68 STREET,0


In [14]:
df

Unnamed: 0_level_0,Issue Date,Plate ID,Registration State,Vehicle Make,Street Name,Date First Observed
Vehicle Color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BK,05/08/1972 12:00:00 AM,J58JKX,NJ,HONDA,43 ST,0
BLK,08/29/1977 12:00:00 AM,KRE6058,PA,ME/BE,UNION ST,0
BLACK,10/03/1988 12:00:00 AM,444326R,NJ,LEXUS,CLERMONT AVENUE,0
,01/03/1990 12:00:00 AM,F728330,OH,CHEVR,DIVISION AVE,0
GREY,02/14/1990 12:00:00 AM,FMY9090,NY,JEEP,GRAND ST,0
...,...,...,...,...,...,...
BR,01/03/2040 12:00:00 AM,62161MM,NY,FORD,3RD AVE,0
BLK,04/19/2045 12:00:00 AM,GYE7330,NY,HONDA,PELHAM PARK DR,0
GY,01/17/2049 12:00:00 AM,HNY4802,NY,FORD,LYDIG AVE,0
BLK,12/19/2063 12:00:00 AM,T687081C,NY,TOYOT,E 68 STREET,0


### Determine the most common make of vehicles that were either red or blue.

In [15]:
with pd.option_context('display.max_rows', 2000):
    print(df.index.value_counts())

Vehicle Color
WH       2344858
GY       2307704
BK       2066374
WHITE    1061234
BL        775124
RD        483298
BLACK     465110
GREY      306787
BROWN     292348
SILVE     191477
GR        182929
BLUE      178298
RED       161693
TN        120576
BR        102204
YW         98700
BLK        91539
OTHER      60245
GREEN      58765
GL         54851
GRY        46527
MR         42812
GRAY       40854
WHT        35433
YELLO      32792
WHI        29760
OR         28100
BK.        27830
WT         25583
WT.        24593
GY.        22460
GOLD       21687
TAN        21091
SIL        20116
BLU        15240
SL.        13145
LTGY       13055
ORANG      11506
SL         10343
LTG        10093
BL.         9649
LT/         8976
PR          7518
DK/         7498
W           7367
RD.         7128
DKGY        6004
GYGY        5039
BLK.        4853
GRN         4829
B           4145
WH.         3811
BRO         3802
DKG         3702
PURPL       3635
BRN         3582
BKGY        3504
WHBL        3489


In [16]:
df.index.isin(['BLUE', 'RED']) # Mask

array([False, False, False, ..., False, False, False], shape=(12495734,))

In [17]:
df.loc[df.index.isin(['BLUE', 'RED']), 'Vehicle Make']

Vehicle Color
RED     CMCKU
BLUE    CADIL
RED     DODGE
RED      JEEP
BLUE      MCI
        ...  
BLUE      BMW
BLUE    CHRYS
RED     MITSU
RED       NaN
BLUE    TINDO
Name: Vehicle Make, Length: 339991, dtype: object

In [18]:
df.loc[df.index.isin(['BLUE']), 'Vehicle Make'].value_counts().head(1)

Vehicle Make
HONDA    22627
Name: count, dtype: int64

In [19]:
df.loc[df.index.isin(['RED']), 'Vehicle Make'].value_counts().head(1)

Vehicle Make
HONDA    16726
Name: count, dtype: int64

The correct answer is the below one, because the question says: Determine the most common make of vehicles that were ``either red or blue.``

In [20]:
df.loc[['RED', 'BLUE'], 'Vehicle Make'].value_counts().head(1)

Vehicle Make
HONDA    39353
Name: count, dtype: int64

### Beyond the exercise

What three car makes were most often ticketed from January 2 through January 10?

In [21]:
df

Unnamed: 0_level_0,Issue Date,Plate ID,Registration State,Vehicle Make,Street Name,Date First Observed
Vehicle Color,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
BK,05/08/1972 12:00:00 AM,J58JKX,NJ,HONDA,43 ST,0
BLK,08/29/1977 12:00:00 AM,KRE6058,PA,ME/BE,UNION ST,0
BLACK,10/03/1988 12:00:00 AM,444326R,NJ,LEXUS,CLERMONT AVENUE,0
,01/03/1990 12:00:00 AM,F728330,OH,CHEVR,DIVISION AVE,0
GREY,02/14/1990 12:00:00 AM,FMY9090,NY,JEEP,GRAND ST,0
...,...,...,...,...,...,...
BR,01/03/2040 12:00:00 AM,62161MM,NY,FORD,3RD AVE,0
BLK,04/19/2045 12:00:00 AM,GYE7330,NY,HONDA,PELHAM PARK DR,0
GY,01/17/2049 12:00:00 AM,HNY4802,NY,FORD,LYDIG AVE,0
BLK,12/19/2063 12:00:00 AM,T687081C,NY,TOYOT,E 68 STREET,0


In [22]:
df = df.reset_index()
df

Unnamed: 0,Vehicle Color,Issue Date,Plate ID,Registration State,Vehicle Make,Street Name,Date First Observed
0,BK,05/08/1972 12:00:00 AM,J58JKX,NJ,HONDA,43 ST,0
1,BLK,08/29/1977 12:00:00 AM,KRE6058,PA,ME/BE,UNION ST,0
2,BLACK,10/03/1988 12:00:00 AM,444326R,NJ,LEXUS,CLERMONT AVENUE,0
3,,01/03/1990 12:00:00 AM,F728330,OH,CHEVR,DIVISION AVE,0
4,GREY,02/14/1990 12:00:00 AM,FMY9090,NY,JEEP,GRAND ST,0
...,...,...,...,...,...,...,...
12495729,BR,01/03/2040 12:00:00 AM,62161MM,NY,FORD,3RD AVE,0
12495730,BLK,04/19/2045 12:00:00 AM,GYE7330,NY,HONDA,PELHAM PARK DR,0
12495731,GY,01/17/2049 12:00:00 AM,HNY4802,NY,FORD,LYDIG AVE,0
12495732,BLK,12/19/2063 12:00:00 AM,T687081C,NY,TOYOT,E 68 STREET,0


In [23]:
df = df.set_index('Issue Date')

In [24]:
df.index

Index(['05/08/1972 12:00:00 AM', '08/29/1977 12:00:00 AM',
       '10/03/1988 12:00:00 AM', '01/03/1990 12:00:00 AM',
       '02/14/1990 12:00:00 AM', '07/21/1990 12:00:00 AM',
       '09/19/1990 12:00:00 AM', '10/14/1990 12:00:00 AM',
       '07/25/1991 12:00:00 AM', '01/01/2000 12:00:00 AM',
       ...
       '10/01/2030 12:00:00 AM', '10/29/2030 12:00:00 AM',
       '03/03/2031 12:00:00 AM', '12/30/2031 12:00:00 AM',
       '06/25/2033 12:00:00 AM', '01/03/2040 12:00:00 AM',
       '04/19/2045 12:00:00 AM', '01/17/2049 12:00:00 AM',
       '12/19/2063 12:00:00 AM', '06/04/2064 12:00:00 AM'],
      dtype='object', name='Issue Date', length=12495734)

In [25]:
df = df.sort_index()
df

Unnamed: 0_level_0,Vehicle Color,Plate ID,Registration State,Vehicle Make,Street Name,Date First Observed
Issue Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01/01/2000 12:00:00 AM,RED,6542FR,99,CMCKU,E 54 ST,0
01/01/2019 12:00:00 AM,BLK,T753575C,NY,TOYOT,101 AVDE,0
01/01/2019 12:00:00 AM,GOLD,JMJ4156,99,HONDA,BELMUNT,0
01/01/2019 12:00:00 AM,GY,JMU9251,NY,HONDA,175 ST,0
01/01/2019 12:00:00 AM,WHT,AV32960,CT,MITSU,W/O 106 ST,0
...,...,...,...,...,...,...
12/31/2020 12:00:00 AM,BLUE,GKM2944,NY,HONDA,ATLANTIC AVE,0
12/31/2020 12:00:00 AM,,M54FDG,NJ,JEEP,8TH ST,0
12/31/2020 12:00:00 AM,GL,DDH3509,NY,LEXUS,E/S/O 129TH STREET,0
12/31/2020 12:00:00 AM,YELLO,Y100808C,NY,NISSA,E 171,0


In [26]:
df.loc['01/02/2020 12:00:00 AM']

Unnamed: 0_level_0,Vehicle Color,Plate ID,Registration State,Vehicle Make,Street Name,Date First Observed
Issue Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
01/02/2020 12:00:00 AM,WH,HSU1133,NY,VOLKS,NB OCEAN PKWY @ BEVE,0
01/02/2020 12:00:00 AM,BL,JCC7346,NY,AUDI,EB 73RD AVE @ 153RD,0
01/02/2020 12:00:00 AM,BR,HLG6754,NY,BUICK,SB CROSS BAY BLVD @,0
01/02/2020 12:00:00 AM,GY,JAA5010,NY,HYUND,WB UNION TPKE @ WOOD,0
01/02/2020 12:00:00 AM,BK,JKB4113,NY,BMW,NB BAYCHESTER AVE @,0
...,...,...,...,...,...,...
01/02/2020 12:00:00 AM,WH,HRK9026,NY,HONDA,EB W 14TH STREET @ 5,0
01/02/2020 12:00:00 AM,GY,JKZ7470,NY,BMW,NB ROGERS AVE @ ERAS,0
01/02/2020 12:00:00 AM,GY,GKB5891,NY,JEEP,SB FRESH POND RD @ W,0
01/02/2020 12:00:00 AM,GY,HTC9218,NY,AUDI,WB W 14TH STREET @ 6,0


In [27]:
df.loc['01/02/2020 12:00:00 AM':'01/10/2020 23:59:59', 'Vehicle Make'].value_counts().head(3)

Vehicle Make
FORD     38958
TOYOT    37096
HONDA    35962
Name: count, dtype: int64

How many tickets did the second-most-ticketed car get in 2020? (And why am I not interested in the most-ticketed plate?) What state was that car from, and was it always ticketed in the same location?

In [30]:
df = df.reset_index()

In [33]:
df

Unnamed: 0,Issue Date,Vehicle Color,Plate ID,Registration State,Vehicle Make,Street Name,Date First Observed
0,01/01/2000 12:00:00 AM,RED,6542FR,99,CMCKU,E 54 ST,0
1,01/01/2019 12:00:00 AM,BLK,T753575C,NY,TOYOT,101 AVDE,0
2,01/01/2019 12:00:00 AM,GOLD,JMJ4156,99,HONDA,BELMUNT,0
3,01/01/2019 12:00:00 AM,GY,JMU9251,NY,HONDA,175 ST,0
4,01/01/2019 12:00:00 AM,WHT,AV32960,CT,MITSU,W/O 106 ST,0
...,...,...,...,...,...,...,...
12495729,12/31/2020 12:00:00 AM,BLUE,GKM2944,NY,HONDA,ATLANTIC AVE,0
12495730,12/31/2020 12:00:00 AM,,M54FDG,NJ,JEEP,8TH ST,0
12495731,12/31/2020 12:00:00 AM,GL,DDH3509,NY,LEXUS,E/S/O 129TH STREET,0
12495732,12/31/2020 12:00:00 AM,YELLO,Y100808C,NY,NISSA,E 171,0


In [34]:
df['Plate ID'].value_counts().head(2)

Plate ID
BLANKPLATE    8882
2704819       1535
Name: count, dtype: int64

In [35]:
df = df.set_index('Plate ID')
df

Unnamed: 0_level_0,Issue Date,Vehicle Color,Registration State,Vehicle Make,Street Name,Date First Observed
Plate ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
6542FR,01/01/2000 12:00:00 AM,RED,99,CMCKU,E 54 ST,0
T753575C,01/01/2019 12:00:00 AM,BLK,NY,TOYOT,101 AVDE,0
JMJ4156,01/01/2019 12:00:00 AM,GOLD,99,HONDA,BELMUNT,0
JMU9251,01/01/2019 12:00:00 AM,GY,NY,HONDA,175 ST,0
AV32960,01/01/2019 12:00:00 AM,WHT,CT,MITSU,W/O 106 ST,0
...,...,...,...,...,...,...
GKM2944,12/31/2020 12:00:00 AM,BLUE,NY,HONDA,ATLANTIC AVE,0
M54FDG,12/31/2020 12:00:00 AM,,NJ,JEEP,8TH ST,0
DDH3509,12/31/2020 12:00:00 AM,GL,NY,LEXUS,E/S/O 129TH STREET,0
Y100808C,12/31/2020 12:00:00 AM,YELLO,NY,NISSA,E 171,0


In [None]:
df.loc['2704819', 'Registration State'] # It's from Indiana

Plate ID
2704819    IN
2704819    IN
2704819    IN
2704819    IN
2704819    IN
           ..
2704819    IN
2704819    IN
2704819    IN
2704819    IN
2704819    IN
Name: Registration State, Length: 1535, dtype: object

In [37]:
# was it always ticketed in the same place?  No, but there were a lot in the same area...
df.loc['2704819', 'Street Name'].value_counts()

Street Name
8th Ave        395
Penn Plz       230
7th Ave         92
9th Ave         63
Broadway        57
              ... 
Fashion Ave      1
W 47 STREET      1
10TH AVE         1
N/S 34 ST        1
W 43 ST          1
Name: count, Length: 113, dtype: int64

Would it be useful to set the index to "Date First Observed"? Why or why not?

In [39]:

# Not very useful -- the value is set to 0 for 99% of the values!

df = df.reset_index()
df['Date First Observed'].value_counts()

Date First Observed
0           12371344
20200311         887
20200205         795
20200212         793
20200310         770
              ...   
20190204           1
20200920           1
20181129           1
20201209           1
20201230           1
Name: count, Length: 465, dtype: int64