In [50]:
import pandas as pd
import numpy as np
import html
import re
from number_parser import parse_ordinal


In [51]:
df = pd.read_csv("currentOpenInspections_p.csv")

In [52]:
df.shape

(79203, 20)

In [53]:
pd.set_option('display.max_columns', None)

In [54]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 79203 entries, 0 to 79202
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   Borough                 79203 non-null  object 
 1   RestaurantName          79195 non-null  object 
 2   SeatingChoice           79203 non-null  object 
 3   LegalBusinessName       79202 non-null  object 
 4   BusinessAddress         79203 non-null  object 
 5   RestaurantInspectionID  79203 non-null  int64  
 6   IsSidewayCompliant      0 non-null      float64
 7   IsRoadwayCompliant      79203 non-null  object 
 8   SkippedReason           19866 non-null  object 
 9   InspectedOn             79203 non-null  object 
 10  AgencyCode              67240 non-null  object 
 11  Postcode                79203 non-null  int64  
 12  Latitude                71320 non-null  float64
 13  Longitude               71320 non-null  float64
 14  CommunityBoard          71320 non-null

In [55]:
# checking numerical columns in the dataset

numerics = ['int16', 'int32', 'int64', 'float16', 'float32', 'float64']

df.select_dtypes(include=numerics).shape[1]

# so there are 10numerical columns in the dataset and rest 10 are non-numerical columns s

10

In [56]:
# checking the percentage of null values in each column
missing_values_percentage = round(df.isnull().sum().sort_values(ascending=False)/len(df) * 100, 2)
missing_values_percentage

IsSidewayCompliant        100.00
SkippedReason              74.92
AgencyCode                 15.10
BBL                        10.48
BIN                        10.48
Latitude                    9.95
CensusTract                 9.95
CouncilDistrict             9.95
CommunityBoard              9.95
Longitude                   9.95
NTA                         9.95
RestaurantName              0.01
LegalBusinessName           0.00
Postcode                    0.00
InspectedOn                 0.00
IsRoadwayCompliant          0.00
RestaurantInspectionID      0.00
BusinessAddress             0.00
SeatingChoice               0.00
Borough                     0.00
dtype: float64

### there are 2 columns in the dataset which have more than 50 percent null values 

## Changes need to be done 
- Drop IsSidewayCompliant column.
- Drop SkippedReason column.
- Change Dtype of InspectedOn column from oject to datetime
- Removing html tags from the columns
- Break down the businessAddress column into BuildingNumber and Street columns
- Clean the Street column
- Replace BusinessAddress column with combined values of BuildingNumber + Street + Borough + "NY"
- Change the position of columns

### dropping IsSidewayCompliant column.


In [57]:
df = df.drop(["IsSidewayCompliant"],axis=1)

### dropping SkippedReason column.


In [58]:
df = df.drop(["SkippedReason"],axis=1)

### chainging the data type of column named InspectedOn


In [59]:
df['InspectedOn'] = pd.to_datetime(df['InspectedOn'])


In [60]:
df.iloc[[76663,16438,37353]]

Unnamed: 0,Borough,RestaurantName,SeatingChoice,LegalBusinessName,BusinessAddress,RestaurantInspectionID,IsRoadwayCompliant,InspectedOn,AgencyCode,Postcode,Latitude,Longitude,CommunityBoard,CouncilDistrict,CensusTract,BIN,BBL,NTA
76663,Manhattan,SUCKER PUNCH SPORTS BAR,roadway,SUCKER PUNCH SPORTS CLUB LLC,344 3 AVENUE,100910,Skipped Inspection,2023-06-23 10:56:58,DOT,10010,40.740321,-73.982076,6.0,2.0,68.0,1018110.0,1008810000.0,Gramercy
16438,Queens,Paraiso Colombiano Restaurant,both,106 20 Food Corp.,106 20 Corona Av,1476,Non-Compliant,2020-07-05 08:15:13,DOT,11368,40.743727,-73.856279,4.0,21.0,439.0,4447721.0,4019350000.0,Corona
37353,Queens,NEW MALAYSIA RESTAURANT,roadway,MALAY RESTAURANT INC,13517 40 ROAD,31071,Reset,2020-12-22 09:05:24,,11354,,,,,,,,


In [61]:
df.sample(10)

Unnamed: 0,Borough,RestaurantName,SeatingChoice,LegalBusinessName,BusinessAddress,RestaurantInspectionID,IsRoadwayCompliant,InspectedOn,AgencyCode,Postcode,Latitude,Longitude,CommunityBoard,CouncilDistrict,CensusTract,BIN,BBL,NTA
15836,Queens,M. Wells Steakhouse,both,M. Wells Crescent LLC,43-15 Crescent Street,13189,Skipped Inspection,2020-07-23 17:28:26,DOT,11101,40.748674,-73.942323,2.0,26.0,19.0,4436807.0,4004340000.0,Hunters Point-Sunnyside-West Maspeth
20711,Brooklyn,Roberta's,both,"Nineteen Twenty Four, Inc",271 Moore Street,19848,Compliant,2020-08-25 15:40:02,DOT,11206,40.704891,-73.93368,1.0,34.0,485.0,3325964.0,3031010000.0,Bushwick South
71121,Brooklyn,Nuevo M+?xico Bar Restaurant,both,Nuevo M+?xico Inc.,489 5th Avenue,92190,Compliant,2022-11-11 10:30:01,DOT,11215,40.667599,-73.987603,6.0,39.0,139.0,3022922.0,3010230000.0,Park Slope-Gowanus
30184,Manhattan,Le Pain Quotidien - Bryant Park,both,APQ Bryant Park NY LLC,70 West 40th Street,30855,Reset,2020-12-22 09:05:24,,10018,40.753266,-73.984379,5.0,4.0,84.0,1016059.0,1008410000.0,Midtown-Midtown South
43613,Queens,Taqueria Coatzingo,roadway,76 Taqueria Corp,76-05 Roosevelt Ave,43437,Compliant,2021-01-26 17:41:32,DOT,11372,40.747063,-73.889413,3.0,25.0,287.0,4439244.0,4012870000.0,Jackson Heights
25473,Manhattan,Park Bar,both,"Steeplechase, Inc.",15 East 15th Street,24585,Skipped Inspection,2020-09-30 14:16:45,DOT,10003,40.736499,-73.992635,5.0,2.0,52.0,1087180.0,1008438000.0,Hudson Yards-Chelsea-Flatiron-Union Square
65181,Brooklyn,Fritebar,sidewalk,NA Frites Inc.,8503 3rd Avenue,722,Non-Compliant,2020-07-03 16:51:56,DOT,11209,40.624182,-74.030753,10.0,43.0,62.0,3153101.0,3060340000.0,Bay Ridge
67216,Manhattan,Fiaschetteria Pistoia,both,Fiaschetteria LTD,647 East 11th street,8574,Non-Compliant,2020-07-13 14:26:55,DOB,10009,40.72672,-73.977678,3.0,2.0,28.0,1004914.0,1003940000.0,Lower East Side
29499,Manhattan,Thyme and Tonic,both,Spring Natural Corp.,474 Columbus,30206,Reset,2020-12-22 09:05:24,,10024,,,,,,,,
78121,Bronx,Evo Cocktail Lounge & Restaurant,both,Mi Gente Caf+? Inc.,1306 Unionport Rd.,102438,Under Review,2023-08-07 17:34:13,DOT,10462,40.834275,-73.853738,9.0,18.0,222.0,2028985.0,2039340000.0,Parkchester


In [62]:
# Making seatingChoice title
df["SeatingChoice"] = df["SeatingChoice"].str.capitalize()

### function to remove the html tags from the values in column


In [63]:
def html_content_remover(column):
    return column.apply(lambda x: html.unescape(str(x)))

In [64]:
df["RestaurantName"] = html_content_remover(df["RestaurantName"])
df["LegalBusinessName"] = html_content_remover(df["LegalBusinessName"])


### Function to make business entity abbreviations correct

In [65]:
def make_title_and_corrections(df, column):
    df[column] = df[column].str.title()
    
    df[column] = df[column].fillna("")
    
    pattern = r'\b(\d+)(Th|Nd|St|Rd)\b'                     # For names like 12th , 2nd etc
    pattern_1 = r'\b([A-Za-z]+)(\'[A-Z])\b'                 # for names like Sam's, Macdonald's
    corp_pattern = r'\bCorp(?:oration|orations)?(?![.])\b'
    inc_pattern = r'\bInc(?:orporated)?(?![.])\b'

    df[column] = df[column].apply(lambda x : re.sub(pattern, lambda m: m.group(1) + m.group(2).lower(), x))
    df[column] = df[column].apply(lambda x: re.sub(pattern_1,lambda m : m.group(1) + m.group(2).lower(),x ))
    
    df[column] = df[column].str.replace("Llc","LLC")
    df[column] = df[column].str.replace(inc_pattern,"Inc.",regex=True)
    df[column] = df[column].str.replace(corp_pattern,"Corp.",regex=True)
    

In [66]:
make_title_and_corrections(df,"RestaurantName")
make_title_and_corrections(df,"LegalBusinessName")

## Taking out buildingNumber and Street form business_Address column

In [67]:
def building(df,column):
    """return two lists contianing buildingNumbers and street_name"""
    
    building_list = []
    street_name = []
    for i in df[column]:
        split_address = i.split()                                                # splitting the column on space
        building_number = split_address[0] if len(split_address) > 1 else None   # taking the first value of split as builidng number 
        street = " ".join(split_address[1:]) if building_number else " ".join(split_address)   # taking all values strting from 1 as street from split_address
        building_list.append(building_number)
        street_name.append(street)
    return building_list,street_name



        
df["BuildingNumber"], df["Street"] = building(df, "BusinessAddress")

        

In [68]:
# checking nulls in BuildingNumber
df[df["BuildingNumber"].isna()]

Unnamed: 0,Borough,RestaurantName,SeatingChoice,LegalBusinessName,BusinessAddress,RestaurantInspectionID,IsRoadwayCompliant,InspectedOn,AgencyCode,Postcode,Latitude,Longitude,CommunityBoard,CouncilDistrict,CensusTract,BIN,BBL,NTA,BuildingNumber,Street
6065,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,79872,Skipped Inspection,2022-04-25 15:52:54,DOT,11201,,,,,,,,,,156a
6066,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,79873,Pre-Suspension,2022-04-25 15:52:54,DOT,11201,,,,,,,,,,156a
6496,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,80233,Suspended and Deactivated,2022-05-02 00:00:01,,11201,,,,,,,,,,156a
17639,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,16700,Skipped Inspection,2020-08-01 11:37:53,DOT,11201,,,,,,,,,,156a
24423,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,23541,Skipped Inspection,2020-09-21 17:48:22,DOT,11201,,,,,,,,,,156a
30928,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,31542,Reset,2020-12-22 09:05:24,,11201,,,,,,,,,,156a
33282,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,26403,Skipped Inspection,2020-10-28 12:00:41,DOT,11201,,,,,,,,,,156a
43421,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,43258,Skipped Inspection,2021-01-26 13:49:49,DOT,11201,,,,,,,,,,156a
43422,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,43259,Skipped Inspection,2021-01-26 13:50:43,DOT,11201,,,,,,,,,,156a
52551,Brooklyn,Mr Fulton,Both,Bk Fulton Corp.,156a,5570,Skipped Inspection,2020-07-08 20:15:14,DEP,11201,,,,,,,,,,156a


In [69]:
# Droping Nulls from df subset buildingNumber
df.dropna(subset=["BuildingNumber"], inplace=True)


In [70]:
# Checking nulls in street column
df[df["Street"].isna()]

Unnamed: 0,Borough,RestaurantName,SeatingChoice,LegalBusinessName,BusinessAddress,RestaurantInspectionID,IsRoadwayCompliant,InspectedOn,AgencyCode,Postcode,Latitude,Longitude,CommunityBoard,CouncilDistrict,CensusTract,BIN,BBL,NTA,BuildingNumber,Street


## Cleaning street column 

In [71]:
def column_tansformation(df,column):
    df[column] = df[column].str.title()
    
    
    def what(x):

        def is_ordinal(word):
            ordinal_words = ['First', 'Second', 'Third', 'Fourth', 'Fifth', 'Sixth', 'Seventh', 'Eighth', 'Ninth', 'Tenth']
            return word in ordinal_words

        l = x.split()

        for i in range(len(l)):
            if is_ordinal(l[i]):
                parsed_word = parse_ordinal(l[i])
                l[i] = str(parsed_word)

        return " ".join(l)
            
    
    
    def ordinal(m):
        n = int(m.group())
        if 5 <= n <= 20 :
            suffix = 'th'
        else:
            remainder = n % 10
            if remainder == 1:
                suffix = 'st'
            elif remainder == 2:
                suffix = 'nd'
            elif remainder == 3:
                suffix = 'rd'
            else:
                suffix = 'th'
        output = str(n) + suffix
        return output
    
    pattern = r'\b(\d+)(Th|Nd|St|Rd)\b'
    ave_pattern = r'\bAve\.?$|\bAve\b|\bAv\b'
    st_pattern = r'\bSt\.?$\b|\bSt\b'
    blvd_pattern = r'\bBlvd\b'
    number = r'\b\d+\b'
    
    
    df[column] = df[column].apply(lambda x : re.sub(pattern, lambda m: m.group(1) + m.group(2).lower(), str(x)))
    df[column] = df[column].str.replace(ave_pattern, 'Avenue', regex = True)
    df[column] = df[column].str.replace(st_pattern, 'Street', regex = True)
    df[column] = df[column].str.replace(blvd_pattern, 'Boulevard', regex = True)
    df[column] = df[column].apply(what)
    df[column] = df[column].str.replace(re.compile(number), lambda x: ordinal(x), regex=True)
    

    
    
    
column_tansformation(df,"Street")

## Combining the Building number + Street + Borough columns + NY = Business Address

In [72]:
df["BusinessAddress"] = df.apply(lambda row: f"{row['BuildingNumber']}, {row['Street']}, {row['Borough']}, NY", axis=1)

In [73]:
df.sample(5)

Unnamed: 0,Borough,RestaurantName,SeatingChoice,LegalBusinessName,BusinessAddress,RestaurantInspectionID,IsRoadwayCompliant,InspectedOn,AgencyCode,Postcode,Latitude,Longitude,CommunityBoard,CouncilDistrict,CensusTract,BIN,BBL,NTA,BuildingNumber,Street
66441,Brooklyn,Weather Up & Co,Both,Ward 8 LLC,"589, Vanderbilt Avenue, Brooklyn, NY",7848,Skipped Inspection,2020-07-12 14:27:55,DOB,11238,40.679936,-73.968035,8.0,35.0,203.0,3027904.0,3011380000.0,Prospect Heights,589,Vanderbilt Avenue
36767,Manhattan,A&T Fish And Chips LLC,Roadway,A&T Fish And Chips LLC,"85, 1st Avenue, Manhattan, NY",36979,For HIQA Review,2021-01-11 09:31:34,,10003,40.725937,-73.986503,3.0,2.0,38.0,1077690.0,1004478000.0,East Village,85,1st Avenue
29995,Brooklyn,Abilene,Both,Leahlala LLC,"442, Court Street, Brooklyn, NY",3068,Compliant,2020-07-07 15:33:06,DEP,11231,40.678933,-73.997563,6.0,39.0,65.0,3005293.0,3003660000.0,Carroll Gardens-Columbia Street-Red Hook,442,Court Street
15441,Queens,Fayrooz,Both,Atlantic East Group Inc.,"28-08, Steinway Street, Queens, NY",12826,Non-Compliant,2020-07-21 22:17:09,DOT,11103,40.76524,-73.913909,1.0,22.0,6502.0,4010771.0,4006620000.0,Astoria,28-08,Steinway Street
65837,Manhattan,Leonelli Focacceria E Pasticceria,Both,Leonelli Restaurants LLC,"7, East 27th, Manhattan, NY",72866,Non-Compliant,2021-12-20 13:34:19,DOT,10016,40.743843,-73.987196,5.0,2.0,56.0,1016886.0,1008570000.0,Hudson Yards-Chelsea-Flatiron-Union Square,7,East 27th


## Chainging the postion of columns

In [74]:
# Adding column names in a list
column_list = []
for column in df:
    column_list.append(column)  

In [75]:
# chinging the placement of columns
column_list = ['Borough',
 'RestaurantName',
 'SeatingChoice',
 'LegalBusinessName',
 'BuildingNumber',
 'Street',
 'BusinessAddress',
 'RestaurantInspectionID',
 'IsRoadwayCompliant',
 'InspectedOn',
 'AgencyCode',
 'Postcode',
 'Latitude',
 'Longitude',
 'CommunityBoard',
 'CouncilDistrict',
 'CensusTract',
 'BIN',
 'BBL',
 'NTA'
 ]

df = df.loc[:, column_list]  # Applying the placement inplace

In [76]:
df

Unnamed: 0,Borough,RestaurantName,SeatingChoice,LegalBusinessName,BuildingNumber,Street,BusinessAddress,RestaurantInspectionID,IsRoadwayCompliant,InspectedOn,AgencyCode,Postcode,Latitude,Longitude,CommunityBoard,CouncilDistrict,CensusTract,BIN,BBL,NTA
0,Manhattan,Oscar Wilde,Both,Camelot Castle LLC,45,West 27th Street,"45, West 27th Street, Manhattan, NY",72891,Non-Compliant,2021-12-20 16:06:58,DOT,10001,40.744876,-73.989657,5.0,3.0,58.0,1015677.0,1.008290e+09,Hudson Yards-Chelsea-Flatiron-Union Square
1,Manhattan,La Rubia Restaurant,Both,La Rubia Restaurant Inc.,3517,Broadway,"3517, Broadway, Manhattan, NY",72892,For HIQA Review,2021-12-20 16:18:42,DOT,10031,40.825863,-73.950874,9.0,7.0,229.0,1062369.0,1.020910e+09,Hamilton Heights
2,Manhattan,Thai Sliders,Sidewalk,Silom Thai Inc.,150,8th Avenue,"150, 8th Avenue, Manhattan, NY",72893,Non-Compliant,2021-12-20 16:35:41,DOT,10011,40.741906,-74.000945,4.0,3.0,81.0,1013845.0,1.007670e+09,Hudson Yards-Chelsea-Flatiron-Union Square
3,Brooklyn,Otway,Both,St James 930 LLC,930,Fulton Street,"930, Fulton Street, Brooklyn, NY",72894,Cease and Desist,2021-12-20 16:38:45,DOT,11238,40.682833,-73.963833,2.0,35.0,201.0,3335112.0,3.020130e+09,Clinton Hill
4,Brooklyn,Williamsburg Thai Cuisine,Both,Williamsburg Thai Cuisine Ny Inc.,212,Bedford Avenue,"212, Bedford Avenue, Brooklyn, NY",72896,Compliant,2021-12-20 16:52:41,DOT,11249,40.716913,-73.958728,1.0,33.0,553.0,3062192.0,3.023350e+09,North Side-South Side
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
79198,Brooklyn,Sunshine Co,Both,780 Washington LLC,780,Wahington Avenue,"780, Wahington Avenue, Brooklyn, NY",107873,Pre-Removal,2023-12-01 15:44:49,DOT,11238,,,,,,,,
79199,Manhattan,Moustache,Sidewalk,Bedford Pitza Corp.,29,7th Avenue South,"29, 7th Avenue South, Manhattan, NY",107874,Cease and Desist,2023-12-01 16:13:28,DOT,10014,40.730812,-74.004395,2.0,3.0,67.0,1087324.0,1.005860e+09,West Village
79200,Brooklyn,Mekelburg's,Both,"Hop, Stock & Barrel Ii LLC",319,Kent Avenue,"319, Kent Avenue, Brooklyn, NY",107875,Skipped Inspection,2023-12-01 16:42:51,DOT,11238,40.713757,-73.967200,1.0,33.0,551.0,3424711.0,3.024280e+09,North Side-South Side
79201,Brooklyn,Sunday In Brooklyn,Both,Sunday In Brooklyn LLC,348,Wythe Avenue,"348, Wythe Avenue, Brooklyn, NY",107876,Compliant,2023-12-01 16:48:20,DOT,11249,40.714171,-73.965208,1.0,33.0,551.0,3321284.0,3.024150e+09,North Side-South Side


In [77]:
# 
df.to_csv("Cleaned_current_open_inspections.csv",index=False)