In [33]:
import pandas as pd
from pytz import timezone
import append_reviews as a
%load_ext autoreload
%autoreload 2

# ny.to_csv('final_ny.csv')

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


#### Lets start with the reviews

In [34]:
reviews = pd.read_csv('reviews.csv')

In [35]:
# view dataframe
reviews.head(2)

#after analyzing the dataframe, I discovered:
#   a mix of date time formats in the publish_time columns
#   NaN in review_text
#   We will deal with this before concatenating review text.

Unnamed: 0,camis,publish_time,review_text,review_rating,service,atmosphere,food,price_per_person
0,50138517,2023-10-30,,5,5.0,5.0,,
1,50138517,2023-10-28,"Broth salty, cloudy. Summer rolls old. Short o...",2,4.0,3.0,2.0,25.0


In [36]:
reviews['review_text'] = reviews['review_text'].fillna('')

In [38]:
len(reviews)

371105

In [39]:
reviews.head(2)

Unnamed: 0,camis,publish_time,review_text,review_rating,service,atmosphere,food,price_per_person
0,50138517,2023-10-30,,5,5.0,5.0,,
1,50138517,2023-10-28,"Broth salty, cloudy. Summer rolls old. Short o...",2,4.0,3.0,2.0,25.0


In [42]:
# reviews.publish_time contains more than one time format (zulu and naive)
# We must standardize the format

# Function to convert string to local time and format it
def convert_to_local_and_format(dt_str):
    # Define the local timezone (e.g., Eastern Time)
    local_tz = timezone('America/New_York')
    
    # Parse the string into a datetime object
    dt = pd.to_datetime(dt_str)

    # Check if datetime is timezone-aware (has valid timezone info)
    if dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None:
        # Convert to local timezone
        dt = dt.astimezone(local_tz)

    # Return only the date part
    return dt.date()

In [43]:
# Apply the conversion and formatting to the 'publish_time' column.
reviews['publish_time'] = reviews['publish_time'].apply(convert_to_local_and_format) # The output is still a string

# Convert the 'publish_time' column to datetime, handling ISO8601 format
reviews['publish_time'] = pd.to_datetime(reviews['publish_time'])

In [44]:
# We have successfully converted to local time and removed hour information. 
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 371105 entries, 0 to 371104
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   camis             371105 non-null  int64         
 1   publish_time      371105 non-null  datetime64[ns]
 2   review_text       371105 non-null  object        
 3   review_rating     371105 non-null  int64         
 4   service           68029 non-null   float64       
 5   atmosphere        66905 non-null   float64       
 6   food              55142 non-null   float64       
 7   price_per_person  65615 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 22.7+ MB


In [45]:
# confirming that publish_time contains date only
reviews.head(2)

# dataframe ready for concatenation. 

Unnamed: 0,camis,publish_time,review_text,review_rating,service,atmosphere,food,price_per_person
0,50138517,2023-10-30,,5,5.0,5.0,,
1,50138517,2023-10-28,"Broth salty, cloudy. Summer rolls old. Short o...",2,4.0,3.0,2.0,25.0


#### Moving on to the inspections dataframe

In [46]:
ny = pd.read_csv('final_ny.csv', index_col=0)

In [47]:
ny['inspection_date'] = pd.to_datetime(ny['inspection_date'])
ny.head(2)

Unnamed: 0,camis,dba,boro,phone,inspection_date,latitude,longitude,cuisine_description,action,score,grade,full_address,violation_code,violation_description
0,41168748,DUNKIN,Bronx,7188614171,2022-03-30,40.816753,-73.892364,Donuts,Violations cited,13,A,880 GARRISON AVENUE 10474,10J 04N 08A,Hand wash sign not posted Filth flies or food/...
1,41688142,TABLE 87,Brooklyn,9176186100,2017-01-25,40.683447,-73.975691,Pizza,No violations,0,A,620 ATLANTIC AVENUE 11217,No violation,No violation


In [48]:
# inspection_date date is a date time object
ny.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60825 entries, 0 to 60824
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   camis                  60825 non-null  int64         
 1   dba                    60825 non-null  object        
 2   boro                   60825 non-null  object        
 3   phone                  60825 non-null  int64         
 4   inspection_date        60825 non-null  datetime64[ns]
 5   latitude               60825 non-null  float64       
 6   longitude              60825 non-null  float64       
 7   cuisine_description    60825 non-null  object        
 8   action                 60825 non-null  object        
 9   score                  60825 non-null  int64         
 10  grade                  60825 non-null  object        
 11  full_address           60825 non-null  object        
 12  violation_code         60825 non-null  object        
 13  violat

Here we want to sort the dataframe by camis, then by inspection_date ascending.  
once we have the rows in this order, we can shift the dates down one row and assign  
that date to last_inspection_date.

|   | inspection_date | last_inspection_date |
|---|-----------------|----------------------|
| 0 | 2021-08-10      |    NaT                 |
| 1 | `2023-01-31`      | 2021-08-10           |
| 2 | 2023-02-03      | `2023-01-31`           |


In [49]:
# Sort the dummy dataframe by 'camis' and 'inspection_date'
ny_sorted = ny.sort_values(['camis', 'inspection_date'])

# Create the 'last_inspection_date' by shiting the inspection date 1
ny_sorted['last_inspection_date'] = ny_sorted.groupby('camis')['inspection_date'].shift(1)

# Display the changes
ny_sorted[['camis', 'dba', 'last_inspection_date', 'inspection_date']].head(20)

Unnamed: 0,camis,dba,last_inspection_date,inspection_date
53458,30075445,MORRIS PARK BAKE SHOP,NaT,2021-08-10
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22
52742,30112340,WENDY'S,NaT,2021-07-20
44255,30112340,WENDY'S,2021-07-20,2021-08-05
36500,30112340,WENDY'S,2021-08-05,2022-07-13
57986,30112340,WENDY'S,2022-07-13,2023-07-21
1860,30191841,DJ REYNOLDS PUB AND RESTAURANT,NaT,2022-01-04


What would I do in excel?
I would approach the problem by row. 
in the current row, check if the camis is the same as the row above, if true, then take the inspection_date. if its not the same camis, 

We are left with NaT, this is because it was the first row of that restaurant (camis)  
Possibly because they are new restaurants, or no prior inspections exist in the inspections df. 

I think we should fill those NaTs with inspection_date `-` 1 year. 

In [50]:
# Fill NaT in 'last_inspection_date' with 'inspection_date' - 1 year
ny_sorted['last_inspection_date'] = ny_sorted['last_inspection_date'].fillna(ny_sorted['inspection_date'] - pd.DateOffset(years=1))
# Display the changes
ny_sorted[['camis', 'dba', 'last_inspection_date', 'inspection_date']].head(20)

Unnamed: 0,camis,dba,last_inspection_date,inspection_date
53458,30075445,MORRIS PARK BAKE SHOP,2020-08-10,2021-08-10
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22
52742,30112340,WENDY'S,2020-07-20,2021-07-20
44255,30112340,WENDY'S,2021-07-20,2021-08-05
36500,30112340,WENDY'S,2021-08-05,2022-07-13
57986,30112340,WENDY'S,2022-07-13,2023-07-21
1860,30191841,DJ REYNOLDS PUB AND RESTAURANT,2021-01-04,2022-01-04


In [51]:
ny_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60825 entries, 53458 to 9941
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   camis                  60825 non-null  int64         
 1   dba                    60825 non-null  object        
 2   boro                   60825 non-null  object        
 3   phone                  60825 non-null  int64         
 4   inspection_date        60825 non-null  datetime64[ns]
 5   latitude               60825 non-null  float64       
 6   longitude              60825 non-null  float64       
 7   cuisine_description    60825 non-null  object        
 8   action                 60825 non-null  object        
 9   score                  60825 non-null  int64         
 10  grade                  60825 non-null  object        
 11  full_address           60825 non-null  object        
 12  violation_code         60825 non-null  object        
 13  vio

Its time to bring in the reviews to the inspections dataframe by matching the date range and camis

In [52]:
def get_concatenated_reviews(inspection_row, reviews_df):
    '''
    We use camis to match reviews then we use last_inspection_date and inspection_date to filter reviews that fall within the date ranges. Finally we concatenate the review_text and return a string.  
    '''
    # Get the camis
    camis = inspection_row['camis']
    
    # If last_inspection_date is not null, we add one day to it to get the start date
    start_date = inspection_row['last_inspection_date'] + pd.Timedelta(days=1) if pd.notnull(inspection_row['last_inspection_date']) else inspection_row['inspection_date']
    end_date = inspection_row['inspection_date']
    
    # Filter reviews that match the camis and fall within the date range
    matching_reviews = reviews_df[
        (reviews_df['camis'] == camis) &
        (reviews_df['publish_time'] >= start_date) &
        (reviews_df['publish_time'] <= end_date)
    ]
    
    # Concatenate the review texts
    concatenated_reviews = ' '.join(matching_reviews['review_text'].dropna())
    
    
    return concatenated_reviews


In [53]:
# Apply the function to each row in the inspections dataframe
ny_sorted['concatenated_reviews'] = ny_sorted.apply(lambda row: get_concatenated_reviews(row, reviews), axis=1)

In [54]:
ny_sorted.head()

Unnamed: 0,camis,dba,boro,phone,inspection_date,latitude,longitude,cuisine_description,action,score,grade,full_address,violation_code,violation_description,last_inspection_date,concatenated_reviews
53458,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2021-08-10,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,5,A,1007 MORRIS PARK AVENUE 10462,06E,"Sanitized equipment or utensil, including in-u...",2020-08-10,
36355,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-01-31,40.848231,-73.855972,Bakery Products/Desserts,Closed,21,B,1007 MORRIS PARK AVENUE 10462,04H 08C 04L 10D 10F 06C 04N 06D,"Raw, cooked or prepared food is adulterated, c...",2021-08-10,
35589,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-02-03,40.848231,-73.855972,Bakery Products/Desserts,Re-opened,13,A,1007 MORRIS PARK AVENUE 10462,10F 02G,Non-food contact surface or equipment made of ...,2023-01-31,
26194,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-08-01,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,38,C,1007 MORRIS PARK AVENUE 10462,08A 05H 04L,Establishment is not free of harborage or cond...,2023-02-03,
57252,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-08-22,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,12,A,1007 MORRIS PARK AVENUE 10462,04L 08A 08C,Evidence of mice or live mice in establishment...,2023-08-01,


In [55]:
reviews.head()

Unnamed: 0,camis,publish_time,review_text,review_rating,service,atmosphere,food,price_per_person
0,50138517,2023-10-30,,5,5.0,5.0,,
1,50138517,2023-10-28,"Broth salty, cloudy. Summer rolls old. Short o...",2,4.0,3.0,2.0,25.0
2,50138517,2023-10-21,"Had the brisket and flank pho, the broth and n...",5,5.0,5.0,5.0,15.0
3,50138517,2023-10-14,,5,5.0,5.0,5.0,15.0
4,50138517,2023-10-14,I was waiting in front of the counter for at l...,1,1.0,,,


In [56]:
def calculate_average_ratings(ny_sorted, reviews):
    # Pre-filter reviews to dates that could possibly match
    min_date = ny_sorted['inspection_date'].min()
    max_date = ny_sorted['inspection_date'].max()
    pre_filtered_reviews = reviews[(reviews['publish_time'] >= min_date) & (reviews['publish_time'] <= max_date)]

    # Merge the pre-filtered reviews with inspections
    merged = pd.merge(ny_sorted, pre_filtered_reviews, on='camis', how='left')

    # Only keep rows where the publish_time is between the last inspection and the inspection dates
    merged = merged[(merged['publish_time'] >= merged['last_inspection_date']) & (merged['publish_time'] <= merged['inspection_date'])]

    # Group by 'camis' and calculate the mean for each category
    grouped = merged.groupby('camis')[['service', 'atmosphere', 'food', 'price_per_person']].mean().reset_index()

    # Rename columns to indicate they are averages
    grouped.rename(columns={
        'service': 'avg_service',
        'atmosphere': 'avg_atmosphere',
        'food': 'avg_food',
        'price_per_person': 'avg_price'
    }, inplace=True)

    # Merge the averages back to the ny_sorted DataFrame
    ny_sorted = pd.merge(ny_sorted, grouped, on='camis', how='left')

    return ny_sorted


In [57]:

# Use the function to calculate the averages
ny_sorted_ratings = calculate_average_ratings(ny_sorted, reviews)


In [58]:
ny_sorted_ratings.head()

Unnamed: 0,camis,dba,boro,phone,inspection_date,latitude,longitude,cuisine_description,action,score,grade,full_address,violation_code,violation_description,last_inspection_date,concatenated_reviews,avg_service,avg_atmosphere,avg_food,avg_price
0,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2021-08-10,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,5,A,1007 MORRIS PARK AVENUE 10462,06E,"Sanitized equipment or utensil, including in-u...",2020-08-10,,,,,
1,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-01-31,40.848231,-73.855972,Bakery Products/Desserts,Closed,21,B,1007 MORRIS PARK AVENUE 10462,04H 08C 04L 10D 10F 06C 04N 06D,"Raw, cooked or prepared food is adulterated, c...",2021-08-10,,,,,
2,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-02-03,40.848231,-73.855972,Bakery Products/Desserts,Re-opened,13,A,1007 MORRIS PARK AVENUE 10462,10F 02G,Non-food contact surface or equipment made of ...,2023-01-31,,,,,
3,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-08-01,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,38,C,1007 MORRIS PARK AVENUE 10462,08A 05H 04L,Establishment is not free of harborage or cond...,2023-02-03,,,,,
4,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-08-22,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,12,A,1007 MORRIS PARK AVENUE 10462,04L 08A 08C,Evidence of mice or live mice in establishment...,2023-08-01,,,,,


In [60]:
ny_sorted_ratings[['camis', 'dba', 'last_inspection_date', 'inspection_date', 'concatenated_reviews', 'avg_service', 'avg_atmosphere' ,'avg_food', 'avg_price']]

Unnamed: 0,camis,dba,last_inspection_date,inspection_date,concatenated_reviews,avg_service,avg_atmosphere,avg_food,avg_price
0,30075445,MORRIS PARK BAKE SHOP,2020-08-10,2021-08-10,,,,,
1,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31,,,,,
2,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03,,,,,
3,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01,,,,,
4,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22,,,,,
...,...,...,...,...,...,...,...,...,...
60820,50143007,CHILL OUT!,2022-10-24,2023-10-24,,,,,
60821,50143038,ZEN KITCHEN,2022-10-18,2023-10-18,,,,,
60822,50143057,HANDFIELD HOUSE,2022-10-23,2023-10-23,,,,,
60823,50143179,TRUCK STOP KITCHEN,2022-10-17,2023-10-17,,,,,


In [61]:
ny_sorted['concatenated_reviews'].isna().sum()

0

In [63]:
# Filter the dataframe to show only rows where 'concatenated_reviews' is not an empty string
ny_with_reviews = ny_sorted_ratings[ny_sorted_ratings['concatenated_reviews'].str.strip() != ""]
# ny_with_reviews = ny_sorted[ny_sorted['concatenated_reviews'] != ' ']
ny_with_reviews[['camis', 'dba', 'last_inspection_date', 'inspection_date', 'concatenated_reviews', 'avg_service', 'avg_atmosphere' ,'avg_food', 'avg_price']]


Unnamed: 0,camis,dba,last_inspection_date,inspection_date,concatenated_reviews,avg_service,avg_atmosphere,avg_food,avg_price
74,40364439,SEVILLA RESTAURANT,2020-08-04,2021-08-04,Horrible food and slow service. Seafood in p...,5.000000,5.000000,5.000000,60.612245
75,40364439,SEVILLA RESTAURANT,2021-08-04,2022-05-18,Superb! I haven’t had good paella since mo...,5.000000,5.000000,5.000000,60.612245
76,40364439,SEVILLA RESTAURANT,2022-05-18,2023-05-17,Gem in the océano of nyc. Spanish dishes and o...,5.000000,5.000000,5.000000,60.612245
157,40365904,MEE SUM CAFE,2020-10-13,2021-10-13,Very authentic place. To the point that I ne...,4.333333,4.333333,4.500000,17.692308
158,40365904,MEE SUM CAFE,2021-10-13,2021-11-24,Really great classic Chinese comfort food. Gre...,4.333333,4.333333,4.500000,17.692308
...,...,...,...,...,...,...,...,...,...
60673,50140822,SUSHI BY BOU,2022-09-19,2023-09-19,Fantastic omakase experience! We’ve been to th...,4.782609,4.652174,4.619048,89.772727
60674,50140823,TANG BY MR SUN,2022-09-12,2023-09-12,What a magical evening - the atmosphere is ele...,5.000000,5.000000,5.000000,57.500000
60684,50140852,12 CHAIRS CATERING,2022-09-27,2023-09-27,"Cozy spot in LES, great food, and friendly se...",3.949495,3.989796,4.400000,46.024096
60749,50141497,COFFEE AT BREADS BAKERY,2022-09-22,2023-09-22,Choco Boba Just very moist!! Chocolate bread I...,,,,


In [64]:
len(ny_with_reviews)

4605

In [66]:
ny_with_reviews.dropna(subset=['avg_service', 'avg_atmosphere', 'avg_food', 'avg_price'], inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  ny_with_reviews.dropna(subset=['avg_service', 'avg_atmosphere', 'avg_food', 'avg_price'], inplace=True)


In [67]:
len(ny_with_reviews)

2594

In [68]:
ny_with_reviews.avg_service.isna().sum()

0

In [31]:
ny_with_reviews[ny_with_reviews.concatenated_reviews.isna()]#.concatenated_reviews.iloc[0]

Unnamed: 0,camis,dba,boro,phone,inspection_date,latitude,longitude,cuisine_description,action,score,grade,full_address,violation_code,violation_description,last_inspection_date,concatenated_reviews


Many of the reviews are not available, lets filter so we can see some reviews

In [None]:
# lets take a closer look. 
print(ny_with_reviews.concatenated_reviews.iloc[0])

A midtown treasure! I absolutely love D.J. Reynolds. I recently came here with my dad who was visiting from out of town.

After a visit to a museum and a walk around the Time Warner Center, we had lunch at D.J. Reynolds.

It was a Friday afternoon and we were able to find a table. The staff here is SUPER friendly and welcoming.

I had the buffalo chicken wrap whereas my dad had the lunch special - beef stir fry.

Both dishes were served with generous proportions. I'll definitely return! We all absolutely loved this place. When does that happen that everyone in your party is happy. Large portions, the food was cooked perfectly. Such a pleasant experience. It’s very cozy and beautiful traditional Irish restaurant in the middle of the city! Nice atmosphere, welcoming stuff, tasty food and acceptable prices! What else do you need for perfect dinner! Always happy to find places where you are feeling that your at home


In [18]:
ny_with_reviews.head()

Unnamed: 0,camis,dba,boro,phone,inspection_date,latitude,longitude,cuisine_description,action,score,grade,full_address,violation_code,violation_description,last_inspection_date,concatenated_reviews
8922,40364439,SEVILLA RESTAURANT,Manhattan,2129293189,2021-08-04,40.734908,-74.002973,Latin American,Violations cited,27,B,62 CHARLES STREET 10014,10B 04L 02H 04H 08A,Plumbing not properly installed or maintained;...,2020-08-04,Horrible food and slow service. Seafood in p...
43616,40364439,SEVILLA RESTAURANT,Manhattan,2129293189,2022-05-18,40.734908,-74.002973,Latin American,Violations cited,19,B,62 CHARLES STREET 10014,04L 08C 06C 08A,Evidence of mice or live mice present in facil...,2021-08-04,Superb! I haven’t had good paella since mo...
47767,40364439,SEVILLA RESTAURANT,Manhattan,2129293189,2023-05-17,40.734908,-74.002973,Latin American,Violations cited,12,A,62 CHARLES STREET 10014,10B 04L 08A,Anti-siphonage or back-flow prevention device ...,2022-05-18,Gem in the océano of nyc. Spanish dishes and o...
22175,40365904,MEE SUM CAFE,Manhattan,2123495260,2021-10-13,40.714861,-73.9982,Coffee/Tea,Violations cited,67,C,26 PELL STREET 10013,02G 04A 08A 06C 02B 04N 05D 10B 10F 09C 06F,Cold food item held above 41º F (smoked fish a...,2020-10-13,Very authentic place. To the point that I ne...
16421,40365904,MEE SUM CAFE,Manhattan,2123495260,2021-11-24,40.714861,-73.9982,Coffee/Tea,Violations cited,24,B,26 PELL STREET 10013,10F 02G 02B 06C 10B,Non-food contact surface improperly constructe...,2021-10-13,Really great classic Chinese comfort food. Gre...


In [20]:
ny_with_reviews.shape

(4638, 16)

In [69]:
ny_with_reviews.to_csv('ny_reviews_3.csv', index= False) #ny_reviews may not have been the first time we saved it here, look for original save if something goes wroign


In [63]:
# lets save our progress

ny_sorted.to_csv('ny_plus_reviews.csv', index= False)


In [66]:
ny_load = pd.read_csv('ny_plus_reviews.csv')
ny_load.head()

Unnamed: 0,camis,dba,boro,phone,inspection_date,latitude,longitude,cuisine_description,action,score,grade,full_address,violation_code,violation_description,last_inspection_date,concatenated_reviews
0,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2021-08-10,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,5,A,1007 MORRIS PARK AVENUE 10462,06E,"Sanitized equipment or utensil, including in-u...",2020-08-10,
1,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-01-31,40.848231,-73.855972,Bakery Products/Desserts,Closed,21,B,1007 MORRIS PARK AVENUE 10462,04H 08C 04L 10D 10F 06C 04N 06D,"Raw, cooked or prepared food is adulterated, c...",2021-08-10,
2,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-02-03,40.848231,-73.855972,Bakery Products/Desserts,Re-opened,13,A,1007 MORRIS PARK AVENUE 10462,10F 02G,Non-food contact surface or equipment made of ...,2023-01-31,
3,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-08-01,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,38,C,1007 MORRIS PARK AVENUE 10462,08A 05H 04L,Establishment is not free of harborage or cond...,2023-02-03,
4,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-08-22,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,12,A,1007 MORRIS PARK AVENUE 10462,04L 08A 08C,Evidence of mice or live mice in establishment...,2023-08-01,


In [16]:
reviews = pd.read_csv('reviews.csv')

In [None]:
reviews = a.reviews_pipeline(reviews)

In [21]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61337 entries, 0 to 61336
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   camis          61337 non-null  int64         
 1   publish_time   61337 non-null  datetime64[ns]
 2   review_text    61337 non-null  object        
 3   review_rating  61337 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 1.9+ MB


In [30]:
ny = pd.read_csv('final_ny.csv', index_col=0)

In [31]:
ny = a.ny_last_inspection(ny)

In [34]:
ny[['camis', 'dba', 'last_inspection_date', 'inspection_date']].head(20)

Unnamed: 0,camis,dba,last_inspection_date,inspection_date
53458,30075445,MORRIS PARK BAKE SHOP,2020-08-10,2021-08-10
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22
52742,30112340,WENDY'S,2020-07-20,2021-07-20
44255,30112340,WENDY'S,2021-07-20,2021-08-05
36500,30112340,WENDY'S,2021-08-05,2022-07-13
57986,30112340,WENDY'S,2022-07-13,2023-07-21
1860,30191841,DJ REYNOLDS PUB AND RESTAURANT,2021-01-04,2022-01-04


In [35]:
# Apply the function to each row in the inspections dataframe
ny['concatenated_reviews'] = ny.apply(lambda row: a.ny_concat_reviews(row, reviews), axis=1)

In [None]:
ny

In [None]:
ny = ny_pipeline(ny, reviews)