In [1]:
import pandas as pd
from pytz import timezone
import append_reviews as a
%load_ext autoreload
%autoreload 2

# ny.to_csv('final_ny.csv')

#### Let's start with the reviews

In [2]:
reviews = pd.read_csv('reviews.csv')

In [3]:
# Preview the first two rows of the raw reviews dataframe
reviews.head(2)

# After inspecting the dataframe, I found:
#   - a mix of datetime formats in the publish_time column
#   - NaN values in review_text
# We will deal with both before concatenating the review text.

Unnamed: 0,camis,publish_time,review_text,review_rating,service,atmosphere,food,price_per_person
0,50138517,2023-10-30,,5,5.0,5.0,,
1,50138517,2023-10-28,"Broth salty, cloudy. Summer rolls old. Short o...",2,4.0,3.0,2.0,25.0


In [4]:
# Replace missing review_text with empty strings so every value is a string before concatenation
reviews['review_text'] = reviews['review_text'].fillna('')

In [5]:
reviews.head(2)

Unnamed: 0,camis,publish_time,review_text,review_rating,service,atmosphere,food,price_per_person
0,50138517,2023-10-30,,5,5.0,5.0,,
1,50138517,2023-10-28,"Broth salty, cloudy. Summer rolls old. Short o...",2,4.0,3.0,2.0,25.0


In [6]:
# reviews.publish_time contains more than one time format (zulu and naive)
# We must standardize the format

# Function to convert string to local time and format it
def convert_to_local_and_format(dt_str, local_tz='America/New_York'):
    """Return the local calendar date for a single ``publish_time`` value.

    Parameters
    ----------
    dt_str : str or datetime-like
        A timestamp in any format ``pd.to_datetime`` can parse. The column
        mixes timezone-aware (Zulu/UTC) strings and naive date strings.
    local_tz : str or tzinfo, default 'America/New_York'
        Timezone that timezone-aware values are converted to before the
        date part is taken.

    Returns
    -------
    datetime.date or pd.NaT
        The date part of the timestamp. Timezone-aware inputs are first
        converted to ``local_tz``; naive inputs are used as they are.
        Missing inputs (None, NaN, NaT, empty string) yield ``pd.NaT``.
    """
    # Parse the value into a pandas Timestamp
    dt = pd.to_datetime(dt_str)

    # Missing values have no date part: .date() raises on NaT and None has
    # no .tzinfo, so return NaT and let the caller decide how to handle it.
    if pd.isna(dt):
        return pd.NaT

    # Only timezone-aware timestamps carry enough information to be shifted;
    # naive ones are left untouched.
    if dt.tzinfo is not None and dt.tzinfo.utcoffset(dt) is not None:
        dt = dt.tz_convert(local_tz)

    # Return only the date part
    return dt.date()

In [7]:
# Normalize every 'publish_time' value to a local calendar date.
reviews['publish_time'] = reviews['publish_time'].apply(convert_to_local_and_format) # The values are now datetime.date objects (not strings), so the column dtype is object

# Convert the column of date objects to datetime64 so it can be compared with the inspection dates
reviews['publish_time'] = pd.to_datetime(reviews['publish_time'])

In [8]:
# We have successfully converted to local time and removed hour information. 
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325203 entries, 0 to 325202
Data columns (total 8 columns):
 #   Column            Non-Null Count   Dtype         
---  ------            --------------   -----         
 0   camis             325203 non-null  int64         
 1   publish_time      325203 non-null  datetime64[ns]
 2   review_text       325203 non-null  object        
 3   review_rating     325203 non-null  int64         
 4   service           58952 non-null   float64       
 5   atmosphere        57974 non-null   float64       
 6   food              47818 non-null   float64       
 7   price_per_person  56982 non-null   float64       
dtypes: datetime64[ns](1), float64(4), int64(2), object(1)
memory usage: 19.8+ MB


In [9]:
# confirming that publish_time contains date only
reviews.head(2)

# dataframe ready for concatenation. 

Unnamed: 0,camis,publish_time,review_text,review_rating,service,atmosphere,food,price_per_person
0,50138517,2023-10-30,,5,5.0,5.0,,
1,50138517,2023-10-28,"Broth salty, cloudy. Summer rolls old. Short o...",2,4.0,3.0,2.0,25.0


#### Moving on to the inspections dataframe

In [10]:
ny = pd.read_csv('final_ny.csv', index_col=0)

In [11]:
ny['inspection_date'] = pd.to_datetime(ny['inspection_date'])
ny.head(2)

Unnamed: 0,camis,dba,boro,phone,inspection_date,latitude,longitude,cuisine_description,action,score,grade,full_address,violation_code,violation_description
0,41168748,DUNKIN,Bronx,7188614171,2022-03-30,40.816753,-73.892364,Donuts,Violations cited,13,A,880 GARRISON AVENUE 10474,10J 04N 08A,Hand wash sign not posted Filth flies or food/...
1,41688142,TABLE 87,Brooklyn,9176186100,2017-01-25,40.683447,-73.975691,Pizza,No violations,0,A,620 ATLANTIC AVENUE 11217,No violation,No violation


In [12]:
# Confirm that inspection_date is now a datetime column
ny.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60825 entries, 0 to 60824
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   camis                  60825 non-null  int64         
 1   dba                    60825 non-null  object        
 2   boro                   60825 non-null  object        
 3   phone                  60825 non-null  int64         
 4   inspection_date        60825 non-null  datetime64[ns]
 5   latitude               60825 non-null  float64       
 6   longitude              60825 non-null  float64       
 7   cuisine_description    60825 non-null  object        
 8   action                 60825 non-null  object        
 9   score                  60825 non-null  int64         
 10  grade                  60825 non-null  object        
 11  full_address           60825 non-null  object        
 12  violation_code         60825 non-null  object        
 13  violat

Here we want to sort the dataframe by camis, then by inspection_date in ascending order.  
Once the rows are in this order, we can shift the dates down one row within each camis and assign  
the shifted date to last_inspection_date.

|   | inspection_date | last_inspection_date |
|---|-----------------|----------------------|
| 0 | 2021-08-10      |    NaT                 |
| 1 | `2023-01-31`      | 2021-08-10           |
| 2 | 2023-02-03      | `2023-01-31`           |


In [13]:
# Order the inspections by restaurant ('camis') and then chronologically, and
# derive 'last_inspection_date' as the previous inspection_date within each camis
ny_sorted = (
    ny.sort_values(by=['camis', 'inspection_date'])
      .assign(last_inspection_date=lambda df: df.groupby('camis')['inspection_date'].shift())
)

# Show the first 20 rows of the result
ny_sorted[['camis', 'dba', 'last_inspection_date', 'inspection_date']].head(20)

Unnamed: 0,camis,dba,last_inspection_date,inspection_date
53458,30075445,MORRIS PARK BAKE SHOP,NaT,2021-08-10
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22
52742,30112340,WENDY'S,NaT,2021-07-20
44255,30112340,WENDY'S,2021-07-20,2021-08-05
36500,30112340,WENDY'S,2021-08-05,2022-07-13
57986,30112340,WENDY'S,2022-07-13,2023-07-21
1860,30191841,DJ REYNOLDS PUB AND RESTAURANT,NaT,2022-01-04


What would I do in Excel?
I would approach the problem row by row.
In the current row, check whether the camis is the same as in the row above. If it is, take that row's inspection_date; if it is not the same camis, leave the value empty (NaT).

We are left with NaT wherever a row is the first inspection of that restaurant (camis).  
This is possibly because the restaurant is new, or because no prior inspections exist in the inspections dataframe.

I think we should fill those NaTs with `inspection_date - 1 year`.

In [14]:
# Rows without a previous inspection fall back to one year before their own inspection_date
ny_sorted['last_inspection_date'] = ny_sorted['last_inspection_date'].where(
    ny_sorted['last_inspection_date'].notna(),
    ny_sorted['inspection_date'] - pd.DateOffset(years=1),
)
# Show the first 20 rows of the result
ny_sorted[['camis', 'dba', 'last_inspection_date', 'inspection_date']].head(20)

Unnamed: 0,camis,dba,last_inspection_date,inspection_date
53458,30075445,MORRIS PARK BAKE SHOP,2020-08-10,2021-08-10
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22
52742,30112340,WENDY'S,2020-07-20,2021-07-20
44255,30112340,WENDY'S,2021-07-20,2021-08-05
36500,30112340,WENDY'S,2021-08-05,2022-07-13
57986,30112340,WENDY'S,2022-07-13,2023-07-21
1860,30191841,DJ REYNOLDS PUB AND RESTAURANT,2021-01-04,2022-01-04


In [15]:
ny_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60825 entries, 53458 to 9941
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   camis                  60825 non-null  int64         
 1   dba                    60825 non-null  object        
 2   boro                   60825 non-null  object        
 3   phone                  60825 non-null  int64         
 4   inspection_date        60825 non-null  datetime64[ns]
 5   latitude               60825 non-null  float64       
 6   longitude              60825 non-null  float64       
 7   cuisine_description    60825 non-null  object        
 8   action                 60825 non-null  object        
 9   score                  60825 non-null  int64         
 10  grade                  60825 non-null  object        
 11  full_address           60825 non-null  object        
 12  violation_code         60825 non-null  object        
 13  vio

It's time to bring the reviews into the inspections dataframe by matching on camis and the date range.

In [16]:
def get_concatenated_reviews(inspection_row, reviews_df):
    '''
    Build one string holding every review written for a restaurant between two inspections.

    Reviews are matched on camis and kept when publish_time falls between the day
    after last_inspection_date and inspection_date (both ends inclusive). When
    last_inspection_date is null, the window is inspection_date alone.
    The remaining non-null review_text values are joined with single spaces.
    '''
    restaurant_id = inspection_row['camis']
    previous_inspection = inspection_row['last_inspection_date']
    window_end = inspection_row['inspection_date']

    # The window opens the day after the previous inspection, when there is one
    if pd.isnull(previous_inspection):
        window_start = window_end
    else:
        window_start = previous_inspection + pd.Timedelta(days=1)

    # Boolean mask: same restaurant and published inside the window
    published = reviews_df['publish_time']
    keep = (
        (reviews_df['camis'] == restaurant_id)
        & (published >= window_start)
        & (published <= window_end)
    )

    texts = reviews_df.loc[keep, 'review_text'].dropna()
    return ' '.join(texts)


In [19]:
# Build the concatenated review text for every inspection row (reviews is passed as the second argument)
ny_sorted['concatenated_reviews'] = ny_sorted.apply(get_concatenated_reviews, axis=1, args=(reviews,))

In [20]:
ny_sorted[['camis', 'dba', 'last_inspection_date', 'inspection_date', 'concatenated_reviews']]

Unnamed: 0,camis,dba,last_inspection_date,inspection_date,concatenated_reviews
53458,30075445,MORRIS PARK BAKE SHOP,2020-08-10,2021-08-10,
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31,
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03,
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01,
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22,
...,...,...,...,...,...
19539,50143007,CHILL OUT!,2022-10-24,2023-10-24,
14322,50143038,ZEN KITCHEN,2022-10-18,2023-10-18,
2377,50143057,HANDFIELD HOUSE,2022-10-23,2023-10-23,
26967,50143179,TRUCK STOP KITCHEN,2022-10-17,2023-10-17,


In [55]:
ny_sorted['concatenated_reviews'].isna().sum()

0

In [58]:
# Keep only the inspections whose 'concatenated_reviews' is a non-empty string
ny_with_reviews = ny_sorted.loc[ny_sorted['concatenated_reviews'].ne("")]
ny_with_reviews[['camis', 'dba', 'last_inspection_date', 'inspection_date', 'concatenated_reviews']]


Unnamed: 0,camis,dba,last_inspection_date,inspection_date,concatenated_reviews
45287,30191841,DJ REYNOLDS PUB AND RESTAURANT,2022-01-04,2023-04-23,A midtown treasure! I absolutely love D.J. Rey...
7141,40359705,NATHAN'S FAMOUS,2021-02-10,2022-02-10,For three years in a row this was our first me...
7209,40362274,ANGELIKA FILM CENTER,2022-02-14,2023-06-21,Came here to attend The Last of Us immersive e...
4194,40363834,CARVEL,2019-10-01,2021-08-02,"Great ice cream spot on Hylan. On Wednesday, t..."
4429,40364220,KOSHER BAGEL HOLE,2019-11-27,2021-08-04,Nothing special...i had an everything bagel mi...
...,...,...,...,...,...
33984,50141509,DR. Tea House,2023-09-20,2023-10-05,Great service. So friendly and really fast! Pa...
5554,50141926,VARIETY COFFEE ROASTERS,2022-09-26,2023-09-26,Great coffee and lovely staff. Loving the retr...
1093,50142345,YEMEN CAFE,2022-09-21,2023-09-21,Great experience and good food!\nI love the mi...
6401,50142540,La Cucharita Paisa Bar Restaurant,2022-10-10,2023-10-10,"This was one of my favorite restaurants, but ..."


Many inspections have no reviews available, so let's filter the dataframe so that we can see some reviews.

In [None]:
# lets take a closer look. 
print(ny_with_reviews.concatenated_reviews.iloc[0])

A midtown treasure! I absolutely love D.J. Reynolds. I recently came here with my dad who was visiting from out of town.

After a visit to a museum and a walk around the Time Warner Center, we had lunch at D.J. Reynolds.

It was a Friday afternoon and we were able to find a table. The staff here is SUPER friendly and welcoming.

I had the buffalo chicken wrap whereas my dad had the lunch special - beef stir fry.

Both dishes were served with generous proportions. I'll definitely return! We all absolutely loved this place. When does that happen that everyone in your party is happy. Large portions, the food was cooked perfectly. Such a pleasant experience. It’s very cozy and beautiful traditional Irish restaurant in the middle of the city! Nice atmosphere, welcoming stuff, tasty food and acceptable prices! What else do you need for perfect dinner! Always happy to find places where you are feeling that your at home


In [63]:
# lets save our progress

ny_sorted.to_csv('ny_plus_reviews.csv', index= False)


In [66]:
# Reload the saved file to check the round trip.
# CSV stores no dtypes, so the two date columns must be parsed explicitly;
# otherwise they come back as plain strings.
ny_load = pd.read_csv(
    'ny_plus_reviews.csv',
    parse_dates=['inspection_date', 'last_inspection_date'],
)
# Empty review strings are written as empty fields and read back as NaN;
# restore them so the column is all strings again.
ny_load['concatenated_reviews'] = ny_load['concatenated_reviews'].fillna('')
ny_load.head()

Unnamed: 0,camis,dba,boro,phone,inspection_date,latitude,longitude,cuisine_description,action,score,grade,full_address,violation_code,violation_description,last_inspection_date,concatenated_reviews
0,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2021-08-10,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,5,A,1007 MORRIS PARK AVENUE 10462,06E,"Sanitized equipment or utensil, including in-u...",2020-08-10,
1,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-01-31,40.848231,-73.855972,Bakery Products/Desserts,Closed,21,B,1007 MORRIS PARK AVENUE 10462,04H 08C 04L 10D 10F 06C 04N 06D,"Raw, cooked or prepared food is adulterated, c...",2021-08-10,
2,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-02-03,40.848231,-73.855972,Bakery Products/Desserts,Re-opened,13,A,1007 MORRIS PARK AVENUE 10462,10F 02G,Non-food contact surface or equipment made of ...,2023-01-31,
3,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-08-01,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,38,C,1007 MORRIS PARK AVENUE 10462,08A 05H 04L,Establishment is not free of harborage or cond...,2023-02-03,
4,30075445,MORRIS PARK BAKE SHOP,Bronx,7188924968,2023-08-22,40.848231,-73.855972,Bakery Products/Desserts,Violations cited,12,A,1007 MORRIS PARK AVENUE 10462,04L 08A 08C,Evidence of mice or live mice in establishment...,2023-08-01,


In [16]:
reviews = pd.read_csv('reviews.csv')

In [None]:
reviews = a.reviews_pipeline(reviews)

In [21]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61337 entries, 0 to 61336
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   camis          61337 non-null  int64         
 1   publish_time   61337 non-null  datetime64[ns]
 2   review_text    61337 non-null  object        
 3   review_rating  61337 non-null  int64         
dtypes: datetime64[ns](1), int64(2), object(1)
memory usage: 1.9+ MB


In [30]:
ny = pd.read_csv('final_ny.csv', index_col=0)

In [31]:
ny = a.ny_last_inspection(ny)

In [34]:
ny[['camis', 'dba', 'last_inspection_date', 'inspection_date']].head(20)

Unnamed: 0,camis,dba,last_inspection_date,inspection_date
53458,30075445,MORRIS PARK BAKE SHOP,2020-08-10,2021-08-10
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22
52742,30112340,WENDY'S,2020-07-20,2021-07-20
44255,30112340,WENDY'S,2021-07-20,2021-08-05
36500,30112340,WENDY'S,2021-08-05,2022-07-13
57986,30112340,WENDY'S,2022-07-13,2023-07-21
1860,30191841,DJ REYNOLDS PUB AND RESTAURANT,2021-01-04,2022-01-04


In [35]:
# Build the concatenated review text for every inspection row using the module helper
ny['concatenated_reviews'] = ny.apply(a.ny_concat_reviews, axis=1, args=(reviews,))

In [None]:
ny

In [None]:
# Call the pipeline through the imported module alias: a bare `ny_pipeline`
# is never defined or imported in this notebook and would raise NameError.
# NOTE(review): assumes append_reviews exposes `ny_pipeline` like the other helpers used above - confirm.
ny = a.ny_pipeline(ny, reviews)