In [1]:
import pandas as pd

# ny.to_csv('final_ny.csv')

In [2]:
reviews = pd.read_csv('reviews.csv')

In [3]:
# Convert the 'publish_time' column to datetime, handling ISO8601 format
reviews['publish_time'] = pd.to_datetime(reviews['publish_time'], format='mixed')

In [4]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 61337 entries, 0 to 61336
Data columns (total 4 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   camis          61337 non-null  int64 
 1   publish_time   61337 non-null  object
 2   review_text    54434 non-null  object
 3   review_rating  61337 non-null  int64 
dtypes: int64(2), object(2)
memory usage: 1.9+ MB


In [5]:
reviews.head(2)

Unnamed: 0,camis,publish_time,review_text,review_rating
0,41375676,2023-11-01 02:44:04.226505,,5
1,41375676,2023-11-01 02:44:04.227451,,5


In [6]:
ny = pd.read_csv('final_ny.csv', index_col=0)

In [7]:
ny['inspection_date'] = pd.to_datetime(ny['inspection_date'])
reviews.head(2)

Unnamed: 0,camis,publish_time,review_text,review_rating
0,41375676,2023-11-01 02:44:04.226505,,5
1,41375676,2023-11-01 02:44:04.227451,,5


In [8]:
ny.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60825 entries, 0 to 60824
Data columns (total 14 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   camis                  60825 non-null  int64         
 1   dba                    60825 non-null  object        
 2   boro                   60825 non-null  object        
 3   phone                  60825 non-null  int64         
 4   inspection_date        60825 non-null  datetime64[ns]
 5   latitude               60825 non-null  float64       
 6   longitude              60825 non-null  float64       
 7   cuisine_description    60825 non-null  object        
 8   action                 60825 non-null  object        
 9   score                  60825 non-null  int64         
 10  grade                  60825 non-null  object        
 11  full_address           60825 non-null  object        
 12  violation_code         60825 non-null  object        
 13  violat

In [9]:
# Check the timezone of the first element in 'publish_time' column
publish_time_tz = reviews['publish_time'].dt.tz
print("publish_time timezone:", publish_time_tz)

# Check the timezone of the first element in 'inspection_date' column
inspection_date_tz = ny['inspection_date'].dt.tz
print("inspection_date timezone:", inspection_date_tz)

AttributeError: Can only use .dt accessor with datetimelike values

In [9]:
# Sort the dummy dataframe by 'camis' and 'inspection_date'
ny_sorted = ny.sort_values(['camis', 'inspection_date'])

# Create the 'last_inspection_date' by shiting the inspection date 1
ny_sorted['last_inspection_date'] = ny_sorted.groupby('camis')['inspection_date'].shift(1)

# Display the changes
ny_sorted[['camis', 'dba', 'last_inspection_date', 'inspection_date']]

Unnamed: 0,camis,dba,last_inspection_date,inspection_date
53458,30075445,MORRIS PARK BAKE SHOP,NaT,2021-08-10
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22
...,...,...,...,...
19539,50143007,CHILL OUT!,NaT,2023-10-24
14322,50143038,ZEN KITCHEN,NaT,2023-10-18
2377,50143057,HANDFIELD HOUSE,NaT,2023-10-23
26967,50143179,TRUCK STOP KITCHEN,NaT,2023-10-17


In [12]:
ny_sorted.info()

<class 'pandas.core.frame.DataFrame'>
Index: 60825 entries, 53458 to 9941
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype         
---  ------                 --------------  -----         
 0   camis                  60825 non-null  int64         
 1   dba                    60825 non-null  object        
 2   boro                   60825 non-null  object        
 3   phone                  60825 non-null  int64         
 4   inspection_date        60825 non-null  datetime64[ns]
 5   latitude               60825 non-null  float64       
 6   longitude              60825 non-null  float64       
 7   cuisine_description    60825 non-null  object        
 8   action                 60825 non-null  object        
 9   score                  60825 non-null  int64         
 10  grade                  60825 non-null  object        
 11  full_address           60825 non-null  object        
 12  violation_code         60825 non-null  object        
 13  vio

In [10]:
def get_concatenated_reviews(inspection_row, reviews_df):
    '''
    We use camis to match reviews then we use last_inspection_date and inspection_date to filter reviews that fall within the date ranges. Finally we concatenate the review_text and return a string.  
    '''
    # Get the camis
    camis = inspection_row['camis']
    
    # If last_inspection_date is not null, we add one day to it to get the start date
    start_date = inspection_row['last_inspection_date'] + pd.Timedelta(days=1) if pd.notnull(inspection_row['last_inspection_date']) else inspection_row['inspection_date']
    end_date = inspection_row['inspection_date']
    
    # Filter reviews that match the camis and fall within the date range
    matching_reviews = reviews_df[
        (reviews_df['camis'] == camis) &
        (reviews_df['publish_time'] >= start_date) &
        (reviews_df['publish_time'] <= end_date)
    ]
    
    # Concatenate the review texts
    concatenated_reviews = ' '.join(matching_reviews['review_text'].dropna())
    
    return concatenated_reviews


In [None]:
#TypeError: can't compare offset-naive and offset-aware datetimes
# To fix we need to remove the timezone offset in the reviews 
# Convert all datetimes in the reviews DataFrame to naive
reviews['publish_time'] = reviews['publish_time'].dt.tz_localize(None)

# Then your existing function should work without further changes.


In [11]:
# Apply the function to each row in the inspections dataframe
ny_sorted['concatenated_reviews'] = ny_sorted.apply(lambda row: get_concatenated_reviews(row, reviews), axis=1)

TypeError: can't compare offset-naive and offset-aware datetimes

In [15]:
ny_sorted[['camis', 'dba', 'last_inspection_date', 'inspection_date', 'concatenated_reviews']]

Unnamed: 0,camis,dba,last_inspection_date,inspection_date
53458,30075445,MORRIS PARK BAKE SHOP,NaT,2021-08-10
36355,30075445,MORRIS PARK BAKE SHOP,2021-08-10,2023-01-31
35589,30075445,MORRIS PARK BAKE SHOP,2023-01-31,2023-02-03
26194,30075445,MORRIS PARK BAKE SHOP,2023-02-03,2023-08-01
57252,30075445,MORRIS PARK BAKE SHOP,2023-08-01,2023-08-22
...,...,...,...,...
19539,50143007,CHILL OUT!,NaT,2023-10-24
14322,50143038,ZEN KITCHEN,NaT,2023-10-18
2377,50143057,HANDFIELD HOUSE,NaT,2023-10-23
26967,50143179,TRUCK STOP KITCHEN,NaT,2023-10-17


In [None]:
ny_sorted['last_inspection_date'] = ny_sorted['last_inspection_date'] + pd.Timedelta(days=1)
