In [31]:
import warnings
warnings.simplefilter("ignore")

In [32]:
import gzip
import json

def parse_json_gz(file_path):
    with gzip.open(file_path, 'r') as f:
        for line in f:
            yield json.loads(line)

reviews = list(parse_json_gz('data/review-Delaware.json.gz'))
metadata = list(parse_json_gz('data/meta-Delaware.json.gz'))


In [33]:
import pandas as pd
from datetime import datetime
pd.set_option('display.max_columns', None)

reviews_df = pd.DataFrame(reviews)
metadata_df = pd.DataFrame(metadata)

print("Reviews DataFrame:")
print(reviews_df.info())
print("\nMetadata DataFrame:")
print(metadata_df.info())




Reviews DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1885948 entries, 0 to 1885947
Data columns (total 8 columns):
 #   Column   Dtype  
---  ------   -----  
 0   user_id  object 
 1   name     object 
 2   time     int64  
 3   rating   float64
 4   text     object 
 5   pics     object 
 6   resp     object 
 7   gmap_id  object 
dtypes: float64(1), int64(1), object(6)
memory usage: 115.1+ MB
None

Metadata DataFrame:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14706 entries, 0 to 14705
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              14706 non-null  object 
 1   address           14505 non-null  object 
 2   gmap_id           14706 non-null  object 
 3   description       2674 non-null   object 
 4   latitude          14706 non-null  float64
 5   longitude         14706 non-null  float64
 6   category          14647 non-null  object 
 7   avg_rating        1470

In [34]:
def merge_datasets(reviews_df, metadata_df, merge_on='gmap_id'):
    return pd.merge(reviews_df, metadata_df, on=merge_on, how='inner')

merged_data_df = merge_datasets(reviews_df, metadata_df)
merged_data_df.head(5)

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url
0,113797972931183350426,Heather Carper,1507258129698,5.0,Lived here for 3 years and enjoyed it. Locatio...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...
1,113797972931183350426,Heather Carper,1507258129698,5.0,Lived here for 3 years and enjoyed it. Locatio...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...
2,103394737817126197742,Terri Walliczek,1578072003214,5.0,I absolutely love living here. My rent is affo...,,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...
3,103394737817126197742,Terri Walliczek,1578072003214,5.0,I absolutely love living here. My rent is affo...,,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...
4,109199378483092610849,Jennifer Schulte,1560471695507,5.0,Lived here for two years and it is wonderful. ...,,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,,,{'Accessibility': ['Wheelchair accessible entr...,,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...


In [35]:
merged_data_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1889846 entries, 0 to 1889845
Data columns (total 22 columns):
 #   Column            Dtype  
---  ------            -----  
 0   user_id           object 
 1   name_x            object 
 2   time              int64  
 3   rating            float64
 4   text              object 
 5   pics              object 
 6   resp              object 
 7   gmap_id           object 
 8   name_y            object 
 9   address           object 
 10  description       object 
 11  latitude          float64
 12  longitude         float64
 13  category          object 
 14  avg_rating        float64
 15  num_of_reviews    int64  
 16  price             object 
 17  hours             object 
 18  MISC              object 
 19  state             object 
 20  relative_results  object 
 21  url               object 
dtypes: float64(4), int64(2), object(16)
memory usage: 317.2+ MB


In [36]:
merged_data_df.isnull().sum()

user_id                8030
name_x                    0
time                      0
rating                 8030
text                 801845
pics                1850656
resp                1620181
gmap_id                   0
name_y                    0
address                7769
description          953243
latitude                  0
longitude                 0
category                249
avg_rating                0
num_of_reviews            0
price                953832
hours                186862
MISC                 116691
state                780837
relative_results      87816
url                       0
dtype: int64

In [37]:
def handle_missing_values(df, state='Delaware'):
    drop_subset = ['user_id', 'rating']
    text_columns = ['text', 'description', 'resp']
    categorical_columns = ['name_x', 'name_y', 'category', 'price', 'address', 'hours', 'MISC', 'url']
    
    # Dropping rows with missing critical fields
    df = df.dropna(subset=drop_subset)

    # Filling text columns with empty strings
    for col in text_columns:
        if col in df.columns:
            df[col] = df[col].fillna("")

    #Filling state value
    if 'state' in df.columns:
        df['state'] = state
    
    # Filling categorical columns with "unknown"
    for col in categorical_columns:
        if col in df.columns:
            df[col] = df[col].fillna("unknown")
    
    return df

In [38]:
df = handle_missing_values(merged_data_df)

In [39]:
df.isnull().sum()

user_id                   0
name_x                    0
time                      0
rating                    0
text                      0
pics                1842626
resp                      0
gmap_id                   0
name_y                    0
address                   0
description               0
latitude                  0
longitude                 0
category                  0
avg_rating                0
num_of_reviews            0
price                     0
hours                     0
MISC                      0
state                     0
relative_results      79819
url                       0
dtype: int64

In [40]:
if 'time' in df.columns:
    try:
        df['time'] = pd.to_datetime(df['time'], unit='s')
    except Exception:
        df['time'] = pd.to_datetime(df['time'], errors='coerce')

In [41]:
numeric_fields = ['rating', 'avg_rating', 'num_of_reviews', 'latitude', 'longitude']
for field in numeric_fields:
    if field in df.columns:
        df[field] = pd.to_numeric(df[field], errors='coerce')

In [43]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 1881816 entries, 0 to 1889845
Data columns (total 22 columns):
 #   Column            Dtype         
---  ------            -----         
 0   user_id           object        
 1   name_x            object        
 2   time              datetime64[ns]
 3   rating            float64       
 4   text              object        
 5   pics              object        
 6   resp              object        
 7   gmap_id           object        
 8   name_y            object        
 9   address           object        
 10  description       object        
 11  latitude          float64       
 12  longitude         float64       
 13  category          object        
 14  avg_rating        float64       
 15  num_of_reviews    int64         
 16  price             object        
 17  hours             object        
 18  MISC              object        
 19  state             object        
 20  relative_results  object        
 21  url          

In [50]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import string
import re

nltk.download('punkt')  
nltk.download('stopwords')  
nltk.download('wordnet')    
nltk.download('omw-1.4')

def clean_text(text):
    # changing text to Lowercase
    text = text.lower()
    
    # Removing punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    
    # Removing extra whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    
    # Removing stopwords
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if word not in set(stopwords.words('english'))]
    
    return ' '.join(tokens)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\kb3842\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\kb3842\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\kb3842\AppData\Roaming\nltk_data...
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\kb3842\AppData\Roaming\nltk_data...


In [51]:
df['clean_text'] = df['text'].apply(clean_text)

In [52]:
df.head()

Unnamed: 0,user_id,name_x,time,rating,text,pics,resp,gmap_id,name_y,address,description,latitude,longitude,category,avg_rating,num_of_reviews,price,hours,MISC,state,relative_results,url,clean_text
0,113797972931183350426,Heather Carper,1970-01-01 00:25:07.258129698,5.0,Lived here for 3 years and enjoyed it. Locatio...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,unknown,unknown,{'Accessibility': ['Wheelchair accessible entr...,Delaware,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...,lived 3 years enjoyed location convenience vie...
1,113797972931183350426,Heather Carper,1970-01-01 00:25:07.258129698,5.0,Lived here for 3 years and enjoyed it. Locatio...,[{'url': ['https://lh5.googleusercontent.com/p...,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,unknown,unknown,{'Accessibility': ['Wheelchair accessible entr...,Delaware,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...,lived 3 years enjoyed location convenience vie...
2,103394737817126197742,Terri Walliczek,1970-01-01 00:26:18.072003214,5.0,I absolutely love living here. My rent is affo...,,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,unknown,unknown,{'Accessibility': ['Wheelchair accessible entr...,Delaware,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...,absolutely love living rent affordable place c...
3,103394737817126197742,Terri Walliczek,1970-01-01 00:26:18.072003214,5.0,I absolutely love living here. My rent is affo...,,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,unknown,unknown,{'Accessibility': ['Wheelchair accessible entr...,Delaware,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...,absolutely love living rent affordable place c...
4,109199378483092610849,Jennifer Schulte,1970-01-01 00:26:00.471695507,5.0,Lived here for two years and it is wonderful. ...,,,0x89b8b77c34771c5f:0xf768433b3a39763,Beach Plum Dunes Apartments,"Beach Plum Dunes Apartments, 36916 Crooked Ham...",,38.745316,-75.149381,[Apartment complex],4.5,25,unknown,unknown,{'Accessibility': ['Wheelchair accessible entr...,Delaware,"[0x89b8c7f7fa362dcd:0xd2b52bfac98b528f, 0x89b8...",https://www.google.com/maps/place//data=!4m2!3...,lived two years wonderful apartment great main...


In [53]:
df.to_csv('Cleaned file_Delaware')