In [5]:
import pandas as pd
import json
import re

# Load the combined dataset
combined_df = pd.read_csv('combined_korean_restaurants.csv')

# Function to extract prices and calculate mean cost
def calculate_mean_cost(dishes):
    try:
        # Parse the dishes string as a list
        dish_list = json.loads(dishes.replace("'", "\""))
        
        # Extract prices and convert to numeric values
        prices = []
        for dish in dish_list:
            price_str = dish[1]
            price_num_str = re.sub(r'[^\d]', '', price_str)  # Remove non-numeric characters
            if price_num_str:  # Check if the string is not empty
                price_num = int(price_num_str)
                prices.append(price_num)
        
        # Calculate mean cost
        if prices:
            mean_cost = sum(prices) / len(prices)
        else:
            mean_cost = 0
    except (json.JSONDecodeError, TypeError, IndexError):
        mean_cost = 0
    
    return mean_cost

# Apply the function to the 'dishes' column
combined_df['meancost'] = combined_df['dishes'].apply(calculate_mean_cost)

# Save the updated DataFrame to a new CSV file
combined_df.to_csv('combined_korean_restaurants.csv', index=False)

print("Updated dataset with mean cost saved as 'combined_korean_restaurants_with_meancost.csv'")


Updated dataset with mean cost saved as 'combined_korean_restaurants_with_meancost.csv'


In [21]:
import pandas as pd
import json
import re
import random

# Load the Zomato and Korean datasets
zomato_df = pd.read_csv('zomato.csv')
korean_df = pd.read_csv('cleaned_korean_restaurants.csv')

# Function to clean and parse the dishes string
def parse_dishes(dishes):
    try:
        # Replace problematic characters and parse the JSON string
        dishes_cleaned = re.sub(r'(?<!\\)\'', '\"', dishes)
        dish_list = json.loads(dishes_cleaned)
        return dish_list
    except json.JSONDecodeError:
        return []

# Function to parse and select the top 5 reviews with the highest count
def parse_reviews(review_str, num_reviews=5):
    try:
        # Remove the 'reviewCount' part and split the rest into individual reviews
        reviews_cleaned = re.sub(r'reviewCount: \d+, ', '', review_str)
        review_list = re.findall(r"'([^']+)': (\d+)", reviews_cleaned)
        # Sort the reviews by count in descending order and select the top ones
        sorted_reviews = sorted(review_list, key=lambda x: int(x[1]), reverse=True)
        selected_reviews = sorted_reviews[:num_reviews]
        return ', '.join([f'{review[0]}: {review[1]}' for review in selected_reviews])
    except Exception as e:
        print(f"Error parsing reviews: {e}")
        return ""

# Select the same number of Korean restaurants
korean_selected = korean_df.head(len(zomato_df))

# Function to map Korean data to Zomato data
def map_korean_to_zomato(zomato_row, korean_row):
    # Replace cuisines, cost, and dish_liked from the same Korean row
    zomato_row['cuisines'] = korean_row['category']
    zomato_row['cost'] = korean_row['meancost']
    
    # Replace dish_liked with dishes from Korean data, matching original length
    korean_dishes = parse_dishes(korean_row['dishes'])
    if korean_dishes:
        num_dishes = len(zomato_row['dish_liked'].split(', ')) if pd.notna(zomato_row['dish_liked']) else 0
        selected_dishes = random.sample(korean_dishes, min(num_dishes, len(korean_dishes)))
        zomato_row['dish_liked'] = ', '.join([dish[0] for dish in selected_dishes])
    else:
        zomato_row['dish_liked'] = None
    
    # Replace reviews_list with the top 5 reviews from Korean data
    reviews_list = parse_reviews(korean_row['review'])
    zomato_row['reviews_list'] = reviews_list

    return zomato_row

# Create a copy of the Zomato dataset to preserve original data
zomato_updated_df = zomato_df.copy()

# Replace the values in the Zomato dataset with the Korean dataset
for zomato_idx, korean_row in zip(zomato_df.index, korean_selected.iterrows()):
    zomato_updated_df.loc[zomato_idx] = map_korean_to_zomato(zomato_updated_df.loc[zomato_idx], korean_row[1])

# Save the updated Zomato dataset to a new CSV file
zomato_updated_df.to_csv('updated_zomato_with_korean.csv', index=False)

# Verify if the reviews_list column is populated correctly
print(zomato_updated_df[['reviews_list']].head())

print("Updated dataset saved as 'updated_zomato_with_korean.csv'")

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zomato_row['cuisines'] = korean_row['category']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zomato_row['cost'] = korean_row['meancost']
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zomato_row['dish_liked'] = ', '.join([dish[0] for dish in selected_dishes])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  

                                        reviews_list
0  음식이 맛있어요: 2044, 매장이 청결해요: 1597, 매장이 넓어요: 1596,...
1  음식이 맛있어요: 1019, 특별한 메뉴가 있어요: 301, 양이 많아요: 237,...
2  음식이 맛있어요: 1495, 친절해요: 1246, 매장이 넓어요: 841, 재료가 ...
3  음식이 맛있어요: 432, 친절해요: 232, 재료가 신선해요: 126, 특별한 메...
4  음식이 맛있어요: 272, 술이 다양해요: 207, 인테리어가 멋져요: 196, 친...
Updated dataset saved as 'updated_zomato_with_korean.csv'


In [19]:
import pandas as pd
import json
import re

# Load the Korean dataset
korean_df = pd.read_csv('combined_korean_restaurants.csv')

# Function to clean and transform the reviews string
def transform_reviews(reviews):
    try:
        # Replace problematic characters
        reviews_cleaned = reviews.replace("'", '"').replace('""', '"').replace('\\"', "'")
        
        # Ensure the JSON string is correctly formatted
        if reviews_cleaned.startswith('"'):
            reviews_cleaned = reviews_cleaned[1:]
        if reviews_cleaned.endswith('"'):
            reviews_cleaned = reviews_cleaned[:-1]
        
        # Parse the JSON string
        review_dict = json.loads(reviews_cleaned)
        
        # Initialize the new review dictionary
        new_review_dict = {'reviewCount': review_dict['reviewCount']}
        
        # Transform the reviews into the new format
        for review in review_dict.get('reviews', []):
            content = review['content'][0].strip('"')
            count = review['content'][1]
            new_review_dict[content] = count
        
        # Convert the new review dictionary to a formatted string
        formatted_reviews = f"reviewCount: {new_review_dict['reviewCount']}, " + ", ".join(
            [f"'{key}': {value}" for key, value in new_review_dict.items() if key != 'reviewCount']
        )
        return formatted_reviews
    except (json.JSONDecodeError, KeyError) as e:
        print(f"Error: {e} in reviews: {reviews}")
        return "reviewCount: 0"

# Apply the transformation function to the 'review' column
korean_df['review'] = korean_df['review'].apply(transform_reviews)

# Save the cleaned Korean dataset to a new CSV file
korean_df.to_csv('combined_korean_restaurants.csv', index=False)

print("Cleaned dataset saved as 'cleaned_korean_restaurants.csv'")

Cleaned dataset saved as 'cleaned_korean_restaurants.csv'


In [22]:
df = pd.read_csv('updated_zomato_with_korean.csv')
df.isnull().sum()

name              0
rate              0
dish_liked       83
cuisines          0
cost              0
reviews_list    309
city              0
Mean Rating       0
dtype: int64

In [23]:
df.dropna(how='any',inplace=True)

In [24]:
df.reset_index(drop=True, inplace=True)
df.to_csv('updated_zomato_with_korean.csv', index=False)