In [None]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, StandardScaler

# Load the dataset
restaurants_df = pd.read_csv('/content/korean_restaurants_id.csv')

# Handling missing values: put the 'Unknown' placeholder into non-numeric
# columns only. A blanket fillna('Unknown') would also write the string into
# numeric columns such as 'meancost', turning them into object dtype and making
# the StandardScaler step further down fail with "could not convert string to
# float". Numeric gaps are left as NaN, which StandardScaler ignores when
# fitting and passes through when transforming.
restaurant_text_columns = restaurants_df.select_dtypes(exclude='number').columns
restaurants_df[restaurant_text_columns] = restaurants_df[restaurant_text_columns].fillna('Unknown')

# Extract the first two words from road address
def extract_first_two_words(address):
    """Return the first two whitespace-separated words of a road address.

    Args:
        address: Raw road-address value. Normally a string; any other type
            (for example ``None`` or a float ``NaN`` coming from a missing
            cell) is treated as a missing address.

    Returns:
        The first two words joined by a single space, or ``'Unknown'`` when
        the value is not a string or holds fewer than two words.
    """
    # Missing cells surface as None/NaN, which have no .split(); map them to
    # the same placeholder used for too-short addresses instead of raising.
    if not isinstance(address, str):
        return 'Unknown'
    words = address.split()
    return ' '.join(words[:2]) if len(words) >= 2 else 'Unknown'

# Extract the first words from jibun address
def extract_first_words(address):
    """Return the first whitespace-separated word of a jibun address.

    Args:
        address: Raw jibun-address value. Normally a string; any other type
            (for example ``None`` or a float ``NaN`` coming from a missing
            cell) is treated as a missing address.

    Returns:
        The first word of the address, or ``'Unknown'`` when the value is not
        a string or contains no words at all.
    """
    # Missing cells surface as None/NaN, which have no .split(); map them to
    # the same placeholder used for empty addresses instead of raising.
    if not isinstance(address, str):
        return 'Unknown'
    words = address.split()
    return words[0] if words else 'Unknown'

# Derive coarse location features from the two address columns
restaurants_df['roadaddr_part'] = restaurants_df['roadaddr'].apply(extract_first_two_words)
restaurants_df['jibunaddr_part'] = restaurants_df['jibunaddr'].apply(extract_first_words)

# One-hot encode categorical columns
restaurant_categorical_columns = ['category', 'roadaddr_part', 'jibunaddr_part']
try:
    # scikit-learn >= 1.2 names the dense-output switch `sparse_output`;
    # the legacy `sparse` keyword was removed in 1.4 and raises TypeError there.
    one_hot_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    # Older scikit-learn releases only accept the legacy `sparse` keyword.
    one_hot_encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
encoded_columns = one_hot_encoder.fit_transform(restaurants_df[restaurant_categorical_columns])
# Reuse the source index so the column-wise concat below aligns row for row
# even if restaurants_df ever stops having a default RangeIndex.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=one_hot_encoder.get_feature_names_out(restaurant_categorical_columns),
    index=restaurants_df.index,
)

# Normalize numerical columns #except review
scaler = StandardScaler()
normalized_columns = scaler.fit_transform(restaurants_df[['meancost']])
normalized_df = pd.DataFrame(normalized_columns, columns=['meancost'], index=restaurants_df.index)

# Combine the processed columns
restaurants_processed_df = pd.concat(
    [restaurants_df[['restaurant_id', 'name', 'dishes', 'review']], encoded_df, normalized_df],
    axis=1,
)

restaurants_processed_df.to_csv('/content/korean_restaurants_processed.csv', index=False)



In [None]:
# Load the dataset
users_df = pd.read_csv('/content/users_dataset_15000.csv')

# Handling missing values: put the 'Unknown' placeholder into non-numeric
# columns only, so a missing numeric value (e.g. in 'age') stays NaN instead of
# turning the whole column into mixed strings and numbers.
user_text_columns = users_df.select_dtypes(exclude='number').columns
users_df[user_text_columns] = users_df[user_text_columns].fillna('Unknown')

# One-hot encode categorical columns.
# fit_transform refits the `one_hot_encoder` object created in an earlier cell
# on the user columns, replacing whatever categories it had learned before.
user_categorical_columns = ['gender', 'location']
encoded_columns = one_hot_encoder.fit_transform(users_df[user_categorical_columns])
# Reuse the source index so the column-wise concat below aligns row for row.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=one_hot_encoder.get_feature_names_out(user_categorical_columns),
    index=users_df.index,
)

# Combine the processed columns
users_processed_df = pd.concat([users_df[['user_id', 'age']], encoded_df], axis=1)

users_processed_df.to_csv('/content/users_dataset_processed.csv', index=False)



In [None]:
# Read the raw interactions and discard rows that have no rating
interactions_df = (
    pd.read_csv('/content/interactions_dataset_15000.csv')
    .dropna(subset=['rating'])
)

# Parse the timestamp strings into pandas datetimes
interactions_df['timestamp'] = pd.to_datetime(interactions_df['timestamp'])

# Persist the cleaned interactions
interactions_df.to_csv('/content/interactions_dataset_processed.csv', index=False)

In [None]:
#######################################
# Build the modeling table: attach user features, then restaurant features,
# to every interaction row (default inner joins on the id columns).
merged_df = (
    interactions_df
    .merge(users_processed_df, on='user_id')
    .merge(restaurants_processed_df, on='restaurant_id')
)

# Write the combined dataset to disk
merged_df.to_csv('/content/processed_dataset.csv', index=False)


Zomato dataset

In [None]:
zomato_df = pd.read_csv('/content/zomato_with_korean.csv')

# Keep only the leading rows of the dataset.
# NOTE(review): the original comment said "keep data from the first through
# the 663rd row", but this slice keeps 662 rows (positions 0-661) - confirm
# which count is intended.
df = zomato_df.iloc[:662]

# Save the trimmed DataFrame as a CSV file
df.to_csv('/content/zomato_modified.csv', index=False)

In [None]:
# Rebind zomato_df to the contents of '/content/zomato_modified.csv', the file
# the previous cell writes its truncated rows to.
zomato_df = pd.read_csv('/content/zomato_modified.csv')

In [None]:
# Show the column names as a plain Python list
list(zomato_df.columns)

['name',
 'rate',
 'dish_liked',
 'cuisines',
 'cost',
 'reviews_list',
 'city',
 'Mean Rating']

In [None]:


# Handling missing values: put the 'Unknown' placeholder into non-numeric
# columns only. A blanket fillna('Unknown') would also write the string into
# numeric columns, and 'cost' / 'Mean Rating' are fed to StandardScaler below,
# which cannot convert 'Unknown' to float. Numeric gaps are left as NaN, which
# StandardScaler ignores when fitting and passes through when transforming.
zomato_text_columns = zomato_df.select_dtypes(exclude='number').columns
zomato_df[zomato_text_columns] = zomato_df[zomato_text_columns].fillna('Unknown')

# One-hot encode categorical columns.
# fit_transform refits the `one_hot_encoder` object created in an earlier cell
# on the Zomato columns, replacing whatever categories it had learned before.
zomato_categorical_columns = ['cuisines', 'city']
encoded_columns = one_hot_encoder.fit_transform(zomato_df[zomato_categorical_columns])
# Reuse the source index so the column-wise concat below aligns row for row.
encoded_df = pd.DataFrame(
    encoded_columns,
    columns=one_hot_encoder.get_feature_names_out(zomato_categorical_columns),
    index=zomato_df.index,
)

# Normalize numerical columns #except review
zomato_numeric_columns = ['cost', 'Mean Rating']
scaler = StandardScaler()
normalized_columns = scaler.fit_transform(zomato_df[zomato_numeric_columns])
normalized_df = pd.DataFrame(normalized_columns, columns=zomato_numeric_columns, index=zomato_df.index)

# Combine the processed columns
zomato_processed_df = pd.concat(
    [zomato_df[['name', 'rate', 'dish_liked', 'reviews_list']], encoded_df, normalized_df],
    axis=1,
)

zomato_processed_df.to_csv('/content/zomato_processed.csv', index=False)


