In [29]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split
from collections import defaultdict

In [30]:
# Load the raw review ratings. read_csv already returns a DataFrame, so it is
# used directly instead of being wrapped in pd.DataFrame(...) a second time.
data = pd.read_csv('google_review_ratings.csv')

In [31]:
# Column names as they appear in the CSV: "User", "Category 1" .. "Category 24", "location"
old = ["User"] + [f"Category {number}" for number in range(1, 25)] + ["location"]

# Replacement names, position by position; "User" and "location" keep their names
new = [
    "User",
    "ART CENTRES AND GALLERIES",
    "FORTS",
    "MONUMENTS",
    "MUSEUMS",
    "PALACES",
    "HANDICRAFT CENTERS",
    "SCIENCE CENTRES",
    "HISTORICAL PLACES",
    "THEATRES",
    "LANDMARKS",
    "HORSE STABLES",
    "NATIONAL PARKS",
    "WATERFALLS",
    "MALLS",
    "BEACHES",
    "SPRINGS",
    "WILDLIFE SANCTUARIES",
    "RESTAURANTS",
    "SOCIAL CLUBS",
    "INTERTAINMENTS",
    "ZOOS",
    "POOLS",
    "GYMS",
    "CULTURAL CENTERS",
    "location",
]

# Pair each original column name with its replacement
name_mapping = {old_name: new_name for old_name, new_name in zip(old, new)}

# Apply the new column names
data = data.rename(columns=name_mapping)

In [32]:
# Count the NaN entries per column before any imputation is applied
print("Missing values in each column:", data.isnull().sum(), sep="\n")

Missing values in each column:
User                         0
ART CENTRES AND GALLERIES    0
FORTS                        0
MONUMENTS                    0
MUSEUMS                      0
PALACES                      0
HANDICRAFT CENTERS           0
SCIENCE CENTRES              0
HISTORICAL PLACES            0
THEATRES                     0
LANDMARKS                    0
HORSE STABLES                0
NATIONAL PARKS               1
WATERFALLS                   0
MALLS                        0
BEACHES                      0
SPRINGS                      0
WILDLIFE SANCTUARIES         0
RESTAURANTS                  0
SOCIAL CLUBS                 0
INTERTAINMENTS               0
ZOOS                         0
POOLS                        0
GYMS                         0
CULTURAL CENTERS             1
location                     0
dtype: int64


In [33]:
# Show the row index; its labels are reused later as the similarity matrix labels
row_index = data.index
print(row_index)

RangeIndex(start=0, stop=5456, step=1)


In [34]:
# Hold out 30% of the rows. The seed is fixed so that the split (and everything
# derived from it) is identical on every Restart & Run All.
data_train, data_test = train_test_split(data, test_size=0.3, random_state=42)

In [35]:
# Convert all columns to numeric; errors='coerce' turns non-numeric values
# (such as the "User N" labels in the first column) into NaN
data_numeric = data.apply(pd.to_numeric, errors='coerce')

# cosine_similarity cannot handle NaN, so missing values are treated as 0 here
data_numeric_filled = data_numeric.fillna(0)

# Compare users on the rating columns only. The first column is the user
# identifier and the last one is "location"; neither is a rating, and the
# imputation step uses the same columns[1:-1] slice for its categories.
rating_columns = data.columns[1:-1]

# Compute cosine similarity between every pair of users
user_similarity = cosine_similarity(data_numeric_filled[rating_columns])

# Label rows and columns with the user index so neighbours can be looked up with .loc
similarity_df = pd.DataFrame(user_similarity, index=data.index, columns=data.index)

print("Similarity DataFrame:\n", similarity_df)



Similarity DataFrame:
           0         1         2         3         4         5         6     \
0     1.000000  0.999667  0.999660  0.998940  0.999660  0.999683  0.883147   
1     0.999667  1.000000  0.999998  0.998608  0.999998  0.999997  0.883331   
2     0.999660  0.999998  1.000000  0.998605  1.000000  0.999998  0.883262   
3     0.998940  0.998608  0.998605  1.000000  0.998605  0.998630  0.902536   
4     0.999660  0.999998  1.000000  0.998605  1.000000  0.999998  0.883262   
...        ...       ...       ...       ...       ...       ...       ...   
5451  0.575006  0.573585  0.573616  0.591164  0.573616  0.573594  0.685359   
5452  0.589581  0.588379  0.588352  0.608449  0.588352  0.588327  0.730370   
5453  0.543419  0.542300  0.542269  0.561062  0.542269  0.542257  0.675639   
5454  0.566387  0.564980  0.564989  0.579870  0.564989  0.564978  0.659060   
5455  0.585599  0.584129  0.584137  0.599653  0.584137  0.584127  0.673102   

          7         8         9     ... 

In [36]:
# Re-check the NaN count per column; nothing has been imputed in `data` yet
print("Missing values in each column:", data.isnull().sum(), sep="\n")

Missing values in each column:
User                         0
ART CENTRES AND GALLERIES    0
FORTS                        0
MONUMENTS                    0
MUSEUMS                      0
PALACES                      0
HANDICRAFT CENTERS           0
SCIENCE CENTRES              0
HISTORICAL PLACES            0
THEATRES                     0
LANDMARKS                    0
HORSE STABLES                0
NATIONAL PARKS               1
WATERFALLS                   0
MALLS                        0
BEACHES                      0
SPRINGS                      0
WILDLIFE SANCTUARIES         0
RESTAURANTS                  0
SOCIAL CLUBS                 0
INTERTAINMENTS               0
ZOOS                         0
POOLS                        0
GYMS                         0
CULTURAL CENTERS             1
location                     0
dtype: int64


In [38]:
def fill_nans_with_top_k_average(data, similarity_matrix, k=5):
    """Fill missing ratings in place with the mean rating of the k most similar users.

    For every NaN in the rating columns (all columns except the first and the
    last), the users most similar to the affected user are looked up in
    ``similarity_matrix`` and the NaN is replaced by the mean of their ratings
    for that category. A message is printed for every filled cell, followed by
    one line per neighbour that was used.

    Args:
        data: DataFrame that is modified in place. The first and last columns
            are skipped; every column in between is treated as a rating column.
        similarity_matrix: DataFrame of pairwise user similarities whose row and
            column labels are the labels of ``data.index``.
        k: Maximum number of neighbours to average over.

    Returns:
        None. ``data`` is updated in place.
    """
    rating_columns = data.columns[1:-1]

    # Locate the gaps once instead of probing every single cell with .loc
    missing_mask = data[rating_columns].isna()
    users_with_gaps = data.index[missing_mask.any(axis=1).to_numpy()]

    for user in users_with_gaps:
        for category in rating_columns[missing_mask.loc[user].to_numpy()]:
            # Remove the user itself by label. Relying on it being the first entry
            # after sorting breaks when another user also has similarity 1.0.
            candidates = similarity_matrix.loc[user].drop(labels=user, errors="ignore")

            # Only users that actually have a rating for this category can
            # contribute to the mean. The live frame is checked so that values
            # filled earlier in this run count as ratings.
            rated_users = data.index[data[category].notna().to_numpy()]
            candidates = candidates[candidates.index.isin(rated_users)]

            # Find the top k most similar users; the stable sort keeps tied
            # neighbours in their original order
            most_similar_users = candidates.sort_values(ascending=False, kind="mergesort").head(k)

            if most_similar_users.empty:
                # Nobody else has rated this category, so there is nothing to average
                print(f"Could not fill NaN for user {user}, category {category}: no rated neighbours")
                continue

            # Compute the average rating of the neighbours for the category
            avg_rating = data.loc[most_similar_users.index, category].mean()
            # Fill the NaN value with the average rating
            data.loc[user, category] = avg_rating
            print(f"Filled NaN for user {user}, category {category} with average rating {avg_rating:.2f}")

            # Print the similarity of each of the top k neighbours that were used
            for similar_user, similarity in most_similar_users.items():
                print(f"User {user} has similarity {similarity:.2f} with user {similar_user}")

# Impute the missing ratings in `data` (modified in place) from the 5 nearest users
fill_nans_with_top_k_average(data=data, similarity_matrix=similarity_df, k=5)

# Show the frame after imputation
print("Updated data:\n", data)


Filled NaN for user 1347, category CULTURAL CENTERS with average rating 1.11
User 1347 has similarity 0.99 with user 1336
User 1347 has similarity 0.98 with user 2283
User 1347 has similarity 0.98 with user 4083
User 1347 has similarity 0.97 with user 1340
User 1347 has similarity 0.97 with user 1375
Filled NaN for user 2712, category NATIONAL PARKS with average rating 0.90
User 2712 has similarity 0.94 with user 2732
User 2712 has similarity 0.94 with user 2703
User 2712 has similarity 0.94 with user 2716
User 2712 has similarity 0.94 with user 5349
User 2712 has similarity 0.94 with user 2725
Updated data:
            User  ART CENTRES AND GALLERIES  FORTS  MONUMENTS  MUSEUMS  \
0        User 1                       0.00   0.00       3.63     3.65   
1        User 2                       0.00   0.00       3.63     3.65   
2        User 3                       0.00   0.00       3.63     3.63   
3        User 4                       0.00   0.50       3.63     3.63   
4        User 5   

In [39]:
# Count the NaN entries per column again, now that the imputation step has run
print("Missing values in each column:", data.isnull().sum(), sep="\n")

Missing values in each column:
User                         0
ART CENTRES AND GALLERIES    0
FORTS                        0
MONUMENTS                    0
MUSEUMS                      0
PALACES                      0
HANDICRAFT CENTERS           0
SCIENCE CENTRES              0
HISTORICAL PLACES            0
THEATRES                     0
LANDMARKS                    0
HORSE STABLES                0
NATIONAL PARKS               0
WATERFALLS                   0
MALLS                        0
BEACHES                      0
SPRINGS                      0
WILDLIFE SANCTUARIES         0
RESTAURANTS                  0
SOCIAL CLUBS                 0
INTERTAINMENTS               0
ZOOS                         0
POOLS                        0
GYMS                         0
CULTURAL CENTERS             0
location                     0
dtype: int64
