## Initial Model

In [5]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import MinMaxScaler
import joblib


# Load the two versions of the places dataset.
df0 = pd.read_csv('places_v7.csv')
df1 = pd.read_csv('places_v8.csv')

# Fail fast if either file lacks a column used by the matching or scoring code.
required_columns = [
    'categories',
    'name',
    'rating',
    'user_ratings_total',
    'positive_words',
    'negative_words',
]
for df in (df0, df1):
    for col in required_columns:
        if col in df.columns:
            continue
        raise ValueError(f"Column '{col}' not found in the CSV file.")

# Inputs are the category strings; targets are the place names.
X0, y0 = df0['categories'], df0['name']
X1, y1 = df1['categories'], df1['name']

# Each dataset gets its own TF-IDF vectorizer, fitted on that dataset's
# category text only.
tfidf_vectorizer0 = TfidfVectorizer()
X0_tfidf = tfidf_vectorizer0.fit_transform(X0)

tfidf_vectorizer1 = TfidfVectorizer()
X1_tfidf = tfidf_vectorizer1.fit_transform(X1)

# Cosine-distance nearest-neighbour models over the TF-IDF vectors
# (fit() returns the estimator itself, so construction and fitting are chained).
knn_classifier0 = KNeighborsClassifier(n_neighbors=10, metric='cosine').fit(X0_tfidf, y0)
knn_classifier1 = KNeighborsClassifier(n_neighbors=10, metric='cosine').fit(X1_tfidf, y1)

# Function to calculate composite score
def calculate_score(row):
    """Compute a composite quality score for one place.

    Parameters
    ----------
    row : mapping
        Must provide the keys 'rating', 'user_ratings_total',
        'positive_words' and 'negative_words' (e.g. a DataFrame row).

    Returns
    -------
    float
        Weighted sum of four terms: the rating (0.2), the log of the number
        of ratings (0.2), the share of positive words (0.5) and the log of
        the total positive + negative word count (0.1).
    """
    rating, n_ratings, n_pos, n_neg = (
        row[key]
        for key in ('rating', 'user_ratings_total', 'positive_words', 'negative_words')
    )
    n_words = n_pos + n_neg

    rating_term = 0.2 * rating
    # log1p dampens the influence of places with very many ratings.
    popularity_term = 0.2 * np.log1p(n_ratings)
    # The +1 keeps the denominator non-zero when both word counts are 0.
    sentiment_term = 0.5 * (n_pos / (n_words + 1))
    volume_term = 0.1 * np.log1p(n_words)

    return rating_term + popularity_term + sentiment_term + volume_term

# Attach a composite score to every place in each dataset (row-wise apply).
df0['score'] = df0.apply(calculate_score, axis='columns')
df1['score'] = df1.apply(calculate_score, axis='columns')

# Min-max rescale the scores (default range [0, 1]). Each dataset has its own
# scaler, so a normalized score is relative to the other places in the same file.
scaler0 = MinMaxScaler()
df0['normalized_score'] = scaler0.fit_transform(df0[['score']])
scaler1 = MinMaxScaler()
df1['normalized_score'] = scaler1.fit_transform(df1[['score']])

# Comma-separated categories to look up.
input_categories = "wildlife, theater, safaris"

# One whitespace-trimmed category per comma-separated entry.
category_list = list(map(str.strip, input_categories.split(',')))

# Function to get predictions for a single category, verify them, and rank by score
def get_verified_top_2_predictions(category, df, classifier, tfidf_vectorizer,
                                   n_neighbors=10, top_n=2):
    """Return the best-scoring neighbours whose categories contain ``category``.

    The query string is vectorized, its nearest neighbours are looked up in
    ``classifier``, neighbours whose 'categories' text does not contain the
    query (case-insensitive substring match) are discarded, and the remainder
    is ranked by 'normalized_score'.

    Parameters
    ----------
    category : str
        Category to search for.
    df : pandas.DataFrame
        Frame indexed positionally (``iloc``) by the neighbour indices, so its
        row order must match the samples ``classifier`` was fitted on. Needs
        the columns 'categories', 'name', 'rating', 'user_ratings_total' and
        'normalized_score'.
    classifier : object
        Fitted neighbour model exposing ``kneighbors``.
    tfidf_vectorizer : object
        Fitted vectorizer exposing ``transform``.
    n_neighbors : int, optional
        Number of neighbours to request (default 10, the original value).
    top_n : int, optional
        Maximum number of places to return (default 2, the original value).

    Returns
    -------
    list of dict
        Up to ``top_n`` records with keys 'name', 'rating',
        'user_ratings_total' and 'normalized_score', best score first.
        Empty list when no neighbour passes the category check.
    """
    # Encode the query with the same vectorizer the classifier was fitted with.
    category_tfidf = tfidf_vectorizer.transform([category])

    # kneighbors() raises ValueError when asked for more neighbours than there
    # are fitted samples, so cap the request for small datasets.
    n_fitted = getattr(classifier, 'n_samples_fit_', len(df))
    k = min(n_neighbors, n_fitted)
    if k < 1:
        return []

    neighbor_positions = classifier.kneighbors(
        category_tfidf, n_neighbors=k, return_distance=False
    )[0]
    candidates = df.iloc[neighbor_positions]

    # Keep only neighbours whose category text really mentions the query.
    # Non-string cells (e.g. NaN from a blank CSV field) can never match and
    # are skipped instead of raising AttributeError on .lower().
    needle = category.lower()
    keep = [
        isinstance(text, str) and needle in text.lower()
        for text in candidates['categories']
    ]
    verified = candidates.loc[keep]
    if verified.empty:
        return []

    best = verified.sort_values('normalized_score', ascending=False).head(top_n)
    return best[['name', 'rating', 'user_ratings_total', 'normalized_score']].to_dict('records')

# Place names collected across all requested categories (at most 2 per category).
final_places_list = []

for category in category_list:
    # Query each dataset with its own classifier / vectorizer pair.
    verified_places_0 = get_verified_top_2_predictions(category, df0, knn_classifier0, tfidf_vectorizer0)
    verified_places_1 = get_verified_top_2_predictions(category, df1, knn_classifier1, tfidf_vectorizer1)

    # df0 matches come first; df1 matches only fill whatever remains of the
    # 2 slots (the combined list is truncated, not re-sorted).
    final_places = (verified_places_0 + verified_places_1)[:2]

    final_places_list.extend(place['name'] for place in final_places)

# Show at most the first 5 collected names.
print("Final List of Top 5 Places:")
print(final_places_list[:5])

Final List of Top 5 Places:
['Dehiwala Zoological Gardens', 'Udawatta Kele Sanctuary', 'Nelum Pokuna Theatre', 'Nelung Arts Centre', 'Ridiyagama Safari Park']


In [None]:
# Persist the fitted neighbour models and their TF-IDF vectorizers to disk so
# they can be reloaded with joblib.load() without refitting.
# Only these four objects are written: the MinMaxScaler instances and the
# computed 'score' / 'normalized_score' columns are not saved.
joblib.dump(knn_classifier0, 'knn_classifier0.pkl')
joblib.dump(knn_classifier1, 'knn_classifier1.pkl')
joblib.dump(tfidf_vectorizer0, 'tfidf_vectorizer0.pkl')
joblib.dump(tfidf_vectorizer1, 'tfidf_vectorizer1.pkl')

## Inference

In [4]:
import joblib
import pandas as pd
import numpy as np


# Load the saved neighbour models and TF-IDF vectorizers.
# NOTE: joblib.load() unpickles its input and can execute arbitrary code, so
# only load .pkl files that come from a trusted source.
knn_classifier0 = joblib.load('knn_classifier0.pkl')
knn_classifier1 = joblib.load('knn_classifier1.pkl')
tfidf_vectorizer0 = joblib.load('tfidf_vectorizer0.pkl')
tfidf_vectorizer1 = joblib.load('tfidf_vectorizer1.pkl')

# Reload the datasets (for reference and category validation).
# Neighbour indices returned by the models are used as row positions in these
# frames (df.iloc), so the CSV files must have the same rows, in the same
# order, as when the models were fitted.
df0 = pd.read_csv('places_v7.csv')
df1 = pd.read_csv('places_v8.csv')

# Function to calculate composite score (reused for inference)
def calculate_score(row):
    """Compute the composite quality score of a single place.

    ``row`` must provide 'rating', 'user_ratings_total', 'positive_words'
    and 'negative_words'. The result is the weighted sum

        0.2 * rating
      + 0.2 * log1p(user_ratings_total)
      + 0.5 * positive / (positive + negative + 1)
      + 0.1 * log1p(positive + negative)

    which uses the same weights as the scoring function in the training cell.
    """
    stars = row['rating']
    votes = row['user_ratings_total']
    good = row['positive_words']
    bad = row['negative_words']

    # (weight, value) pairs, listed in the order they are summed.
    weighted_terms = (
        (0.2, stars),
        (0.2, np.log1p(votes)),          # log dampens very high rating counts
        (0.5, good / (good + bad + 1)),  # positive share; +1 avoids 0/0
        (0.1, np.log1p(good + bad)),     # overall amount of sentiment words
    )

    (w_a, v_a), (w_b, v_b), (w_c, v_c), (w_d, v_d) = weighted_terms
    return w_a * v_a + w_b * v_b + w_c * v_c + w_d * v_d

# Recompute the composite score for every row of both datasets. Only the raw
# 'score' column is produced here; no normalized score is computed in this cell.
df0['score'] = df0.apply(calculate_score, axis='columns')
df1['score'] = df1.apply(calculate_score, axis='columns')

# Comma-separated categories to look up at inference time.
input_categories = "wildlife, theater, safaris"

# One whitespace-trimmed category per comma-separated entry.
category_list = list(map(str.strip, input_categories.split(',')))

# Function to get predictions for a single category, verify them, and rank by score
def get_verified_top_2_predictions(category, df, classifier, tfidf_vectorizer,
                                   n_neighbors=10, top_n=2):
    """Return the best-scoring neighbours whose categories contain ``category``.

    The query string is vectorized, its nearest neighbours are looked up in
    ``classifier``, neighbours whose 'categories' text does not contain the
    query (case-insensitive substring match) are discarded, and the remainder
    is ranked by the raw 'score' column.

    Parameters
    ----------
    category : str
        Category to search for.
    df : pandas.DataFrame
        Frame indexed positionally (``iloc``) by the neighbour indices, so its
        row order must match the samples ``classifier`` was fitted on. Needs
        the columns 'categories', 'name', 'rating', 'user_ratings_total' and
        'score'.
    classifier : object
        Fitted neighbour model exposing ``kneighbors``.
    tfidf_vectorizer : object
        Fitted vectorizer exposing ``transform``.
    n_neighbors : int, optional
        Number of neighbours to request (default 10, the original value).
    top_n : int, optional
        Maximum number of places to return (default 2, the original value).

    Returns
    -------
    list of dict
        Up to ``top_n`` records with keys 'name', 'rating',
        'user_ratings_total' and 'score', best score first. Empty list when
        no neighbour passes the category check.
    """
    # Encode the query with the same vectorizer the classifier was fitted with.
    category_tfidf = tfidf_vectorizer.transform([category])

    # kneighbors() raises ValueError when asked for more neighbours than there
    # are fitted samples, so cap the request for small datasets.
    n_fitted = getattr(classifier, 'n_samples_fit_', len(df))
    k = min(n_neighbors, n_fitted)
    if k < 1:
        return []

    neighbor_positions = classifier.kneighbors(
        category_tfidf, n_neighbors=k, return_distance=False
    )[0]
    candidates = df.iloc[neighbor_positions]

    # Keep only neighbours whose category text really mentions the query.
    # Non-string cells (e.g. NaN from a blank CSV field) can never match and
    # are skipped instead of raising AttributeError on .lower().
    needle = category.lower()
    keep = [
        isinstance(text, str) and needle in text.lower()
        for text in candidates['categories']
    ]
    verified = candidates.loc[keep]
    if verified.empty:
        return []

    # Rank by the raw composite score (this cell computes no normalized score).
    best = verified.sort_values('score', ascending=False).head(top_n)
    return best[['name', 'rating', 'user_ratings_total', 'score']].to_dict('records')

# Place names collected across all requested categories (at most 2 per category).
final_places_list = []

for category in category_list:
    # Query each dataset with its own loaded classifier / vectorizer pair.
    verified_places_0 = get_verified_top_2_predictions(category, df0, knn_classifier0, tfidf_vectorizer0)
    verified_places_1 = get_verified_top_2_predictions(category, df1, knn_classifier1, tfidf_vectorizer1)

    # df0 matches come first; df1 matches only fill whatever remains of the
    # 2 slots (the combined list is truncated, not re-sorted).
    final_places = (verified_places_0 + verified_places_1)[:2]

    final_places_list.extend(place['name'] for place in final_places)

# Show at most the first 5 collected names.
print("Final List of Top 5 Places:")
print(final_places_list[:5])


Final List of Top 5 Places:
['Dehiwala Zoological Gardens', 'Udawatta Kele Sanctuary', 'Nelum Pokuna Theatre', 'Nelung Arts Centre', 'Ridiyagama Safari Park']
