# In [None]:
import pandas as pd
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt

# Load the places dataset (the CSV is decoded as latin-1).
df = pd.read_csv('Placestestf.csv', encoding='latin-1')

# Fail fast if any column used later in this script is absent.
required_columns = ['City', 'Place_Name', 'latitude', 'longitude', 'Ratings', 'votes', 'Categories', 'Place_desc']
for col in required_columns:
    if col not in df.columns:
        raise ValueError(f"Missing required column: {col}")

# Drop rows with missing coordinates; they cannot be clustered by location.
df = df.dropna(subset=['latitude', 'longitude'])

# Treat missing ratings/votes as 0. The frame is filled and rebound in one
# step rather than calling fillna(inplace=True) on a column selection, which
# is chained assignment: it warns on recent pandas and does not modify the
# frame at all when copy-on-write is enabled.
df = df.fillna({'Ratings': 0, 'votes': 0})

# Function to calculate the weight based on ratings and votes
def calculate_weight(row):
    """Combine a place's rating and vote count into a single weight.

    ``row`` must be indexable by ``'Ratings'`` and ``'votes'``. Votes are
    scaled down by a factor of 1000 before use.

    Returns:
        The mean of the rating and the scaled votes when both are positive,
        whichever one is positive when only one is, and 0 when neither is.
    """
    rating = row['Ratings']
    rated = rating > 0
    votes = row['votes']
    voted = votes > 0

    if not voted:
        # Only the rating can contribute, if it is present at all.
        return rating if rated else 0

    scaled_votes = votes / 1000
    if rated:
        return (rating + scaled_votes) / 2  # average of the two signals
    return scaled_votes

# Per-row weight derived from ratings and votes.
df['Weight'] = df.apply(calculate_weight, axis=1)

# Min-max normalization to scale weights between 2.5 and 3.5
SCORE_MIN, SCORE_MAX = 2.5, 3.5
min_weight = df['Weight'].min()
max_weight = df['Weight'].max()
weight_range = max_weight - min_weight

if weight_range > 0:
    df['Score'] = SCORE_MIN + (df['Weight'] - min_weight) * (SCORE_MAX - SCORE_MIN) / weight_range
else:
    # No spread between the weights (e.g. every place has the same weight):
    # dividing by the zero range would turn every score into NaN, so give all
    # places the midpoint of the target interval instead.
    df['Score'] = (SCORE_MIN + SCORE_MAX) / 2

# Function to match categories
def match_categories(categories_string, selected_categories):
    """Return True if any selected category occurs in a place's categories.

    Args:
        categories_string: Comma-separated categories of one place, or a
            missing value (``None``/NaN).
        selected_categories: Iterable of category names to look for.

    Returns:
        True when at least one non-blank selected category is a
        case-insensitive substring of one of the place's categories
        (surrounding whitespace ignored); False otherwise, including when
        ``categories_string`` is missing or nothing usable was selected.
    """
    if pd.isna(categories_string):
        return False

    # Normalise the selection once rather than once per place category, and
    # drop blank entries: '' is a substring of every string, so a stray empty
    # selection would otherwise match every place.
    wanted = [cat.strip().lower() for cat in selected_categories]
    wanted = [cat for cat in wanted if cat]
    if not wanted:
        return False

    place_categories = [cat.strip().lower() for cat in categories_string.split(',')]
    return any(
        needle in place_category
        for place_category in place_categories
        for needle in wanted
    )

# Function to recommend places
def recommend_places(city, categories, num_days, places_per_day):
    """Build a day-by-day itinerary of places in ``city``.

    Places of the city are ordered by ``Score`` (highest first), with the
    places matching ``categories`` listed ahead of the rest. They are then
    clustered on latitude/longitude with K-Means, the clusters are plotted,
    and each day is filled by visiting the clusters in label order and taking
    the first remaining places until the day holds ``places_per_day`` entries.

    Args:
        city: Value compared for equality against the ``City`` column of the
            module-level ``df``.
        categories: Category names forwarded to ``match_categories``.
        num_days: Number of daily lists to produce.
        places_per_day: Maximum number of places in each daily list.

    Returns:
        A list with one entry per day; each entry is a list of row dicts.
        Days may contain fewer than ``places_per_day`` places (or none) once
        the city's places are exhausted. A place appears at most once.
    """
    max_clusters = 5  # Upper bound on clusters; adjust based on your requirements

    # Filter places for the specified city, best score first
    city_places = df[df['City'] == city].sort_values(by=['Score'], ascending=False)
    if city_places.empty:
        # Nothing to cluster or recommend for this city.
        return [[] for _ in range(num_days)]

    # Evaluate the category match once and reuse it for both groups. Missing
    # categories are passed through untouched so match_categories can reject
    # them; str() would turn NaN into the literal text 'nan'.
    is_primary = city_places['Categories'].apply(
        lambda value: match_categories(value if pd.isna(value) else str(value), categories)
    ).astype(bool)

    # Matching places first, then everything else, one row per place name
    combined_places = pd.concat(
        [city_places[is_primary], city_places[~is_primary]]
    ).drop_duplicates(subset=['Place_Name'])

    # Apply K-Means clustering. KMeans rejects more clusters than samples, so
    # cap the cluster count for cities with only a handful of places.
    k = min(max_clusters, len(combined_places))
    kmeans = KMeans(n_clusters=k, random_state=42)
    combined_places['Cluster'] = kmeans.fit_predict(combined_places[['latitude', 'longitude']])

    # Plot the clusters (optional)
    plt.figure(figsize=(12, 8))
    plt.scatter(combined_places['longitude'], combined_places['latitude'], c=combined_places['Cluster'], cmap='viridis', marker='o')
    plt.colorbar(label='Cluster')
    plt.xlabel('Longitude')
    plt.ylabel('Latitude')
    plt.title(f'K-Means Clustering of Places in {city}')
    plt.show()

    # Create itinerary
    itinerary = []
    remaining_places = combined_places
    for _ in range(num_days):
        daily_itinerary = []
        for _, cluster_data in remaining_places.groupby('Cluster'):
            free_slots = places_per_day - len(daily_itinerary)
            if free_slots <= 0:
                break
            # Take only as many places as the day can still hold, so that no
            # place is marked as used without actually being scheduled.
            picks = cluster_data.head(free_slots)
            daily_itinerary.extend(picks.to_dict('records'))
        itinerary.append(daily_itinerary)

        # Drop the scheduled places to avoid duplicates in subsequent days
        scheduled_names = {place['Place_Name'] for place in daily_itinerary}
        remaining_places = remaining_places[~remaining_places['Place_Name'].isin(scheduled_names)]

    return itinerary

# Take user inputs
def _read_positive_int(prompt):
    """Prompt repeatedly until the user enters a whole number greater than zero."""
    while True:
        raw = input(prompt)
        try:
            value = int(raw)
        except ValueError:
            print("Please enter a whole number.")
            continue
        if value > 0:
            return value
        print("Please enter a number greater than zero.")


def _count_matching_places(city_name, categories):
    """Count the rows of ``df`` in ``city_name`` that match any of ``categories``."""
    in_city = df['City'] == city_name
    # Missing categories are passed through untouched so match_categories can
    # reject them; str() would turn NaN into the literal text 'nan'.
    matches = df['Categories'].apply(
        lambda x: match_categories(x if pd.isna(x) else str(x), categories)
    ).astype(bool)
    return df[in_city & matches].shape[0]


city = input("Enter the city name: ")

# Display available categories in the city (unique, blank entries ignored)
city_category_strings = df[df['City'] == city]['Categories'].dropna().unique()
available_categories = sorted({
    cat.strip()
    for entry in city_category_strings
    for cat in entry.split(',')
    if cat.strip()
})
if not available_categories:
    # With nothing to choose from, the category prompt below could never be
    # satisfied and would loop forever.
    raise SystemExit(f"No categorised places found for city '{city}'.")
print("Available categories in the city:", ', '.join(available_categories))

# Ensure user selects valid categories
selected_categories = []
while not selected_categories:
    categories = input("Enter categories (comma-separated) from the above list: ").split(',')
    selected_categories = [cat.strip() for cat in categories if cat.strip() in available_categories]
    if not selected_categories:
        print("Please select valid categories from the list provided.")

# A positive value is required: it is used as a divisor just below.
places_per_day = _read_positive_int("Enter the number of places per day: ")

# Calculate the maximum number of days
total_places = _count_matching_places(city, selected_categories)
max_days = -(-total_places // places_per_day)  # Ceiling division
print(f"Based on {places_per_day} places per day, you can cover the places in a maximum of {max_days} days.")

num_days = _read_positive_int(f"Enter the number of days (max {max_days}): ")
if num_days > max_days:
    print(f"You requested more days than the recommended maximum of {max_days} days.")
    print("You will be provided with the top places based on weight for the additional days.")

    # Ask user for additional categories only once
    additional_categories = input("Enter additional categories (comma-separated) from the available list or press Enter to skip: ").split(',')
    additional_categories = [cat.strip() for cat in additional_categories if cat.strip() in available_categories]

    if additional_categories:
        selected_categories.extend(cat for cat in additional_categories if cat not in selected_categories)
        total_places = _count_matching_places(city, selected_categories)
        max_days = -(-total_places // places_per_day)  # Ceiling division
        print(f"Based on {places_per_day} places per day, you can now cover the places in a maximum of {max_days} days.")

    # Adjust num_days if additional categories increased the max_days
    num_days = _read_positive_int(f"Enter the number of days (max {max_days}): ")

# Generate the initial itinerary
itinerary = recommend_places(city, selected_categories, min(num_days, max_days), places_per_day)

# If the user requested more days, add top weighted places for the extra days
if num_days > max_days:
    additional_days = num_days - max_days
    top_places = df[df['City'] == city].sort_values(by='Weight', ascending=False).drop_duplicates(subset=['Place_Name'])
    additional_itinerary = []
    # Names already scheduled, so the extra days do not repeat them
    selected_indices = {place['Place_Name'] for day in itinerary for place in day}
    for _ in range(additional_days):
        daily_itinerary = top_places[~top_places['Place_Name'].isin(selected_indices)].head(places_per_day).to_dict('records')
        additional_itinerary.append(daily_itinerary)
        selected_indices.update(place['Place_Name'] for place in daily_itinerary)
    itinerary.extend(additional_itinerary)

# Display the itinerary
for day, daily_itinerary in enumerate(itinerary, start=1):
    print(f"Day {day}:")
    for place in daily_itinerary:
        print(f"  {place['Place_Name']} - {place['Categories']} - Rating: {place['Ratings']} - Votes: {place['votes']} - Score: {place['Score']} - Latitude: {place['latitude']} - Longitude: {place['longitude']} - Description: {place['Place_desc']}")
