In [5]:
import pandas as pd

# Load the data from CSV file
df = pd.read_csv('dataset.csv')

# Combine Category and Sub-Category into a new column
df['Category-Subcategory'] = df['Category'] + '-' + df['Sub-Category']

# Calculate the frequency of each Category-Subcategory
frequency = df['Category-Subcategory'].value_counts()

# Add a new column 'weights' based on the frequency
df['weights'] = df['Category-Subcategory'].apply(lambda x: 1 / frequency[x])

# Drop the original Category and Sub-Category columns
df = df.drop(columns=['Category', 'Sub-Category'])

# Save the updated DataFrame to a new CSV file
df.to_csv('data_combined.csv', index=False)

print("Data with combined Category-Subcategory and weights saved to 'data_combined.csv'")


Data with combined Category-Subcategory and weights saved to 'data_combined.csv'


In [6]:
import os
import pandas as pd

# Load the dataset from a CSV file
df = pd.read_csv('data_combined.csv')

# Define the range for column 'n'
n = 24

# Create a folder to store CSV files
folder_name = 'filtered_data_files'
os.makedirs(folder_name, exist_ok=True)

# Iterate over each row and filter data
for row in range(0, n+1):
    for column in range(0, n+1):
        filtered_df = df[(df['Row'] == row) & (df['Column'] == column)]
        if not filtered_df.empty:
            # Create a filename based on Row and Column
            filename = f'Data_{row}_{column}.csv'
            filepath = os.path.join(folder_name, filename)
            
            # Save filtered data to CSV file
            filtered_df.to_csv(filepath, index=False)
            print(f"Saved {filename}")

print("Filtered data saved to CSV files in folder 'filtered_data_files'")


Saved Data_0_0.csv
Saved Data_0_1.csv
Saved Data_0_2.csv
Saved Data_0_3.csv
Saved Data_0_4.csv
Saved Data_0_5.csv
Saved Data_0_6.csv
Saved Data_0_7.csv
Saved Data_0_8.csv
Saved Data_0_9.csv
Saved Data_0_10.csv
Saved Data_0_11.csv
Saved Data_0_12.csv
Saved Data_0_13.csv
Saved Data_0_14.csv
Saved Data_0_15.csv
Saved Data_0_16.csv
Saved Data_0_17.csv
Saved Data_0_18.csv
Saved Data_0_19.csv
Saved Data_0_20.csv
Saved Data_0_21.csv
Saved Data_0_22.csv
Saved Data_0_23.csv
Saved Data_0_24.csv
Saved Data_1_0.csv
Saved Data_1_1.csv
Saved Data_1_4.csv
Saved Data_1_5.csv
Saved Data_1_6.csv
Saved Data_1_7.csv
Saved Data_1_8.csv
Saved Data_1_9.csv
Saved Data_1_10.csv
Saved Data_1_11.csv
Saved Data_1_12.csv
Saved Data_1_13.csv
Saved Data_1_15.csv
Saved Data_1_16.csv
Saved Data_1_17.csv
Saved Data_1_18.csv
Saved Data_1_19.csv
Saved Data_1_20.csv
Saved Data_1_21.csv
Saved Data_1_22.csv
Saved Data_1_23.csv
Saved Data_2_0.csv
Saved Data_2_1.csv
Saved Data_2_2.csv
Saved Data_2_3.csv
Saved Data_2_4.csv
Sav

In [8]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np

# Load the dataset
df = pd.read_csv('filtered_data_files/Data_0_0.csv')

# Calculate the frequency of each Category-Subcategory
frequency = df['Category-Subcategory'].value_counts()

# Add a new column 'weights' based on the frequency
df['weights'] = df['Category-Subcategory'].apply(lambda x: 1 / frequency[x])


# Define the custom distance metric
def custom_distance(point1, point2, w=0.5, r=0.5):
    # point1 and point2 are arrays: [x, y, weight, rating]
    distance = np.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)
    weight_factor = (point1[2] + point2[2]) / 2  # Average weight
    rating_factor = (point1[3] + point2[3]) / 2  # Average rating
    return ((w * r) + (weight_factor * distance * (1 / rating_factor)) / (r + (weight_factor * distance)))

# Extract relevant columns for KNN
data = df[['Latitude', 'Longitude', 'weights', 'FinalRating']].values  # Extract as numpy array

# Fit the KNN model
knn = NearestNeighbors(metric=custom_distance)
knn.fit(data)

# Function to find k distinct features based on the input feature
def find_k_distinct_features(input_feature, k=5):
    # Get the corresponding row(s) for the given input feature
    query_indices = df.index[df['Category-Subcategory'] == input_feature].tolist()
    if not query_indices:
        return None
    
    query_point = data[query_indices[0]]  # Use the first matching row for query
    distances, indices = knn.kneighbors([query_point], n_neighbors=len(df))
    
    # Get the features and distances of the nearest neighbors
    nearest_features = df.iloc[indices[0]]['Category-Subcategory'].values
    nearest_distances = distances[0]
    
    # Select k distinct features and their scores
    distinct_features = {}
    seen_features = set()
    for feature, distance in zip(nearest_features, nearest_distances):
        if feature not in seen_features:
            distinct_features[feature] = 1 - distance
            seen_features.add(feature)
        if len(distinct_features) == k:
            break
    
    return distinct_features

# Example usage
input_feature = 'Restaurant-FastFood'
k = 3
distinct_features_with_scores = find_k_distinct_features(input_feature, k)
print(f"The {k} distinct features nearest to '{input_feature}' with their scores are: {distinct_features_with_scores}")


The 3 distinct features nearest to 'Restaurant-FastFood' with their scores are: {'Restaurant-FastFood': 0.75, 'Fitness&Wellness-Gym': 0.7499242417846601, 'EducationalInstitute-School': 0.749723696020214}


In [10]:
import pandas as pd
from sklearn.neighbors import NearestNeighbors
import numpy as np
import glob

# Define the custom distance metric
def custom_distance(point1, point2, w=0.5, r=0.5):
    distance = np.sqrt((point1[0] - point2[0])**2 + (point1[1] - point2[1])**2)
    weight_factor = (point1[2] + point2[2]) / 2  # Average weight
    rating_factor = (point1[3] + point2[3]) / 2  # Average rating
    return ((w * r) + (weight_factor * distance * (1 / rating_factor)) / (r + (weight_factor * distance)))

# Function to find k distinct features based on the input feature
def find_k_distinct_features(df, knn, input_feature, k=5):
    data = df[['Latitude', 'Longitude', 'weights', 'FinalRating']].values   # Extract relevant columns
    query_indices = df.index[df['Category-Subcategory'] == input_feature].tolist()
    if not query_indices:
        return None
    
    query_point = data[query_indices[0]]  # Use the first matching row for query
    distances, indices = knn.kneighbors([query_point], n_neighbors=len(df))
    
    # Get the features and distances of the nearest neighbors
    nearest_features = df.iloc[indices[0]]['Category-Subcategory'].values
    nearest_distances = distances[0]
    
    # Select k distinct features and their scores
    distinct_features = {}
    seen_features = set()
    for feature, distance in zip(nearest_features, nearest_distances):
        if feature not in seen_features:
            distinct_features[feature] = 1 - distance  # Score is 1 - distance
            seen_features.add(feature)
        if len(distinct_features) == k:
            break
    
    return distinct_features    

# Load multiple datasets and find nearest features
all_datas = glob.glob('filtered_data_files/Data*.csv')
datasets = []

# Iterate through each dataset
for all_data in all_datas:
    df = pd.read_csv(all_data)
    datasets.append(df)

# Assume ratings are extracted from each dataset
ratings = [df['Population'].unique()[0] for df in datasets]  # List of rating values corresponding to each dataset
input_feature = 'Restaurant-FastFood'
k = 5

results = []

for dataset, rating in zip(datasets, ratings):
    # Extract relevant columns for KNN
    data = dataset[['Latitude', 'Longitude', 'weights', 'FinalRating']].values 
    
    # Fit the KNN model
    knn = NearestNeighbors(metric=custom_distance)
    knn.fit(data)
    
    # Find k distinct features
    distinct_features = find_k_distinct_features(dataset, knn, input_feature, k)
    
    if distinct_features:
        for feature, score in distinct_features.items():
            results.append({'Category-Subcategory': feature, 'score': score, 'Population': rating})

# Convert results to DataFrame and save to CSV
results_df = pd.DataFrame(results)
results_df.to_csv('final_results.csv', index=False)

print(f"Results saved to 'final_results.csv'")


Results saved to 'final_results.csv'


In [11]:
import pandas as pd

# Load the CSV file into a DataFrame
df = pd.read_csv('final_results.csv')

# Filter rows where col1 equals 'k'
df_filtered = df[df['Category-Subcategory'] != 'Restaurant-FastFood']

# Save the filtered DataFrame back to a CSV file
df_filtered.to_csv('final_results.csv', index=False)


In [16]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder

# Load the data from CSV file
df = pd.read_csv('final_results.csv')

# One-hot encode the 'Category-Subcategory' column
encoder = OneHotEncoder(sparse_output=False)
encoded_features = encoder.fit_transform(df[['Category-Subcategory']])
encoded_feature_names = encoder.get_feature_names_out(['Category-Subcategory'])
df_encoded = pd.DataFrame(encoded_features, columns=encoded_feature_names)

# Combine the encoded features with the original dataframe
df_combined = pd.concat([df.drop(columns=['Category-Subcategory']), df_encoded], axis=1)

# Ensure 'Population' is included in the feature set
# Separate features and target
X = df_combined.drop(columns=["score"])
y = df_combined["score"]

# Train a Random Forest Regressor
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X, y)

# Extract feature importances
feature_importances = model.feature_importances_
features = X.columns

# Create a DataFrame for feature importances
importance_df = pd.DataFrame({
    "feature": features,
    "importance": feature_importances
}).sort_values(by="importance", ascending=False)

# Filter to get the top 4 features from 'Category-Subcategory'
top_features = importance_df[importance_df['feature'].str.startswith('Category-Subcategory')].head(4)

# Remove 'Category-Subcategory_' prefix
top_features['Category-Subcategory'] = top_features['feature'].str.replace('Category-Subcategory_', '')

# Select relevant columns and display
top_features = top_features[['Category-Subcategory', 'importance']]
print(top_features)


                Category-Subcategory  importance
18       Fitness&Wellness-YogaStudio    0.220170
20    GovernmentBuilding-FireStation    0.147327
5   EducationalInstitute-MusicSchool    0.063186
6        EducationalInstitute-School    0.058920


In [17]:
import pandas as pd

# Example of top_features DataFrame, replace this with the actual top_features DataFrame you have


# Load the data_combined.csv file
data_combined = pd.read_csv('data_combined.csv')

# Extract Latitude and Longitude for the rows containing Category-Subcategory from top_features
top_categories = top_features['Category-Subcategory']
filtered_data = data_combined[data_combined['Category-Subcategory'].isin(top_categories)]

# Merge with top_features to include importance
final_data = filtered_data.merge(top_features, on='Category-Subcategory')

# Save the result to model.csv
final_data.to_csv('model.csv', index=False)

print(final_data)


      Latitude  Longitude  Row  Column  Population  FinalRating  \
0    30.714827  76.738702   10       1      8454.6     2.750000   
1    30.714827  76.738702   10       1      8454.6     2.750000   
2    30.714827  76.738702   10       1      8454.6     2.750000   
3    30.741681  76.747399   21       9      9005.0     3.920000   
4    30.741681  76.747399   21       9      9005.0     3.920000   
..         ...        ...  ...     ...         ...          ...   
819  30.741367  76.766978   24      24      2495.6     3.936000   
820  30.739010  76.761064   20      21      7887.6     3.500000   
821  30.740178  76.757908   20      18      8137.4     4.500000   
822  30.742323  76.758953   21      19      8039.4     4.307692   
823  30.732280  76.770713   24      24      2495.6     2.750000   

                 Category-Subcategory   weights  importance  
0      GovernmentBuilding-FireStation  0.047619    0.147327  
1      GovernmentBuilding-FireStation  0.047619    0.147327  
2      Go

In [18]:
import pandas as pd
from sklearn.ensemble import RandomForestRegressor

# Load the model.csv file containing Latitude, Longitude, and importance
model_data = pd.read_csv('model.csv')

# List of Category-Subcategory values
categories = model_data['Category-Subcategory'].unique()

# Dictionary to store predictions for each Category-Subcategory
predictions_dict = {}

# Train a separate model for each Category-Subcategory
for category in categories:
    # Filter data for the current Category-Subcategory
    category_data = model_data[model_data['Category-Subcategory'] == category]
    
    # Prepare features (Latitude, Longitude) and target (importance)
    X = category_data[['Latitude', 'Longitude']]
    y = category_data['importance']
    
    # Train a RandomForestRegressor model
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X, y)
    
    # Predict importance for the current Category-Subcategory
    predictions = model.predict(X)
    
    # Store predictions in the dictionary
    predictions_dict[category] = predictions

# Create a DataFrame to hold all predictions
predictions_df = pd.DataFrame()

for category, predictions in predictions_dict.items():
    category_data = model_data[model_data['Category-Subcategory'] == category]
    category_data['predictions'] = predictions
    predictions_df = pd.concat([predictions_df, category_data], ignore_index=True)

# Save predictions to a new CSV file
predictions_df.to_csv('predictions.csv', index=False)

print(predictions_df)


      Latitude  Longitude  Row  Column  Population  FinalRating  \
0    30.714827  76.738702   10       1      8454.6     2.750000   
1    30.714827  76.738702   10       1      8454.6     2.750000   
2    30.714827  76.738702   10       1      8454.6     2.750000   
3    30.741681  76.747399   21       9      9005.0     3.920000   
4    30.741681  76.747399   21       9      9005.0     3.920000   
..         ...        ...  ...     ...         ...          ...   
819  30.614551  76.878546   24      24      2495.6     2.750000   
820  30.739010  76.761064   20      21      7887.6     3.500000   
821  30.740178  76.757908   20      18      8137.4     4.500000   
822  30.742323  76.758953   21      19      8039.4     4.307692   
823  30.732280  76.770713   24      24      2495.6     2.750000   

                 Category-Subcategory   weights  importance  predictions  
0      GovernmentBuilding-FireStation  0.047619    0.147327     0.147327  
1      GovernmentBuilding-FireStation  0.0476

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data['predictions'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data['predictions'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  category_data['predictions'] = predictions
A value is trying to be set on a copy of a slice from a DataFrame.
Try using

In [19]:
import pandas as pd

# Load the dataset
df = pd.read_csv('model.csv')

# Group by Latitude and Longitude and calculate the weighted importance
df['WeightedImportance'] = df['FinalRating'] * df['importance']
grouped_df = df.groupby(['Latitude', 'Longitude']).agg({'WeightedImportance': 'sum'}).reset_index()

# Find the Latitude and Longitude with the highest weighted importance
best_location = grouped_df.loc[grouped_df['WeightedImportance'].idxmax()]

print(f"Best Latitude: {best_location['Latitude']}")
print(f"Best Longitude: {best_location['Longitude']}")


Best Latitude: 30.713844
Best Longitude: 76.753389


In [20]:
import pandas as pd

# Load the dataset
df = pd.read_csv('model.csv')

# Calculate Weighted Importance for each row
df['WeightedImportance'] = df['FinalRating'] * df['importance']

# Group by Latitude and Longitude and sum the weighted importance
grouped_df = df.groupby(['Latitude', 'Longitude']).agg({'WeightedImportance': 'sum'}).reset_index()

# DP table initialization
n = len(grouped_df)
dp = [0] * (n + 1)

# Compute the DP table
for i in range(1, n + 1):
    dp[i] = max(dp[i-1], dp[i-1] + grouped_df.loc[i-1, 'WeightedImportance'])

# Find the best location
max_importance = max(dp)
best_index = dp.index(max_importance)

best_location = grouped_df.loc[best_index - 1]

print(f"Best Latitude: {best_location['Latitude']}")
print(f"Best Longitude: {best_location['Longitude']}")
print(f"Maximum Weighted Importance: {max_importance}")


Best Latitude: 30.862174
Best Longitude: 76.662754
Maximum Weighted Importance: 402.1613451092763


In [24]:
import pandas as pd

# Load the dataset
df = pd.read_csv('model.csv')

# Calculate Weighted Importance for each row
df['WeightedImportance'] = df['FinalRating'] * df['importance']

# Filter the dataset to ensure the values fall within the specified range
latitude_min = 30.60805029690371
latitude_max = 30.790863413283007
longitude_min = 76.70726499899425
longitude_max = 76.7948349520892

filtered_df = df[(df['Latitude'] >= latitude_min) & (df['Latitude'] <= latitude_max) &
                 (df['Longitude'] >= longitude_min) & (df['Longitude'] <= longitude_max)]
print(filtered_df)
# Group by Latitude and Longitude and sum the weighted importance
grouped_df = filtered_df.groupby(['Latitude', 'Longitude']).agg({'WeightedImportance': 'sum'}).reset_index()

# DP table initialization
n = len(grouped_df)
dp = [0] * (n + 1)

# Store corresponding latitude and longitude
locations = [(0, 0)] * (n + 1)

# Compute the DP table
for i in range(1, n + 1):
    if dp[i-1] > dp[i-1] + grouped_df.loc[i-1, 'WeightedImportance']:
        dp[i] = dp[i-1]
        locations[i] = locations[i-1]
    else:
        dp[i] = dp[i-1] + grouped_df.loc[i-1, 'WeightedImportance']
        locations[i] = (grouped_df.loc[i-1, 'Latitude'], grouped_df.loc[i-1, 'Longitude'])

# Find the best location
max_importance = max(dp)
best_index = dp.index(max_importance)
best_location = locations[best_index]

print(f"Best Latitude: {best_location[0]}")
print(f"Best Longitude: {best_location[1]}")
print(f"Maximum Weighted Importance: {max_importance}")


      Latitude  Longitude  Row  Column  Population  FinalRating  \
0    30.714827  76.738702   10       1      8454.6     2.750000   
1    30.714827  76.738702   10       1      8454.6     2.750000   
2    30.714827  76.738702   10       1      8454.6     2.750000   
3    30.741681  76.747399   21       9      9005.0     3.920000   
4    30.741681  76.747399   21       9      9005.0     3.920000   
..         ...        ...  ...     ...         ...          ...   
819  30.741367  76.766978   24      24      2495.6     3.936000   
820  30.739010  76.761064   20      21      7887.6     3.500000   
821  30.740178  76.757908   20      18      8137.4     4.500000   
822  30.742323  76.758953   21      19      8039.4     4.307692   
823  30.732280  76.770713   24      24      2495.6     2.750000   

                 Category-Subcategory   weights  importance  \
0      GovernmentBuilding-FireStation  0.047619    0.147327   
1      GovernmentBuilding-FireStation  0.047619    0.147327   
2     

In [26]:
import pandas as pd

# Load the dataset
df = pd.read_csv('model.csv')

# Calculate Weighted Importance for each row
df['WeightedImportance'] = df['FinalRating'] * df['importance']

# Filter the dataset to ensure the values fall within the specified range
latitude_min = 30.60805029690371
latitude_max = 30.790863413283007
longitude_min = 76.70726499899425
longitude_max = 76.7948349520892

filtered_df = df[(df['Latitude'] >= latitude_min) & (df['Latitude'] <= latitude_max) &
                 (df['Longitude'] >= longitude_min) & (df['Longitude'] <= longitude_max)]

# Group by Latitude and Longitude and sum the weighted importance
grouped_df = filtered_df.groupby(['Latitude', 'Longitude']).agg({'WeightedImportance': 'sum'}).reset_index()

# DP table initialization
n = len(grouped_df)
dp = [0] * (n + 1)

# Store corresponding latitude and longitude
locations = [(0, 0)] * (n + 1)

# Compute the DP table
for i in range(1, n + 1):
    if dp[i-1] > dp[i-1] + grouped_df.loc[i-1, 'WeightedImportance']:
        dp[i] = dp[i-1]
        locations[i] = locations[i-1]
    else:
        dp[i] = dp[i-1] + grouped_df.loc[i-1, 'WeightedImportance']
        locations[i] = (grouped_df.loc[i-1, 'Latitude'], grouped_df.loc[i-1, 'Longitude'])

# Find the best location
max_importance = max(dp)
best_index = dp.index(max_importance)
best_location = locations[best_index]

print(f"Best Latitude: {best_location[0]}")
print(f"Best Longitude: {best_location[1]}")
print(f"Maximum Weighted Importance: {max_importance}")


Best Latitude: 30.75098
Best Longitude: 76.756848
Maximum Weighted Importance: 400.0331732967293
