In [1]:
pip install geopy


Defaulting to user installation because normal site-packages is not writeable
Collecting geopy
  Downloading geopy-2.4.1-py3-none-any.whl.metadata (6.8 kB)
Collecting geographiclib<3,>=1.52 (from geopy)
  Downloading geographiclib-2.0-py3-none-any.whl.metadata (1.4 kB)
Downloading geopy-2.4.1-py3-none-any.whl (125 kB)
   ---------------------------------------- 0.0/125.4 kB ? eta -:--:--
   ------ --------------------------------- 20.5/125.4 kB ? eta -:--:--
   --------------- ----------------------- 51.2/125.4 kB 525.1 kB/s eta 0:00:01
   ---------------------------------------  122.9/125.4 kB 1.0 MB/s eta 0:00:01
   ---------------------------------------  122.9/125.4 kB 1.0 MB/s eta 0:00:01
   -------------------------------------- 125.4/125.4 kB 616.0 kB/s eta 0:00:00
Downloading geographiclib-2.0-py3-none-any.whl (40 kB)
   ---------------------------------------- 0.0/40.3 kB ? eta -:--:--
   ---------------------------------------- 40.3/40.3 kB 2.0 MB/s eta 0:00:00
Installing col

In [1]:
import warnings
warnings.filterwarnings("ignore")

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
plt.rcParams['figure.dpi'] = 200
import seaborn as sns
import math
from scipy import stats
from scipy.stats import norm

In [3]:
data = pd.read_csv('Top Indian Places to Visit.csv')
data.head(3)

Unnamed: 0.1,Unnamed: 0,Zone,State,City,Name,Type,Establishment Year,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Airport with 50km Radius,Weekly Off,Significance,DSLR Allowed,Number of google review in lakhs,Best Time to visit
0,0,Northern,Delhi,Delhi,India Gate,War Memorial,1921,0.5,4.6,0,Yes,,Historical,Yes,2.6,Evening
1,1,Northern,Delhi,Delhi,Humayun's Tomb,Tomb,1572,2.0,4.5,30,Yes,,Historical,Yes,0.4,Afternoon
2,2,Northern,Delhi,Delhi,Akshardham Temple,Temple,2005,5.0,4.6,60,Yes,,Religious,No,0.4,Afternoon


In [4]:
data["Zone"].unique()

array(['Northern', 'Western', 'Southern', 'Eastern', 'Central',
       'North Eastern'], dtype=object)

In [5]:
data["Airport with 50km Radius"].unique()

array(['Yes', 'No'], dtype=object)

In [6]:
data["DSLR Allowed"].unique()

array(['Yes', 'No'], dtype=object)

In [7]:
data["Best Time to visit"].unique()

array(['Evening', 'Afternoon', 'Morning', 'All', 'All ', 'Anytime',
       'Night'], dtype=object)

In [8]:
data.shape

(325, 16)

In [9]:
data.describe()

Unnamed: 0.1,Unnamed: 0,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Number of google review in lakhs
count,325.0,325.0,325.0,325.0,325.0
mean,162.0,1.807692,4.486154,115.809231,0.408438
std,93.963645,0.971398,0.27458,530.859785,0.646668
min,0.0,0.5,1.4,0.0,0.01
25%,81.0,1.0,4.4,0.0,0.059
50%,162.0,1.5,4.5,0.0,0.17
75%,243.0,2.0,4.6,40.0,0.5
max,324.0,7.0,4.9,7500.0,7.4


In [10]:
data.info() # here we can see "weekly off" contains too much NULL values

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 325 entries, 0 to 324
Data columns (total 16 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Unnamed: 0                        325 non-null    int64  
 1   Zone                              325 non-null    object 
 2   State                             325 non-null    object 
 3   City                              325 non-null    object 
 4   Name                              325 non-null    object 
 5   Type                              325 non-null    object 
 6   Establishment Year                325 non-null    object 
 7   time needed to visit in hrs       325 non-null    float64
 8   Google review rating              325 non-null    float64
 9   Entrance Fee in INR               325 non-null    int64  
 10  Airport with 50km Radius          325 non-null    object 
 11  Weekly Off                        32 non-null     object 
 12  Signific

In [11]:
data.isnull().sum() # checking for NULL values in each column

Unnamed: 0                            0
Zone                                  0
State                                 0
City                                  0
Name                                  0
Type                                  0
Establishment Year                    0
time needed to visit in hrs           0
Google review rating                  0
Entrance Fee in INR                   0
Airport with 50km Radius              0
Weekly Off                          293
Significance                          0
DSLR Allowed                          0
Number of google review in lakhs      0
Best Time to visit                    0
dtype: int64

In [12]:
(293/325)*100  # so 90% data is missing in "weekly off" column, its better to drop the column

90.15384615384615

In [13]:
data = data.drop(['Weekly Off'], axis = 1) #dropping column "weelky off"

In [14]:
data.loc[data.duplicated()] # there is no duplicated rows

Unnamed: 0.1,Unnamed: 0,Zone,State,City,Name,Type,Establishment Year,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Airport with 50km Radius,Significance,DSLR Allowed,Number of google review in lakhs,Best Time to visit


In [15]:

# Encode categorical features
data['Airport with 50km Radius'] = data['Airport with 50km Radius'].map({'Yes': 1, 'No': 0})
data['DSLR Allowed'] = data['DSLR Allowed'].map({'Yes': 1, 'No': 0})


In [16]:
# Calculate overall score
data['overall_score'] = (data['Google review rating'] * 0.4) + \
                        (data['Number of google review in lakhs'] * 0.3) + \
                        (data['time needed to visit in hrs'] * 0.2) - \
                        (data['Entrance Fee in INR'] * 0.1)


In [80]:

from geopy.distance import geodesic

# Manually add latitude and longitude to your dataset
city_coordinates = {
    'Delhi': (28.7041, 77.1025),
    'Mumbai': (19.0760, 72.8777),
    'Kolkata': (22.5726, 88.3639),
    'Bangalore': (12.9716, 77.5946),
    'Hyderabad': (17.3850, 78.4867),
    
}

def get_city_coordinates(city):
    return city_coordinates.get(city, (0,0))

def calculate_distance(city1_coords, city2_coords):
    return geodesic(city1_coords, city2_coords).km
    

# Assuming user city is Kolkata
user_city = 'Kolkata'
user_city_coords = get_city_coordinates(user_city)


# Add distance column
data['distance_km'] = data['City'].apply(lambda city: calculate_distance(user_city_coords, get_city_coordinates(city)))



In [85]:
data.head(3)

Unnamed: 0.1,Unnamed: 0,Zone,State,City,Name,Type,Establishment Year,time needed to visit in hrs,Google review rating,Entrance Fee in INR,Airport with 50km Radius,Significance,DSLR Allowed,Number of google review in lakhs,Best Time to visit,overall_score,distance_km
0,0,Northern,Delhi,Delhi,India Gate,War Memorial,1921,0.5,4.6,0,1,Historical,1,2.6,Evening,2.72,1318.138916
1,1,Northern,Delhi,Delhi,Humayun's Tomb,Tomb,1572,2.0,4.5,30,1,Historical,1,0.4,Afternoon,-0.68,1318.138916
2,2,Northern,Delhi,Delhi,Akshardham Temple,Temple,2005,5.0,4.6,60,1,Religious,0,0.4,Afternoon,-3.04,1318.138916


In [81]:
# Filter places within 300 km
filtered_data = data[data['distance_km'] <= 300]


# Model Building

In [82]:
from sklearn.model_selection import train_test_split,cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

# Define features and target
features = ['Google review rating', 'Number of google review in lakhs', 'time needed to visit in hrs', 'Entrance Fee in INR', 'distance_km']
target = 'overall_score'

# Split the data
X = filtered_data[features]
y = filtered_data[target]
# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Perform cross-validation
cv_scores = cross_val_score(model, X_scaled, y, cv=5, scoring='r2')
print(f'Cross-Validation R-squared Scores: {cv_scores}')
print(f'Mean Cross-Validation R-squared Score: {cv_scores.mean()}')

y_pred = model.predict(X_test) # predict score



Cross-Validation R-squared Scores: [1. 1. 1. 1. 1.]
Mean Cross-Validation R-squared Score: 1.0


In [83]:
# Rank places based on predicted scores
filtered_data['predicted_score'] = model.predict(X_scaled)
ranked_places = filtered_data.sort_values(by='predicted_score', ascending=False)

# Result

In [84]:
# Display top weekend destinations
top_weekend_places = ranked_places[['Name', 'City', 'predicted_score']]
print(top_weekend_places.head(10))

                          Name     City  predicted_score
45    Dakshineswar Kali Temple  Kolkata         0.576845
43               Howrah Bridge  Kolkata         0.564244
49        Science City Kolkata  Kolkata         0.461336
42           Victoria Memorial  Kolkata         0.400377
50                  Belur Math  Kolkata         0.397235
44               Indian Museum  Kolkata         0.044706
48  Alipore Zoological Gardens  Kolkata        -0.190490
46        Kalighat Kali Temple  Kolkata        -0.355209
51               Marble Palace  Kolkata        -0.701961
47                Eden Gardens  Kolkata        -1.197083


# Checking Accuracy

In [73]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score


In [74]:
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f'Mean Absolute Error: {mae}')
print(f'Mean Squared Error: {mse}')
print(f'R-squared: {r2}')


Mean Absolute Error: 2.9976021664879227e-15
Mean Squared Error: 1.2276647837501996e-29
R-squared: 1.0
