In [1]:
pip install geopy

Note: you may need to restart the kernel to use updated packages.


In [2]:
#Basic Libraries 📚

import pandas as pd
import numpy as np
import os
import sys
import matplotlib.pyplot as plt

In [3]:
# Basic Modules & Useful Installations

import warnings
warnings.simplefilter(action='ignore')

In [4]:
# Geo Libraries 📚

import geopandas as gpd
from shapely.geometry import Point
from geopy.distance import geodesic
from geopy.distance import great_circle

In [5]:
# Machine Learning Libraries

from sklearn.cluster import DBSCAN
from sklearn.metrics import silhouette_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [6]:
# Construct Wild distance

def calculate_distance_row(row):
    
    coordinates_1 = (row['lat_buf'], row['long_buf'])
    coordinates_2 = (row['latitude'], row['longitude'])
    
    return geodesic(coordinates_1, coordinates_2).kilometers

In [7]:
# Find Nearby

def find_nearby_coworks(campsite, coworks, max_distance_km):
    nearby_coworks = []
    for _, cowork in coworks.iterrows():
        dist = great_circle((campsite.latitude, campsite.longitude), (cowork.latitude, cowork.longitude)).kilometers
        if dist <= max_distance_km:
            nearby_coworks.append(cowork)
    return nearby_coworks

In [8]:
# Construct reading relative path

relative_path = "my-jobcation-path/02_data_cleaning_phase/01_Preprocessing & Clean/coworkings_chars.csv"
relative_path2 = "my-jobcation-path/02_data_cleaning_phase/01_Preprocessing & Clean/campsites_chars.csv"

# Change directory
os.chdir('C:\\Users\\Oscar\\Documents\\00_Ironhack\\P3_Final\\') #Change for your own directory

# Check directory
current_directory = os.getcwd()
print("Current working directory is:", current_directory)

Current working directory is: C:\Users\Oscar\Documents\00_Ironhack\P3_Final


In [9]:
# Construct absolute paths

coworkings_path = os.path.join(current_directory, relative_path)
campsites_path = os.path.join(current_directory, relative_path2)


In [10]:
# Coworkings reading

if os.path.exists(coworkings_path):
    try:
        # Try to read the CSV file
        data = pd.read_csv(coworkings_path)
        print("The file has been loaded successfully.")
        # Now you can work with the 'data' DataFrame
    except Exception as e:
        print("An error occurred while reading the file:", e)
else:
    print("The file does not exist at the specified path:", coworkings_path)

The file has been loaded successfully.


In [11]:
# Campsites reading

if os.path.exists(campsites_path):
    try:
        # Try to read the CSV file
        data2 = pd.read_csv(campsites_path)
        print("The file has been loaded successfully.")
        # Now you can work with the 'data' DataFrame
    except Exception as e:
        print("An error occurred while reading the file:", e)
else:
    print("The file does not exist at the specified path:", campsites_path)

The file has been loaded successfully.


In [12]:
# Dataframe Creation

dfco = data.copy()
dfca = data2.copy()


## PREPROCESSING for CLUSTERING

In [14]:
# Encoding categoricals. We are going to use manual encoding because we need it for the model after.

type = {
    'City': 0,
    'Town': 1,
    'Village': 2,
}


luxury = {
    'Campsite': 0,
    'Glamping': 1,
    'Camper': 2
}


# Now let's do the mapping

dfca['type'] = dfca['type'].map(type)
dfca['luxury'] = dfca['luxury'].map(luxury)


In [15]:
encoding_column = ["type", "luxury", "beach", "wild", "rating", "distance_km", "longitude", "latitude"] 

df_campsites = dfca[encoding_column]

In [16]:
encoding_column = ["rating","distance_km", "longitude", "latitude"] 

df_cowork = dfco[encoding_column]

In [18]:
# For Geometric Models

geometry = [Point(xy) for xy in zip(dfcoding['lat_buf'], dfcoding['long_buf'])]

NameError: name 'dfcoding' is not defined

In [None]:
dfgeoca = gpd.GeoDataFrame(dfcoding, geometry=geometry)

## DBSCAN MODEL

In [None]:
# Assuming 'df_campsites' and 'df_cowork' are your two datasets

#df_campsites = df[df['type'] == 'campsite'][['latitude', 'longitude']]
#df_cowork = df[df['type'] == 'cowork'][['latitude', 'longitude']]

In [26]:
Xca = df_campsites

In [28]:
# Initialize and fit DBSCAN for campsites
dbscan_campsites = DBSCAN(eps=50, min_samples=20).fit(Xca)

# Assign cluster labels
df_campsites['cluster'] = dbscan_campsites.labels_



In [30]:
# Calculate silhouette score, only if more than 1 cluster and noise exists

if len(np.unique(dbscan_campsites.labels_)) > 1:
    silhouette_avg = silhouette_score(Xca, dbscan_campsites.labels_)
    print("Silhouette Score:", silhouette_avg)
else:
    print("Not enough clusters to compute the silhouette score.")

Silhouette Score: 0.7073499460137412


In [32]:
df_campsites['cluster'].value_counts()

cluster
 0    3364
-1     107
 2      61
 1      37
Name: count, dtype: int64

In [34]:
# SPA Modification

# Initialize and fit DBSCAN for campsites

dbscan_campsites_spa = DBSCAN(eps=50, min_samples=16, metric = 'manhattan').fit(Xca)

# Assign cluster labels
df_campsites['cluster'] = dbscan_campsites_spa.labels_



In [36]:
# Calculate silhouette score, only if more than 1 cluster and noise exists

if len(np.unique(dbscan_campsites_spa.labels_)) > 1:
    silhouette_avg = silhouette_score(Xca, dbscan_campsites_spa.labels_)
    print("Silhouette Score:", silhouette_avg)
else:
    print("Not enough clusters to compute the silhouette score.")

Silhouette Score: 0.7158593830074172


In [38]:
df_campsites['cluster'].value_counts()

cluster
 0    3364
-1     116
 2      47
 1      42
Name: count, dtype: int64

In [68]:
from sklearn.cluster import AgglomerativeClustering

class AgglomerativeClusteringWrapper(AgglomerativeClustering):
    def predict(self, X):
        """
        Predict cluster labels for each sample in X.

        Parameters
        ----------
        X : {array-like, sparse matrix} of shape (n_samples, n_features)
            New data to predict.

        Returns
        -------
        labels : array, shape (n_samples,)
            Index of the cluster each sample belongs to.
        """
        return self.labels_.astype(int)


In [101]:
estimator = dbscan_campsites_spa
silhouette_avg = silhouette_score(Xca, dbscan_campsites_spa.labels_)

In [56]:
dffiltered = df_campsites[df_campsites["cluster"] == 2]

In [58]:
dffiltered

Unnamed: 0,type,luxury,beach,wild,rating,distance_km,longitude,latitude,cluster
1116,1,0,1,0,4.5,21.32155,-75.576715,10.224149,2
1117,1,0,1,1,4.9,36.023179,-75.730947,10.179027,2
1118,1,0,1,0,4.4,10.095029,-75.407322,10.341448,2
1119,1,0,1,0,4.5,26.165736,-75.618215,10.198059,2
1121,1,1,1,0,5.0,13.180568,-75.590479,10.339151,2
1122,1,0,1,1,4.3,35.889956,-75.729371,10.179073,2
1123,1,0,1,0,4.3,20.41449,-75.565613,10.227674,2
1126,1,0,1,0,4.2,10.118699,-75.407299,10.341105,2
1127,1,0,1,0,4.9,12.692183,-75.373631,10.430576,2
1129,1,1,1,1,4.8,67.829495,-75.027672,10.809054,2


# GRID SEARCH

In [40]:
dbscan_clusters = []
cluster_count   = []
eps_space = [20,23,25,27,30,33,35,37,40,42,45,48,50]
min_samples_space = [5,7,10,12,15,17,20]

In [42]:
# SPA Modification

# Initialize and fit DBSCAN for campsites

dbscan = DBSCAN(eps=50, min_samples=16, metric = 'manhattan')

dbscan_campsites_gs = GridSearchCV(dbscan,parameters)
dbscan_campsites_gs.fit(Xca)

# Assign cluster labels
df_campsites['cluster'] = dbscan_campsites_gs.labels_



NameError: name 'parameters' is not defined

In [44]:
# Calculate silhouette score, only if more than 1 cluster and noise exists

if len(np.unique(dbscan_campsites_spa.labels_)) > 1:
    silhouette_avg = silhouette_score(Xca, dbscan_campsites_spa.labels_)
    print("Silhouette Score:", silhouette_avg)
else:
    print("Not enough clusters to compute the silhouette score.")

Silhouette Score: 0.7158593830074172


In [46]:
df_campsites['cluster'].value_counts()

cluster
 0    3364
-1     116
 2      47
 1      42
Name: count, dtype: int64

In [109]:
from geopy.distance import great_circle

def find_nearby_coworks(campsite, coworks, max_distance_km):
    nearby_coworks = []
    for _, cowork in coworks.iterrows():
        dist = great_circle((campsite.latitude, campsite.longitude), (cowork.latitude, cowork.longitude)).kilometers
        if dist <= max_distance_km:
            nearby_coworks.append(cowork)
    return nearby_coworks

# Example usage for the first campsite
campsite = df_campsites.iloc[0]
nearby_coworks = find_nearby_coworks(campsite, df_cowork, 1)  # finding coworks within 1 km

In [111]:
campsite = df_campsites.iloc[0]
nearby_coworks = find_nearby_coworks(campsite, df_cowork, 1) 

In [113]:
campsite

type            0.000000
luxury          0.000000
beach           0.000000
wild            0.000000
rating          3.900000
distance_km     9.458117
longitude      -3.603361
latitude       40.453734
cluster         0.000000
Name: 0, dtype: float64

In [115]:
nearby_coworks

[]

In [None]:
# Extraer los valores de distancia_km
distances = [entry['distance_km'] for entry in nearby_coworks]

# Encontrar el máximo
max_distance = max(distances)
# Encontrar el mínimo
min_distance = min(distances)

In [None]:
max_distance

In [None]:
min_distance

In [None]:
campsite = df_campsites.iloc[100]

In [117]:
nearby_coworks = find_nearby_coworks(campsite, df_cowork, 1)

In [119]:
campsite

type            0.000000
luxury          0.000000
beach           0.000000
wild            0.000000
rating          3.900000
distance_km     9.458117
longitude      -3.603361
latitude       40.453734
cluster         0.000000
Name: 0, dtype: float64

In [121]:
nearby_coworks

[]

In [None]:
# Assuming you add results to your dataframe or a separate list
df_campsites['nearby_coworks_count'] = df_campsites.apply(lambda x: len(find_nearby_coworks(x, df_cowork, 1)), axis=1)

# Recommend campsites with at least one coworking space nearby
recommended_campsites = df_campsites[df_campsites['nearby_coworks_count'] > 0]