Steps:
1. Take the district-wise CSV and create a new dataframe with columns: district, sum of all rural --> urban migrants, longitude, latitude.
2. Create a second dataframe that excludes the geographic coordinates.
3. Use the standard scaler on the data.
4. Run the DBSCAN algorithm on both datasets, appending a Geo_Clusters column and a Clusters column to the full dataframe.
5. Plot the geo-based clusters and the non-geo-based clusters.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy.geocoders import ArcGIS

In [8]:
# Load the district-level table and keep only the four columns used below.
df = pd.read_csv('districtLevel.csv')[
    ['Area Name', 'Last residence-\nRural/Urban/Country', 'Place of enumeration\n-Rural/Urban', 'Total migrants-Persons']
]
# Keep the rows whose last residence is 'Rural' AND whose place of
# enumeration is 'Urban' (one combined mask; original row index is preserved).
df = df[
    (df['Last residence-\nRural/Urban/Country'] == 'Rural')
    & (df['Place of enumeration\n-Rural/Urban'] == 'Urban')
]
df

Unnamed: 0,Area Name,Last residence-\nRural/Urban/Country,Place of enumeration\n-Rural/Urban,Total migrants-Persons
1,Kupwara,Rural,Urban,0
5,Kupwara,Rural,Urban,3
9,Kupwara,Rural,Urban,1
13,Kupwara,Rural,Urban,0
17,Kupwara,Rural,Urban,0
...,...,...,...,...
140731,South Andaman,Rural,Urban,10
140735,South Andaman,Rural,Urban,1915
140739,South Andaman,Rural,Urban,8331
140743,South Andaman,Rural,Urban,18


In [27]:
def migrants_per_district_list():
    """Return the summed migrant count for every district in the global ``df``.

    Returns
    -------
    list
        One sum of 'Total migrants-Persons' per distinct 'Area Name', in the
        same order as ``df['Area Name'].unique()`` (order of first appearance),
        so the result lines up positionally with that array.
    """
    # Aggregate in a single grouped pass instead of re-filtering the whole
    # frame once per district.
    totals = df.groupby('Area Name', sort=False)['Total migrants-Persons'].sum()

    # groupby drops missing keys; fall back to 0 for them, which is what
    # summing the (empty) per-district selection yields.
    return [totals.get(district, 0) for district in df['Area Name'].unique()]

In [29]:
# Per-district migrant totals, ordered like df['Area Name'].unique().
migrants_per_district = migrants_per_district_list()

640

In [33]:
# Shared ArcGIS geocoder client used by the district lookup helpers in this cell.
nom = ArcGIS()

def district_long(district):
    """Geocode ``district`` with the shared ``nom`` client and return one coordinate.

    Returns ``location[1][0]`` of the geocoding result, or ``-1`` when the
    lookup yields nothing.

    NOTE(review): geopy documents ``Location[1]`` as ``(latitude, longitude)``,
    so index 0 is the *latitude* even though this helper is named for
    longitude. The plotting cell filters the 'Latitude' column to 50-100,
    i.e. downstream code currently relies on this ordering; fix both together.
    NOTE(review): -1 is also a valid coordinate value, and it is fed into the
    scaler as if it were real data.
    """
    hit = nom.geocode(district)
    if not hit:
        return -1
    return hit[1][0]
    
def district_lat(district):
    """Geocode ``district`` with the shared ``nom`` client and return one coordinate.

    Returns ``location[1][1]`` of the geocoding result, or ``-1`` when the
    lookup yields nothing.

    NOTE(review): geopy documents ``Location[1]`` as ``(latitude, longitude)``,
    so index 1 is the *longitude* even though this helper is named for
    latitude. The plotting cell filters the 'Latitude' column to 50-100,
    i.e. downstream code currently relies on this ordering; fix both together.
    NOTE(review): -1 is also a valid coordinate value, and it is fed into the
    scaler as if it were real data.
    """
    hit = nom.geocode(district)
    if not hit:
        return -1
    return hit[1][1]
    
def create_coordinate_map():
    """Geocode every distinct 'Area Name' in the global ``df`` exactly once.

    Returns
    -------
    dict
        Maps each district name to the pair
        ``(location[1][0], location[1][1])`` of its geocoding result, or to
        ``(-1, -1)`` when the geocoder returns nothing.

    Each district is looked up with a single ``nom.geocode`` call. Going
    through ``district_long`` and ``district_lat`` separately would issue two
    remote requests per district and could produce a mixed ``(-1, value)``
    pair if only one of them succeeded.

    NOTE(review): geopy documents ``Location[1]`` as ``(latitude, longitude)``,
    so the stored pair is in that order; the rest of the notebook labels the
    first element 'Longitude' and the second 'Latitude'.
    """
    coordinates = {}
    for district in df['Area Name'].unique():
        location = nom.geocode(district)
        if location:
            coordinates[district] = (location[1][0], location[1][1])
        else:
            # Same "not found" sentinel the per-coordinate helpers use.
            coordinates[district] = (-1, -1)
    return coordinates

In [34]:
# Build the district -> coordinate-pair lookup once; this calls the ArcGIS
# geocoder for every unique district, so expect the cell to be slow.
coordinates = create_coordinate_map()

In [39]:
def get_longitude(area_name):
    """Return the first element of the pair stored for ``area_name``.

    Looks the name up in the global ``coordinates`` dict and returns ``None``
    when it is absent.

    NOTE(review): the pairs are built from ``location[1][0]`` and
    ``location[1][1]``; geopy documents ``Location[1]`` as
    ``(latitude, longitude)``, so the first element is presumably a latitude
    despite this function's name -- confirm before relying on it.
    """
    if area_name not in coordinates:
        return None
    pair = coordinates[area_name]
    return pair[0]

def get_latitude(area_name):
    """Return the second element of the pair stored for ``area_name``.

    Looks the name up in the global ``coordinates`` dict and returns ``None``
    when it is absent.

    NOTE(review): the pairs are built from ``location[1][0]`` and
    ``location[1][1]``; geopy documents ``Location[1]`` as
    ``(latitude, longitude)``, so the second element is presumably a
    longitude despite this function's name -- confirm before relying on it.
    """
    if area_name not in coordinates:
        return None
    pair = coordinates[area_name]
    return pair[1]

In [40]:
# One row per district: its name and migrant total (aligned by position with
# df['Area Name'].unique()), then the two coordinate columns looked up by name.
data = {
    'District': df['Area Name'].unique(),
    'Total migrants - Persons': migrants_per_district,
}
dataset = pd.DataFrame(data).assign(
    Longitude=lambda frame: frame['District'].apply(get_longitude),
    Latitude=lambda frame: frame['District'].apply(get_latitude),
)
dataset

Unnamed: 0,District,Total migrants - Persons,Longitude,Latitude
0,Kupwara,64,34.524601,74.259598
1,Badgam,136,34.018576,74.750677
2,Leh(Ladakh),503,34.148300,77.579498
3,Kargil,18,34.560642,76.125898
4,Punch,83,33.861967,73.734338
...,...,...,...,...
635,Karaikal,10483,10.931702,79.835510
636,Union Territory - ANDAMAN & NICOBAR ISLANDS,29977,11.700650,92.675170
637,Nicobars,0,8.000000,93.500000
638,North & Middle Andaman,652,12.853291,92.868270


In [106]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [108]:
# Feature matrix for clustering: migrant total plus the two coordinate columns.
X = dataset[['Total migrants - Persons', 'Longitude', 'Latitude']]

# Standardise the features and wrap the result back into a DataFrame that
# carries the same column names as X.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
X_scaled = pd.DataFrame(scaled_features, columns=list(X.columns))

In [109]:
from sklearn.metrics import silhouette_score as ss

In [275]:
import itertools
# DBSCAN hyper-parameter grid: 40 evenly spaced eps values in [0.01, 0.3]
# crossed with min_samples = 2..19.
epsilons = np.linspace(0.01, 0.3, num=40)
min_samples = np.arange(2, 20)
combinations = [*itertools.product(epsilons, min_samples)]
N = len(combinations)  # total number of (eps, min_samples) pairs

In [276]:
def get_max_score(combinations, matrix, verbose=True):
    """Grid-search DBSCAN parameters and keep the best silhouette score.

    Parameters
    ----------
    combinations : sequence of (eps, min_samples) pairs
        Every pair is fitted with ``DBSCAN(eps=..., min_samples=...)``.
    matrix : array-like
        Feature matrix passed to ``DBSCAN.fit`` and to the silhouette score.
    verbose : bool, default True
        Print one progress line per combination.

    Returns
    -------
    dict
        Keys ``'best epsilon'``, ``'best_min_samples'``, ``'best_labels'`` and
        ``'best_scores'`` describing the highest-scoring combination (the
        first one on ties).

    Raises
    ------
    ValueError
        If ``combinations`` is empty.

    Notes
    -----
    * A combination that yields fewer than 3 clusters (the noise label -1 is
      not counted as a cluster) is recorded with the placeholder score -10 and
      the placeholder labels ``'bad'``. If every combination is rejected, the
      first one is returned with those placeholders.
    * The silhouette score is computed over all rows with the labels as
      DBSCAN produced them, i.e. rows labelled -1 are included.
    """
    combinations = list(combinations)
    total = len(combinations)
    if total == 0:
        raise ValueError("combinations must contain at least one (eps, min_samples) pair")

    scores = []
    all_labels = []

    for i, (eps, num_samples) in enumerate(combinations):
        labels = DBSCAN(eps=eps, min_samples=num_samples).fit(matrix).labels_
        labels_set = set(labels)
        # -1 marks noise points, not a cluster.
        num_clusters = len(labels_set) - (1 if -1 in labels_set else 0)
        c = (eps, num_samples)

        if num_clusters < 3:
            scores.append(-10)
            all_labels.append('bad')
            if verbose:
                # Report progress against the grid actually passed in, not a
                # notebook-level global.
                print(f"Combination {c} on iteration {i+1} of {total} has {num_clusters} clusters. Continuing")
            continue

        scores.append(ss(matrix, labels))
        all_labels.append(labels)
        if verbose:
            print(f"Index: {i}, Score: {scores[-1]}, Combination {c}, Num Clusters: {num_clusters}")

    best_index = np.argmax(scores)
    best_parameters = combinations[best_index]

    return {'best epsilon': best_parameters[0],
            'best_min_samples': best_parameters[1],
            'best_labels': all_labels[best_index],
            'best_scores': scores[best_index]
            }


In [277]:
# Run the DBSCAN grid search over every (eps, min_samples) pair on the scaled
# features, then display the winning configuration.
best_dict = get_max_score(combinations, X_scaled)
best_dict

Index: 0, Score: -0.31594603881491945, Combination (0.01, 2), Num Clusters: 3
Combination (0.01, 3) on iteration 2 of 720 has 0 clusters. Continuing
Combination (0.01, 4) on iteration 3 of 720 has 0 clusters. Continuing
Combination (0.01, 5) on iteration 4 of 720 has 0 clusters. Continuing
Combination (0.01, 6) on iteration 5 of 720 has 0 clusters. Continuing
Combination (0.01, 7) on iteration 6 of 720 has 0 clusters. Continuing
Combination (0.01, 8) on iteration 7 of 720 has 0 clusters. Continuing
Combination (0.01, 9) on iteration 8 of 720 has 0 clusters. Continuing
Combination (0.01, 10) on iteration 9 of 720 has 0 clusters. Continuing
Combination (0.01, 11) on iteration 10 of 720 has 0 clusters. Continuing
Combination (0.01, 12) on iteration 11 of 720 has 0 clusters. Continuing
Combination (0.01, 13) on iteration 12 of 720 has 0 clusters. Continuing
Combination (0.01, 14) on iteration 13 of 720 has 0 clusters. Continuing
Combination (0.01, 15) on iteration 14 of 720 has 0 clusters.

{'best epsilon': 0.2553846153846154,
 'best_min_samples': 4,
 'best_labels': array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0, -1, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, -1,  0, -1,  0, -1, -1, -1, -1,  0, -1, -1, -1, -1,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0, -1,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0, 

In [278]:
# Final clustering on the scaled features; labels are stored on `dataset`.
# NOTE(review): eps/min_samples are hard-coded and differ from the grid-search
# winner displayed above (eps ~0.2554, min_samples 4) -- confirm the intent.
dbscan = DBSCAN(min_samples=16, eps=0.22214285714285714)
# fit() returns the fitted estimator itself, so `y_pred` is the model object,
# not an array of predictions.
y_pred = dbscan.fit(X_scaled)
dataset['Clusters'] = y_pred.labels_
dataset

Unnamed: 0,District,Total migrants - Persons,Longitude,Latitude,Clusters
0,Kupwara,64,34.524601,74.259598,0
1,Badgam,136,34.018576,74.750677,0
2,Leh(Ladakh),503,34.148300,77.579498,0
3,Kargil,18,34.560642,76.125898,0
4,Punch,83,33.861967,73.734338,0
...,...,...,...,...,...
635,Karaikal,10483,10.931702,79.835510,2
636,Union Territory - ANDAMAN & NICOBAR ISLANDS,29977,11.700650,92.675170,-1
637,Nicobars,0,8.000000,93.500000,-1
638,North & Middle Andaman,652,12.853291,92.868270,-1


In [279]:
import plotly.express as px

In [280]:
def plot_DBSCAN_clusters(dataset_full):
    """Scatter-plot the districts, coloured by their DBSCAN cluster label.

    Parameters
    ----------
    dataset_full : pandas.DataFrame
        Must contain 'Latitude', 'Longitude' and 'Clusters' columns.

    Only rows whose 'Latitude' value lies in [50, 100] are drawn; this drops
    the -1 "not found" sentinel and other out-of-range points.

    NOTE(review): the coordinate columns are filled from geopy's
    ``Location[1]``, which is ``(latitude, longitude)``, so the 'Latitude'
    column appears to hold longitudes and 'Longitude' latitudes (e.g. Kupwara
    is stored as 34.5 / 74.26). The axis titles below name what the values
    are; update the filter, the axes and the titles together if the columns
    are ever corrected upstream.
    """
    in_view = dataset_full['Latitude'].between(50, 100)
    filtered_df = dataset_full[in_view]

    fig = px.scatter(
        x=filtered_df['Latitude'],
        y=filtered_df['Longitude'],
        # Cluster ids are categories, not magnitudes: pass them as strings so
        # plotly assigns discrete colours instead of a continuous colour bar.
        color=filtered_df['Clusters'].astype(str),
    )
    fig.update_layout(
        width=700,   # Set the width of the plot
        height=600,  # Set the height of the plot
        title='DBSCAN clusters by district',
        xaxis_title='Longitude (degrees)',
        yaxis_title='Latitude (degrees)',
        legend_title_text='Cluster',
    )
    fig.show()

In [281]:
# Draw the cluster scatter plot for the labelled district table.
plot_DBSCAN_clusters(dataset)

In [282]:
# Silhouette score of the final cluster assignment on the scaled features
# (rows labelled -1 are passed in as-is).
ss(X_scaled, dataset['Clusters'])

0.2007366595433787