Steps:
1. Take the district-wise CSV and create a new dataframe with columns: district, sum of all rural --> urban migrants, longitude, latitude.
2. Create a new dataframe, excluding the geographic coordinates.
3. Use the standard scaler on the data
4. Run the DBSCAN algorithm on both datasets, appending a Geo_Clusters column and a Clusters column to the full dataframe
5. Plot the geo-based clusters and the non-geo based clusters.

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from geopy.geocoders import ArcGIS

In [6]:
# Load the district-level migration table and narrow it, in one chain, to the
# four columns used below and to rows describing rural -> urban migration.
# The column labels contain embedded newlines exactly as in the CSV header.
df = (
    pd.read_csv('districtLevel.csv')
    [['Area Name', 'Last residence-\nRural/Urban/Country', 'Place of enumeration\n-Rural/Urban', 'Total migrants-Persons']]
    .loc[lambda frame: frame['Last residence-\nRural/Urban/Country'] == 'Rural']
    .loc[lambda frame: frame['Place of enumeration\n-Rural/Urban'] == 'Urban']
)
df

Unnamed: 0,Area Name,Last residence-\nRural/Urban/Country,Place of enumeration\n-Rural/Urban,Total migrants-Persons
1,Kupwara,Rural,Urban,0
5,Kupwara,Rural,Urban,3
9,Kupwara,Rural,Urban,1
13,Kupwara,Rural,Urban,0
17,Kupwara,Rural,Urban,0
...,...,...,...,...
140731,South Andaman,Rural,Urban,10
140735,South Andaman,Rural,Urban,1915
140739,South Andaman,Rural,Urban,8331
140743,South Andaman,Rural,Urban,18


In [7]:
def migrants_per_district_list():
    """Return the summed 'Total migrants-Persons' for every district.

    Reads the module-level ``df``. The result is a list with one entry per
    value of ``df['Area Name'].unique()``, in that same order, so it can be
    zipped positionally with the unique district names.

    The totals are computed with a single grouped pass over ``df`` instead of
    re-filtering the whole frame once per district.
    """
    districts = df['Area Name'].unique()
    # sort=False keeps groups in order of first appearance rather than
    # sorting the district names alphabetically.
    totals = df.groupby('Area Name', sort=False)['Total migrants-Persons'].sum()
    # Re-align onto unique() so length and order match it exactly; a label
    # that groupby skipped (e.g. a missing name) gets 0, which is what the
    # per-district equality filter used to produce for it.
    return totals.reindex(districts, fill_value=0).tolist()

In [8]:
# Per-district migrant totals, ordered like df['Area Name'].unique().
migrants_per_district = migrants_per_district_list()

In [9]:
# Shared geopy ArcGIS geocoder instance used by the district lookup functions.
nom = ArcGIS()

def district_long(district):
    """Geocode ``district`` and return element ``[1][0]`` of the result.

    Uses the module-level ``nom`` geocoder. Returns -1 when the geocoder
    yields no (or a falsy) result.

    NOTE(review): geopy documents index 1 of a Location as
    (latitude, longitude), so despite this function's name the value returned
    here is presumably the latitude. Other cells appear to depend on the
    current behaviour, so confirm before changing it.
    """
    match = nom.geocode(district)
    if not match:
        return -1
    return match[1][0]
    
def district_lat(district):
    """Geocode ``district`` and return element ``[1][1]`` of the result.

    Uses the module-level ``nom`` geocoder. Returns -1 when the geocoder
    yields no (or a falsy) result.

    NOTE(review): geopy documents index 1 of a Location as
    (latitude, longitude), so despite this function's name the value returned
    here is presumably the longitude. Other cells appear to depend on the
    current behaviour, so confirm before changing it.
    """
    match = nom.geocode(district)
    if not match:
        return -1
    return match[1][1]
    
def create_coordinate_map():
    """Geocode every unique district in ``df`` and return a lookup dict.

    Returns:
        dict mapping each value of ``df['Area Name'].unique()`` to a 2-tuple
        ``(location[1][0], location[1][1])`` taken from the geocoder result,
        or ``(-1, -1)`` when the geocoder returns nothing for that district.
        This is the same layout the district_long / district_lat pair
        produces.

    Each district is geocoded exactly once. Previously every district
    triggered two identical requests, one per tuple element, which doubled
    the network traffic and allowed the two halves of a pair to come from
    different responses.
    """
    coordinates = {}
    for district in df['Area Name'].unique():
        location = nom.geocode(district)
        if location:
            # NOTE(review): geopy documents location[1] as
            # (latitude, longitude); the element order is kept as-is because
            # downstream cells rely on it.
            coordinates[district] = (location[1][0], location[1][1])
        else:
            # Same "not found" sentinel the single-value helpers use.
            coordinates[district] = (-1, -1)
    return coordinates

In [10]:
# Build the district -> coordinate-pair lookup; this issues geocoding requests
# for every unique district, so the cell can take a while to run.
coordinates = create_coordinate_map()

In [11]:
def get_longitude(area_name):
    """Look up ``area_name`` in the module-level ``coordinates`` dict.

    Returns element 0 of the stored pair (the slot filled from
    district_long), or None when the area name has no entry.
    """
    return coordinates[area_name][0] if area_name in coordinates else None

def get_latitude(area_name):
    """Look up ``area_name`` in the module-level ``coordinates`` dict.

    Returns element 1 of the stored pair (the slot filled from
    district_lat), or None when the area name has no entry.
    """
    return coordinates[area_name][1] if area_name in coordinates else None

In [12]:
# One row per district: name, summed rural -> urban migrants, and the two
# geocoded coordinate columns looked up by district name.
data = {
    'District': df['Area Name'].unique(),
    'Total migrants - Persons': migrants_per_district,
}
# NOTE(review): in the displayed output the 'Longitude' column holds values
# such as 34.5 for Kupwara, which look like latitudes -- the two coordinate
# columns may be swapped relative to their labels; confirm.
dataset = pd.DataFrame(data).assign(
    Longitude=lambda frame: frame['District'].map(get_longitude),
    Latitude=lambda frame: frame['District'].map(get_latitude),
)
dataset

Unnamed: 0,District,Total migrants - Persons,Longitude,Latitude
0,Kupwara,64,34.524601,74.259598
1,Badgam,136,34.018576,74.750677
2,Leh(Ladakh),503,34.148300,77.579498
3,Kargil,18,34.560642,76.125898
4,Punch,83,33.861967,73.734338
...,...,...,...,...
635,Karaikal,10483,10.931702,79.835510
636,Union Territory - ANDAMAN & NICOBAR ISLANDS,29977,11.700650,92.675170
637,Nicobars,0,8.000000,93.500000
638,North & Middle Andaman,652,12.853291,92.868270


In [13]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import StandardScaler

In [14]:
# Feature matrix for clustering: migrant total plus the two coordinate columns.
X = dataset[['Total migrants - Persons', 'Longitude', 'Latitude']]
# Standardise the features so that no single column dominates the distance
# computation purely because of its units.
scaler = StandardScaler()
scaled_features = scaler.fit_transform(X)
# Wrap the scaled array back into a frame carrying the original column labels.
X_scaled = pd.DataFrame(scaled_features, columns=X.columns)

In [15]:
from sklearn.metrics import silhouette_score as ss

In [16]:
# Cluster the standardised features with DBSCAN.
# NOTE(review): the eps value looks like the product of a parameter sweep;
# its origin is not shown in this notebook.
dbscan = DBSCAN(min_samples=16, eps=0.22214285714285714)
# Despite its name, y_pred is the fitted estimator (scikit-learn's fit()
# returns the estimator itself), not an array of predicted labels.
y_pred = dbscan.fit(X_scaled)
# Attach one cluster label per district row.
dataset['Clusters'] = y_pred.labels_
dataset

Unnamed: 0,District,Total migrants - Persons,Longitude,Latitude,Clusters
0,Kupwara,64,34.524601,74.259598,0
1,Badgam,136,34.018576,74.750677,0
2,Leh(Ladakh),503,34.148300,77.579498,0
3,Kargil,18,34.560642,76.125898,0
4,Punch,83,33.861967,73.734338,0
...,...,...,...,...,...
635,Karaikal,10483,10.931702,79.835510,2
636,Union Territory - ANDAMAN & NICOBAR ISLANDS,29977,11.700650,92.675170,-1
637,Nicobars,0,8.000000,93.500000,-1
638,North & Middle Andaman,652,12.853291,92.868270,-1


In [17]:
import plotly.express as px

In [18]:
def plot_DBSCAN_clusters(dataset_full, lat_min=50, lat_max=100, width=700, height=600):
    """Show a scatter plot of districts coloured by their cluster label.

    Args:
        dataset_full: DataFrame with 'Latitude', 'Longitude' and 'Clusters'
            columns.
        lat_min: Smallest 'Latitude' value kept (inclusive).
        lat_max: Largest 'Latitude' value kept (inclusive).
        width: Figure width passed to the plotly layout.
        height: Figure height passed to the plotly layout.

    Rows whose 'Latitude' value falls outside [lat_min, lat_max] are dropped
    before plotting. With the default bounds this also removes rows carrying
    the -1 "geocoding failed" sentinel. The defaults reproduce the bounds and
    figure size that were previously hard-coded.

    NOTE(review): a 50..100 range cannot be a true latitude; the column
    labelled 'Latitude' presumably holds longitudes (see the geocoding
    helpers), which is why it is plotted on the x axis. Confirm before
    renaming anything.
    """
    # between() is inclusive on both ends, matching the old >= / <= pair.
    in_range = dataset_full['Latitude'].between(lat_min, lat_max)
    filtered_df = dataset_full[in_range]
    fig = px.scatter(
        x=filtered_df['Latitude'],
        y=filtered_df['Longitude'],
        color=filtered_df['Clusters'],
    )
    fig.update_layout(width=width, height=height)
    fig.show()

In [19]:
# Scatter-plot the districts coloured by their DBSCAN cluster label.
plot_DBSCAN_clusters(dataset)

In [20]:
# Silhouette score of the DBSCAN labelling, computed on the scaled features.
# NOTE(review): the labels passed here still include -1 (visible for several
# rows in the table above). If that is DBSCAN's noise label, noise points are
# being scored as though they formed a cluster -- confirm this is intended.
ss(X_scaled, dataset['Clusters'])

0.2007366595433787