<a href="https://colab.research.google.com/github/Richsrivastava/End-End_Data_Science_Workflows/blob/main/Dense_Geographic_Cluster_Estimate_FindInfected.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install cupy-cuda12x --upgrade
%reload_ext cudf.pandas
import pandas as pd
import cuml

import cupy as cp





In [3]:
# Mount Google Drive and list the notebook folder to check that the data file is available.
# NOTE(review): these lines are commented out, yet the load cell reads from
# /content/drive — on a fresh Colab runtime they presumably need to be
# uncommented and run first, otherwise the CSV path will not exist.
#from google.colab import drive
#drive.mount('/content/drive')
#!ls "/content/drive/My Drive/Colab Notebooks"

In [3]:
# Read week 1 of the outbreak into a (cuDF-accelerated) pandas DataFrame,
# keeping just the three columns used downstream: 'lat', 'long' and 'infected'.
df = pd.read_csv(
    "/content/drive/My Drive/Colab Notebooks/week1.csv",
    usecols=["lat", "long", "infected"],
)
df

Unnamed: 0,lat,long,infected
0,54.522510,-1.571896,False
1,54.554030,-1.524968,False
2,54.552486,-1.435203,False
3,54.537189,-1.566215,False
4,54.528212,-1.588462,False
...,...,...,...
58479889,51.634416,-2.925863,False
58479890,51.556972,-3.036290,False
58479891,51.588992,-2.921915,False
58479892,51.590974,-2.954539,False


In [4]:
# Build a DataFrame holding only the infected members of the population,
# with the row index renumbered from 0.
infected_df = df.loc[df['infected'] == 1].reset_index(drop=True)
infected_df.head()

Unnamed: 0,lat,long,infected
0,54.472766,-1.654932,True
1,54.529717,-1.667143,True
2,54.512986,-1.589866,True
3,54.522322,-1.380694,True
4,54.54166,-1.61349,True


In [5]:
# lat/long to OSGB36 grid coordinates converter

def latlong2osgbgrid_cupy(lat, long, input_degrees=True):
    '''
    Project ellipsoidal latitude/longitude onto grid northing/easting using a
    Transverse Mercator series expansion.

    Inputs:
    lat: latitude coordinate (N); a scalar or an array the `cp` math functions accept
    long: longitude coordinate (E); same form as `lat`
    input_degrees: when True (default) the inputs are taken to be in degrees and
                   converted to radians first; when False they are used as radians

    Output:
    (northing, easting) -- a 2-tuple, each element shaped like the inputs
    '''

    if input_degrees:
        lat = lat * cp.pi/180
        long = long * cp.pi/180

    # Projection origin and scale.
    north_origin = -100000  # northing of true origin
    east_origin = 400000  # easting of true origin
    scale = .9996012717  # scale factor on central meridian
    lat_origin = 49 * cp.pi / 180  # latitude of true origin
    lon_origin = -2 * cp.pi / 180  # longitude of true origin and central meridian

    # Ellipsoid semi-axes and the quantities derived from them.
    # NOTE(review): values look like the Airy 1830 ellipsoid in metres -- confirm.
    semi_major = 6377563.396
    semi_minor = 6356256.909
    ecc_sq = (semi_major**2 - semi_minor**2) / semi_major**2
    n = (semi_major - semi_minor) / (semi_major + semi_minor)

    # Offsets from the true origin.
    d_lat = lat - lat_origin
    d_lon = long - lon_origin
    lat_sum = lat + lat_origin

    # Trigonometric terms of the latitude, computed once and reused.
    sin_phi = cp.sin(lat)
    cos_phi = cp.cos(lat)
    tan_phi = cp.tan(lat)
    tan_sq = tan_phi ** 2
    tan_4th = tan_phi ** 4

    # Radii of curvature (scaled by the central-meridian factor).
    w = 1 - ecc_sq * sin_phi ** 2
    nu = semi_major * scale * w ** -.5
    rho = semi_major * scale * (1 - ecc_sq) * w ** -1.5
    nu_over_rho = nu / rho
    eta2 = nu_over_rho - 1

    # Meridional arc from the origin latitude: four-term series in n.
    c1 = 1 + n + 5/4 * (n**2 + n**3)
    c2 = 3*(n+n**2) + 21/8 * n**3
    c3 = 15/8 * (n**2 + n**3)
    c4 = 35/24 * n**3
    M = semi_minor * scale * (c1 * d_lat -
                              c2 * cp.sin(d_lat) * cp.cos(lat_sum) +
                              c3 * cp.sin(2*(d_lat)) * cp.cos(2*(lat_sum)) -
                              c4 * cp.sin(3*(d_lat)) * cp.cos(3*(lat_sum)))

    # Northing coefficients (even powers of the longitude offset).
    I = M + north_origin
    II = nu/2 * sin_phi * cos_phi
    III = nu/24 * sin_phi * cos_phi ** 3 * (5 - tan_sq + 9 * eta2)
    IIIA = nu/720 * sin_phi * cos_phi ** 5 * (61-58 * tan_sq + tan_4th)

    # Easting coefficients (odd powers of the longitude offset).
    IV = nu * cos_phi
    V = nu / 6 * cos_phi**3 * (nu_over_rho - tan_sq)
    VI = nu / 120 * cos_phi ** 5 * (5 - 18 * tan_sq + tan_4th + 14 * eta2 - 58 * tan_sq * eta2)

    northing = I + II * d_lon**2 + III * d_lon**4 + IIIA * d_lon**6
    easting = east_origin + IV * d_lon + V * d_lon**3 + VI * d_lon**5

    return northing, easting

In [6]:
# Run the converter on the infected population's coordinates and store the
# resulting grid values in new 'northing' and 'easting' columns of infected_df.
cupy_lat, cupy_long = (cp.asarray(infected_df[column]) for column in ('lat', 'long'))

infected_df['northing'], infected_df['easting'] = latlong2osgbgrid_cupy(
    cupy_lat,
    cupy_long,
    input_degrees=True,
)
infected_df

CUDARuntimeError: cudaErrorInsufficientDriver: CUDA driver version is insufficient for CUDA runtime version

In [None]:
# Cluster the infected population with DBSCAN: a cluster needs at least 25 infected
# people, and no member may be more than 2000m from at least one other cluster member.
# The resulting label for each row is stored in a new 'cluster' column.
dbscan = cuml.DBSCAN(min_samples=25, eps=2000)
infected_df['cluster'] = dbscan.fit_predict(
    infected_df[['northing', 'easting']]
)
infected_df

In [None]:
# Centroid of each cluster label: the mean northing and easting of its members.
centroids_df = infected_df.groupby('cluster')[['northing', 'easting']].mean()
centroids_df

In [None]:
# Find the number of people in each cluster by counting how many rows carry each
# cluster label in the column produced by DBSCAN; the counts align with
# centroids_df on the cluster label index.
centroids_df['count'] = infected_df.groupby('cluster').size()
centroids_df

In [None]:
# Find the centroid of the cluster with the most members and write it to question_1.json.
#
# DBSCAN assigns the label -1 to noise points (points that belong to no cluster).
# That label is still a group in centroids_df, so it is masked out before taking
# idxmax(); otherwise a large noise group could be reported as the "largest cluster".
# If every point is noise the selection is empty and idxmax() raises instead of
# silently writing a meaningless centroid.
largest_cluster = centroids_df.loc[centroids_df.index != -1, 'count'].idxmax()
centroids_df.loc[largest_cluster, ['northing', 'easting']].to_json('question_1.json')

In [None]:
# Check the submission: print the contents of question_1.json from the working directory.
!cat question_1.json