# Match every grid point in India to its nearest city

### Import libraries

In [20]:
import pandas as pd
import geopy
from geopy.distance import great_circle


### Import the CSV files and load them into dataframes

In [21]:
# Load the India grid points and the world cities table from local CSV files.
# NOTE(review): both paths are absolute and user-specific, so this cell only
# runs as-is on the original author's machine.
india_grid = pd.read_csv("/Users/williamadriel/Desktop/Yara/merged_grid.csv", index_col=False)
cities = pd.read_csv("/Users/williamadriel/Desktop/skyweathercsv/worldcities.csv")

In [22]:
# Preview the first rows of the grid table
india_grid.head()

Unnamed: 0.1,Unnamed: 0,lon,lat,id,index_right,NAME_1,NAME_2,NAME_0
0,77077,68.19,23.61,,147,Gujarat,Kachchh,India
1,77078,68.19,23.62,,147,Gujarat,Kachchh,India
2,77079,68.19,23.63,,147,Gujarat,Kachchh,India
3,80652,68.2,23.6,,147,Gujarat,Kachchh,India
4,80653,68.2,23.61,,147,Gujarat,Kachchh,India


In [23]:
# Display the world cities table (the notebook shows a truncated view)
cities

Unnamed: 0,city,city_ascii,lat,lng,country,iso2,iso3,admin_name,capital,population,id
0,Tokyo,Tokyo,35.6850,139.7514,Japan,JP,JPN,Tōkyō,primary,35676000.0,1392685764
1,New York,New York,40.6943,-73.9249,United States,US,USA,New York,,19354922.0,1840034016
2,Mexico City,Mexico City,19.4424,-99.1310,Mexico,MX,MEX,Ciudad de México,primary,19028000.0,1484247881
3,Mumbai,Mumbai,19.0170,72.8570,India,IN,IND,Mahārāshtra,admin,18978000.0,1356226629
4,São Paulo,Sao Paulo,-23.5587,-46.6250,Brazil,BR,BRA,São Paulo,admin,18845000.0,1076532519
...,...,...,...,...,...,...,...,...,...,...,...
15488,Timmiarmiut,Timmiarmiut,62.5333,-42.2167,Greenland,GL,GRL,Kujalleq,,10.0,1304206491
15489,Cheremoshna,Cheremoshna,51.3894,30.0989,Ukraine,UA,UKR,Kyyivs’ka Oblast’,,0.0,1804043438
15490,Ambarchik,Ambarchik,69.6510,162.3336,Russia,RU,RUS,Sakha (Yakutiya),,0.0,1643739159
15491,Nordvik,Nordvik,74.0165,111.5100,Russia,RU,RUS,Krasnoyarskiy Kray,,0.0,1643587468


### Keep only the lat/lon columns of the india_grid dataframe

In [24]:
# Keep just the two coordinate columns and rename 'lon' to 'lng' so the grid
# uses the same column names as the cities table
india_grid = india_grid[['lon', 'lat']].rename(columns={'lon': 'lng'})

# Pair the coordinates of every row as a (lat, lng) tuple
india_grid['point'] = list(zip(india_grid['lat'], india_grid['lng']))

# Preview the prepared grid
india_grid.head()

Unnamed: 0,lng,lat,point
0,68.19,23.61,"(23.61, 68.19)"
1,68.19,23.62,"(23.62, 68.19)"
2,68.19,23.63,"(23.63, 68.19)"
3,68.2,23.6,"(23.6, 68.2)"
4,68.2,23.61,"(23.61, 68.2)"


### Keep only the lat, lng and city columns of the Indian cities in the cities dataframe

In [25]:
# Select the Indian cities with only the columns needed for matching.
# reset_index returns a fresh, 0-based frame, so the column added below is
# written to an independent DataFrame rather than to a slice of `cities`,
# and no in-place mutation is needed.
india_cities = cities.loc[
    cities['country'] == 'India', ['lat', 'lng', 'city']
].reset_index(drop=True)

# Pair the coordinates of every city as a (lat, lng) tuple
india_cities['point'] = list(zip(india_cities['lat'], india_cities['lng']))

# Preview the prepared cities table
india_cities.head()

Unnamed: 0,lat,lng,city,point
0,19.017,72.857,Mumbai,"(19.017, 72.857)"
1,28.67,77.23,Delhi,"(28.67, 77.23)"
2,22.495,88.3247,Kolkata,"(22.495, 88.3247)"
3,13.09,80.28,Chennai,"(13.09, 80.28)"
4,12.97,77.56,Bengalūru,"(12.97, 77.56)"


### Define boundaries function 

In [26]:
# Compute the latitude/longitude bounding box around a point
def boundaries(point, span):
    """Return the bounding box that extends ``span`` on each side of ``point``.

    Parameters
    ----------
    point : indexable
        Item 0 is used as the latitude, item 1 as the longitude.
    span : number
        Amount subtracted from and added to each coordinate.

    Returns
    -------
    dict
        Keys 'lat_min', 'lat_max', 'lng_min' and 'lng_max'.
    """
    lat, lng = point[0], point[1]

    return {
        'lat_min': lat - span,
        'lat_max': lat + span,
        'lng_min': lng - span,
        'lng_max': lng + span,
    }



### Define subset function

In [27]:
# Restrict a cities table to the rows inside the bounding box around a point
def subset(point, span, df):
    """Return the rows of ``df`` whose coordinates fall inside the bounding box.

    Parameters
    ----------
    point : indexable
        (lat, lng) of the box centre, as expected by ``boundaries``.
    span : number
        Half-width of the box, applied to both latitude and longitude.
    df : pandas.DataFrame
        Table with numeric 'lat' and 'lng' columns.

    Returns
    -------
    pandas.DataFrame
        The rows of ``df`` with lat and lng inside the box (bounds inclusive).
        Empty when no row qualifies.
    """
    bounds = boundaries(point, span)

    # Build the masks from ``df`` itself (not from a notebook-level table) so
    # the filter always matches the frame being indexed.
    in_lat = (df['lat'] >= bounds['lat_min']) & (df['lat'] <= bounds['lat_max'])
    in_lng = (df['lng'] >= bounds['lng_min']) & (df['lng'] <= bounds['lng_max'])

    return df.loc[in_lat & in_lng]


### Define all_distances function

In [28]:
# Find the city nearest to a point among the cities inside its bounding box
def all_distances(df_point, span, df_cities):
    """Return the nearest city to ``df_point`` and its great-circle distance.

    Only the cities returned by ``subset(df_point, span, df_cities)`` are
    considered, so a city outside the bounding box is never selected.

    Parameters
    ----------
    df_point : tuple
        (lat, lng) of the point to match.
    span : number
        Half-width of the search box, passed through to ``subset``.
    df_cities : pandas.DataFrame
        Candidate cities; must provide 'point' and 'city' columns in addition
        to whatever ``subset`` filters on.

    Returns
    -------
    tuple
        ``(city, dist)`` where ``dist`` is the object returned by
        ``great_circle``. ``(None, None)`` when no city lies inside the box.
    """
    data = subset(df_point, span, df_cities).copy()

    # No candidate inside the box: report "no match" instead of failing on an
    # empty selection further down.
    if data.empty:
        return None, None

    data['dist'] = data.apply(
        lambda row: great_circle(row['point'], df_point),
        axis=1
    )

    min_dist = data['dist'].min()

    # Compute the nearest-row mask once; on ties the first row wins.
    is_nearest = data['dist'] == min_dist
    result = data.loc[is_nearest, 'city'].values[0]
    result2 = data.loc[is_nearest, 'dist'].values[0]

    return result, result2
    
# Spot-check all_distances on a single grid point (row label 1000000) with a span of 5
all_distances(india_grid.loc[1000000, 'point'], 5, india_cities)


('Hisar', Distance(21.596579076951926))

### Create a small test sample to check that all the functions work properly

In [29]:
# Take a small sample of the grid for a quick trial run.
# .loc slicing is label-inclusive, so labels 0 through 1000 are selected
# (1001 rows with the default integer index).
india_grid_test = india_grid.loc[:1000]

india_grid_test

Unnamed: 0,lng,lat,point
0,68.19,23.61,"(23.61, 68.19)"
1,68.19,23.62,"(23.62, 68.19)"
2,68.19,23.63,"(23.63, 68.19)"
3,68.20,23.60,"(23.6, 68.2)"
4,68.20,23.61,"(23.61, 68.2)"
...,...,...,...
996,68.49,23.77,"(23.77, 68.49)"
997,68.49,23.78,"(23.78, 68.49)"
998,68.49,23.78,"(23.78, 68.49)"
999,68.49,23.79,"(23.79, 68.49)"


In [30]:
# Time the nearest-city lookup on the sample; each row yields a (city, distance) tuple
%time result = india_grid_test.apply(lambda row: all_distances(row['point'], 5, india_cities), axis=1)

result

CPU times: user 4.65 s, sys: 23.4 ms, total: 4.68 s
Wall time: 4.68 s


0        (Bhuj, 170.0496560429935 km)
1       (Bhuj, 170.30846098623525 km)
2        (Bhuj, 170.5741204277332 km)
3        (Bhuj, 168.8047120128151 km)
4       (Bhuj, 169.05818635497488 km)
                    ...              
996     (Bhuj, 146.46921769667784 km)
997     (Bhuj, 146.90668906530442 km)
998     (Bhuj, 146.90668906530442 km)
999      (Bhuj, 147.3512516118195 km)
1000    (Bhuj, 147.80284134953436 km)
Length: 1001, dtype: object

### Compute every single point and then write it into a csv 

#### The code below takes a long time to run (~4 hours) on a machine with 8 GB of memory

#### A .py version of this code is available if you want to run it on a remote server rather than on your local machine

In [31]:
# Run the nearest-city lookup over the full grid.
# The new 'city' column temporarily holds the (city, distance) tuples
# returned by all_distances.

result = india_grid.apply(lambda row: all_distances(row['point'], 5, india_cities), axis=1)

india_grid['city'] = result

print("result success")

result success


In [32]:
# Unpack the (city, distance) tuples into a two-column frame
a = pd.DataFrame(india_grid['city'].tolist(), columns=['city', 'dist'])

# Replace the tuple column by the two unpacked columns, matching rows on the index
india_grid2 = pd.merge(
    india_grid.drop(columns='city'),
    a,
    how='left',
    left_index=True,
    right_index=True,
)

#### Check if the function can properly run

In [33]:
# Display the grid with the matched city and distance columns
india_grid2

Unnamed: 0,lng,lat,point,city,dist
0,68.19,23.61,"(23.61, 68.19)",Bhuj,170.0496560429935 km
1,68.19,23.62,"(23.62, 68.19)",Bhuj,170.30846098623525 km
2,68.19,23.63,"(23.63, 68.19)",Bhuj,170.5741204277332 km
3,68.20,23.60,"(23.6, 68.2)",Bhuj,168.8047120128151 km
4,68.20,23.61,"(23.61, 68.2)",Bhuj,169.05818635497488 km
...,...,...,...,...,...
3772724,97.40,28.02,"(28.02, 97.4)",,
3772725,97.40,28.03,"(28.03, 97.4)",,
3772726,97.40,28.03,"(28.03, 97.4)",,
3772727,97.40,28.20,"(28.2, 97.4)",,


In [35]:
# Write the matched grid to a CSV file, leaving out the DataFrame index

india_grid2.to_csv(r'/Users/williamadriel/Desktop/Yara/india_grid_cities.csv', index=False)