## Geometry & Distances

### Run-time (Void this section)

In [2]:
import pandas as pd
import geopandas as gpd
from shapely.geometry import Point
from math import radians, sin, cos, sqrt, atan2

# Inputs: the Karnataka state outline (GeoJSON) and the village table (CSV).
# The CSV is read through geopandas as well, so the result supports the
# geometry column and spatial join used further down.
state_boundary = gpd.read_file(
    '/Users/sid/Library/CloudStorage/OneDrive-DeakinUniversity/UDocs - D/DataSets/INDIAN-SHAPEFILES-master/State Boundary/KARNATAKA_STATE.geojson'
)
villages = gpd.read_file(
    '/Users/sid/Library/CloudStorage/OneDrive-DeakinUniversity/UDocs - D/DataSets/MA/MA Data -  All India All_State/29.csv'
)
print(len(villages))

# Sub-district headquarters, used as the "urban centre" points.
# NOTE(review): this path starts with '//' - probably a typo for '/'; left unchanged.
geojson_file = '//Users/sid/Library/CloudStorage/OneDrive-DeakinUniversity/UDocs - D/DataSets/INDIAN-SHAPEFILES-master/Subdist_hq/KARNATAKA Sub District Hq.geojson'

# Renumber rows 0..n-1 right after loading so idxmin()/.loc lookups on
# urban_cities operate on a clean, unique integer index.
urban_cities = gpd.read_file(geojson_file).reset_index(drop=True)


def create_geometry(row):
    """Build a shapely ``Point`` from a village row's coordinates.

    Args:
        row: Mapping-like row exposing ``'village_longitude'`` and
            ``'village_latitude'``.

    Returns:
        ``Point(longitude, latitude)``, or ``None`` when either coordinate is
        missing (``None``), cannot be parsed as a float, or is not a finite
        number (NaN / infinity).
    """
    # Local import: the file-level math import does not bring in isfinite.
    from math import isfinite

    try:
        longitude = float(row['village_longitude'])
        latitude = float(row['village_latitude'])
    except (TypeError, ValueError):
        # float(None) raises TypeError; float('') or float('abc') raise ValueError.
        return None

    # 'nan' / 'inf' parse successfully but would yield a Point that matches
    # nothing in the spatial join and turns every haversine distance into NaN.
    if not (isfinite(longitude) and isfinite(latitude)):
        return None

    return Point(longitude, latitude)

# One geometry per village row; create_geometry returns None for rows whose
# coordinates cannot be used.
villages['geometry'] = villages.apply(create_geometry, axis=1)

# Perform spatial join: how='left' keeps every village row, and rows that fall
# in no state polygon end up with a missing 'index_right'.
# `predicate` is the current name of the keyword formerly called `op`
# (deprecated by geopandas and later removed).
joined = gpd.sjoin(villages, state_boundary, how='left', predicate='within')

# Blank out the coordinates of points that did not match the state boundary
def replace_latitude(row):
    """Return the row's latitude when the spatial join matched, else ``None``.

    A row counts as matched when ``row['index_right'] >= 0``; a NaN or
    negative ``index_right`` therefore yields ``None``.
    """
    matched = row['index_right'] >= 0
    return row['village_latitude'] if matched else None

def replace_longitude(row):
    """Return the row's longitude when the spatial join matched, else ``None``.

    A row counts as matched when ``row['index_right'] >= 0``; a NaN or
    negative ``index_right`` therefore yields ``None``.
    """
    matched = row['index_right'] >= 0
    return row['village_longitude'] if matched else None

# Overwrite each coordinate column with its boundary-checked version
# (latitude first, then longitude - the two passes are independent).
for column, replacer in (
    ('village_latitude', replace_latitude),
    ('village_longitude', replace_longitude),
):
    joined[column] = joined.apply(replacer, axis=1)

def haversine(lon1, lat1, lon2, lat2):
    """Return the great-circle distance in kilometres between two points.

    Args:
        lon1: Longitude of the first point, in decimal degrees.
        lat1: Latitude of the first point, in decimal degrees.
        lon2: Longitude of the second point, in decimal degrees.
        lat2: Latitude of the second point, in decimal degrees.

    Returns:
        Distance along a sphere of radius 6371 km, using the haversine formula.
    """
    R = 6371.0  # Radius of the Earth in kilometers
    dlon = radians(lon2 - lon1)
    dlat = radians(lat2 - lat1)
    a = sin(dlat / 2)**2 + cos(radians(lat1)) * cos(radians(lat2)) * sin(dlon / 2)**2
    # Floating-point rounding can leave `a` a hair above 1 for (near-)antipodal
    # points (sin^2 + cos^2 is not always exactly 1), which would make
    # sqrt(1 - a) raise "math domain error". Clamp to the valid range.
    a = min(1.0, max(0.0, a))
    c = 2 * atan2(sqrt(a), sqrt(1 - a))
    return R * c

def find_nearest_urban_centre(row):
    """Return ``(name, distance)`` of the urban centre closest to a village row.

    Args:
        row: Row exposing a ``'geometry'`` entry (a point with ``.x``/``.y``)
            or ``None``.

    Returns:
        ``(None, None)`` when the row has no geometry. Otherwise the
        ``'LOC_NAME'`` of the entry in the module-level ``urban_cities`` table
        with the smallest ``haversine`` distance, together with that distance.
    """
    village_point = row['geometry']
    if village_point is None:
        return None, None

    def distance_to(city_point):
        # haversine takes (lon, lat) pairs, i.e. (x, y) of each point.
        return haversine(village_point.x, village_point.y, city_point.x, city_point.y)

    distances = urban_cities.geometry.apply(distance_to)
    closest = distances.idxmin()
    return urban_cities.loc[closest, 'LOC_NAME'], distances.loc[closest]

# Nearest urban centre (name, distance) for every village row; result_type='expand'
# spreads the returned 2-tuple over the two new columns.
joined[['nearest_urban_centre', 'nearest_urban_distance']] = joined.apply(
    find_nearest_urban_centre,
    axis=1,
    result_type='expand',
)

# Strip the coordinate columns, the join bookkeeping column, the geometry and
# the remaining attribute columns (presumably carried over from the state
# boundary layer) before exporting.
joined.drop(
    columns=[
        'village_latitude',
        'village_longitude',
        'index_right',
        'STNAME',
        'STCODE11',
        'STNAME_SH',
        'Shape_Length',
        'Shape_Area',
        'OBJECTID',
        'geometry',
    ],
    inplace=True,
)
print(len(joined))

joined.to_csv('/Users/sid/Desktop/KA.csv', index=False)

KeyboardInterrupt: 

In [None]:
import os
import numpy as np
# Specify the directory containing CSV files
folder_path = '/Users/sid/Library/CloudStorage/OneDrive-DeakinUniversity/UDocs - D/DataSets/ma2020/states'

# List all CSV files in the folder. os.listdir returns entries in arbitrary
# order, so sort them to make the row order of the output reproducible.
csv_files = sorted(file for file in os.listdir(folder_path) if file.endswith('.csv'))
if not csv_files:
    raise FileNotFoundError(f"No CSV files found in {folder_path}")

# Read every file first and concatenate once at the end: calling pd.concat
# inside the loop re-copies all previously accumulated rows on each iteration.
frames = []
for file in csv_files:
    file_path = os.path.join(folder_path, file)
    data = pd.read_csv(file_path)
    frames.append(data)
all_data = pd.concat(frames, ignore_index=True)

# Destination for the concatenated data
output_file = '/Users/sid/Library/CloudStorage/OneDrive-DeakinUniversity/UDocs - D/DataSets/ma2020/concatenated_data.csv'

# Treat distances above 250 as invalid and blank them out
# (presumably kilometres, matching the haversine output - confirm).
all_data['nearest_urban_distance'] = np.where(all_data['nearest_urban_distance'] > 250, np.nan, all_data['nearest_urban_distance'])


all_data.to_csv(output_file, index=False)

print(f"Concatenated data saved to {output_file}")


In [None]:
import pandas as pd

# Load the concatenated table
file_path = '/Users/sid/Desktop/ma2020/concatenated_data.csv'
data = pd.read_csv(file_path)

# Columns from position 16 up to, but not including, the last two
numeric_columns = data.columns[16:-2]

# Coerce those columns to numbers (unparseable values become NaN), then
# discard every row that has a NaN in any of them
data[numeric_columns] = data[numeric_columns].apply(pd.to_numeric, errors='coerce')
data.dropna(subset=numeric_columns, inplace=True)

print(f"Rows remaining after dropping missing values: {len(data)}")

# Same column slice again, restricted to columns with a numeric dtype
columns_to_transpose = data.columns[16:-2]
numeric_columns_to_transpose = [
    col for col in columns_to_transpose
    if pd.api.types.is_numeric_dtype(data[col])
]

# One summary row per variable: range, max, min, mean and median
transposed_data = pd.DataFrame({
    'Variable': numeric_columns_to_transpose,
    'Range': [data[col].max() - data[col].min() for col in numeric_columns_to_transpose],
    'Max': [data[col].max() for col in numeric_columns_to_transpose],
    'Min': [data[col].min() for col in numeric_columns_to_transpose],
    'Average': [data[col].mean() for col in numeric_columns_to_transpose],
    'Median': [data[col].median() for col in numeric_columns_to_transpose],
})

# Save the summary data to a new CSV file
output_transposed_file = '/Users/sid/Desktop/ma2020/summary_data.csv'
transposed_data.to_csv(output_transposed_file, index=False)

print(f"Transposed data saved to {output_transposed_file}")


### Village area

In [None]:
import geopandas as gpd
from shapely.ops import transform
from functools import partial
import pyproj
from geopy.distance import geodesic

# Load the shapefile into a GeoDataFrame
shapefile_path = '/Users/sid/Library/CloudStorage/OneDrive-DeakinUniversity/UDocs - D/DataSets/VIIRS_Monthly_Tiled/2018/tmpif4jtczm.shp'
gdf = gpd.read_file(shapefile_path)

# Geodesic calculator on the WGS84 ellipsoid, used by calculate_area below.
# NOTE(review): Geod is an ellipsoid helper rather than a coordinate reference
# system; nothing here checks that gdf is actually in lon/lat degrees - confirm.
geod = pyproj.Geod(ellps='WGS84')

def calculate_area(row):
    """Return the geodesic area of the row's geometry in square kilometres.

    The previous implementation summed ``lat1 * lat2 * edge_length`` over the
    boundary edges, which is not an area formula. The area is now computed per
    ring with ``Geod.polygon_area_perimeter`` on the module-level ``geod``.

    Args:
        row: Row exposing a ``'geometry'`` entry. Coordinates are assumed to
            be longitude/latitude in degrees (as the original ``geod.inv``
            call also assumed) - confirm the layer's CRS.

    Returns:
        Area in km^2 for ``Polygon`` / ``MultiPolygon`` geometries, with
        interior rings (holes) subtracted; ``0.0`` for a missing geometry or
        any other geometry type.
    """
    geometry = row['geometry']
    if geometry is None:
        return 0.0
    if geometry.geom_type == 'Polygon':
        polygons = [geometry]
    elif geometry.geom_type == 'MultiPolygon':
        polygons = geometry.geoms
    else:
        return 0.0

    def ring_area_m2(ring):
        # polygon_area_perimeter returns a signed area whose sign depends on
        # the ring's winding direction; only the magnitude is wanted here.
        lons, lats = ring.xy
        signed_area, _ = geod.polygon_area_perimeter(lons, lats)
        return abs(signed_area)

    area_m2 = 0.0
    for polygon in polygons:
        if polygon.is_empty:
            continue
        area_m2 += ring_area_m2(polygon.exterior)
        area_m2 -= sum(ring_area_m2(hole) for hole in polygon.interiors)

    # Square metres -> square kilometres.
    return area_m2 / 1e6

# Calculate the area in square kilometers: calculate_area is called once per
# row (axis=1) and its return value is stored in the new 'area_sq_km' column.
gdf['area_sq_km'] = gdf.apply(calculate_area, axis=1)

# # Create a new DataFrame with desired columns
# new_df = gdf[['attribute_column1', 'attribute_column2', 'area_sq_km']]

# # Export the new DataFrame to a CSV file
# csv_output_path = 'output.csv'
# new_df.to_csv(csv_output_path, index=False)

# print(f'CSV file saved at: {csv_output_path}')

In [None]:
# Old column name -> new column name for the village layer.
# NOTE(review): the keys look like field names truncated to 10 characters
# (e.g. 'District c', 'Village co') - verify against the shapefile's schema.
# NOTE(review): 'DTCODE11.x' (a code-like name) is mapped to 'DTNAME', and the
# 'SubDistric' / 'Subdistr_2' entries map a name to itself (no-ops) - confirm
# these are intended.
column_mapping = {
    'DTCODE11.x': 'DTNAME',
    'State code': 'State_code',
    'State Name': 'State',
    'State cens':'State_census_code',
    'District c':'District_census_code', 
    'District N':'District', 
    'District_1':'District_code', 
    'SubDistric':'SubDistric', 
    'Subdistr_1':'Subdistrict',
    'Subdistr_2':'Subdistr_2', 
    'Village co':'village_code', 
    'Village Na':'Village', 
    'Block code':'Block_code', 
    'Block Name':'Block',
}

# Rename in place; keys that are not present in gdf's columns are ignored by
# DataFrame.rename under its default errors='ignore'.
gdf.rename(columns=column_mapping, inplace=True)


In [None]:
# Keep the renamed identifier columns plus the computed area, then drop rows
# with a missing village_code and keep the first row per village_code.
new_df = (
    gdf[[
        'DTNAME',
        'State_code',
        'State',
        'State_census_code',
        'District_census_code',
        'District',
        'District_code',
        'SubDistric',
        'Subdistrict',
        'Subdistr_2',
        'village_code',
        'Village',
        'Block_code',
        'Block',
        'area_sq_km',
    ]]
    .dropna(subset=['village_code'])
    .drop_duplicates(subset=['village_code'])
)
new_df

# # Export the new DataFrame to a CSV file
# csv_output_path = '/Users/sid/Library/CloudStorage/OneDrive-DeakinUniversity/UDocs - D/DataSets/ma2020/area.csv'
# new_df.to_csv(csv_output_path, index=False)

# print(f'CSV file saved at: {csv_output_path}')