In [None]:
import polars as pl
from function.clean_spm import clean
from function.preprocessing import spm
from function.descriptive import descriptive
from function.visualisation import visual
import plotly.express as px
from matplotlib import pyplot as plt
import seaborn as sns
import statsmodels.api as sm
from scipy.stats import skew, kurtosis, shapiro, norm, spearmanr
import numpy as np

# read the spm_database
df = clean.read_spm_database(initial_clean=True, parquet_file = True)

# To generate a full list of STR beneficiaries
long_df = pl.concat([df.select("id", "id_ben", "sex_beneficiary", "ori_ben_age", "str_category", "state", "code_parlimen")\
                       .rename({"ori_ben_age":"age", "sex_beneficiary":"sex"})\
                       .with_columns(pl.col("age").cast(pl.Int64)),
                     df.filter(pl.col("id_partner").is_not_null())\
                       .select("id", "id_partner", "sex_partner", "age_partner", "str_category", "state", "code_parlimen")\
                          .rename({"id_partner":"id_ben", "sex_partner":"sex", "age_partner":"age"})\
                            .with_columns(pl.col("age").cast(pl.Int64)),
                     df.filter(pl.col("id_dependent").is_not_null())\
                       .select("id", "id_dependent", "sex_dependent", "age_dependent", "str_category", "state", "code_parlimen")\
                          .rename({"id_dependent":"id_ben", "sex_dependent":"sex", "age_dependent":"age"})\
                            .with_columns(pl.col("age").cast(pl.Int64))])\
          .unique(subset='id_ben')

In [None]:
old_df = long_df.filter(pl.col("age") < 2000, pl.col("age") > 100, pl.col("str_category") == "Household")
old_df

In [None]:
# df.filter(pl.col("age_dependent") > 2000)
ori_df = clean.read_spm_database(initial_clean=True, parquet_file = False)

In [None]:
# To show there is some without address, which will be drop out
long_df.null_count()

In [None]:
# Count the partner
print(df.filter(pl.col("id_partner").is_not_null())\
.select("id_ben", "id_partner").unique().count())

print(df.filter(pl.col("id_partner").is_not_null())\
.select("id_ben", "id_partner").unique(subset = "id_ben").count())

print(df.filter(pl.col("id_partner").is_not_null())\
.select("id_ben", "id_partner").unique(subset = "id_partner").count())

# Count the dependent
print(df.filter(pl.col("id_dependent").is_not_null())\
.select("id_ben", "id_dependent").unique().count())

print(df.filter(pl.col("id_dependent").is_not_null())\
.select("id_ben", "id_dependent").unique(subset = "id_ben").count())

print(df.filter(pl.col("id_dependent").is_not_null())\
.select("id_ben", "id_dependent").unique(subset = "id_dependent").count())

In [None]:
# To show distribution of sex
sex_pt = descriptive.gender_distribution(df)
sex_pt

In [None]:
# Distribution of age
age_df = long_df.filter(pl.col("age").is_not_null(), 
                        pl.col("age") < 100).select("age", "sex").to_pandas()

# Using pandas default describe
descriptive_df = age_df.describe().apply(lambda s: s.apply('{0:,.2f}'.format))

# Calculate the skew and kurtosis
for formula in [np.var, skew, kurtosis]:
    descriptive_df.loc[f"{formula.__name__}"] = [formula(age_df.loc[:,column]) for column in descriptive_df.columns]

# Print the table
print(descriptive_df.apply(lambda s: s.apply('{0:,.2f}'.format)).to_markdown(tablefmt="pretty"))

# To plot the graph
fig, (ax1, ax2, ax3) = plt.subplots(nrows=1, ncols=3, figsize=(15, 3))

# Histogram
ax1.hist(age_df.loc[:,"age"], bins=len(age_df.loc[:,"age"].unique()), edgecolor='black')
ax1.set_title(f'Histogram for {"age"}')
ax1.set_xlabel("age")
ax1.set_ylabel('Frequency')

# QQ plot
sm.qqplot(age_df.loc[:,"age"], ax=ax2, line='45', fit = True)
ax2.set_title(f'QQ Plot for {"age"}')

# Box plot
sns.boxplot(age_df.loc[:,"age"], ax=ax3)
ax3.set_title(f'Box Plot for {"age"}')

# Adjust spacing between subplots
plt.subplots_adjust(wspace=0.3)

# Display the plot
plt.show()

In [None]:
px.histogram(age_df, x="age")
px.box(age_df.loc[:,"age"])

In [None]:
# To show distribution of state
state_pt = long_df.group_by("state").len("count")\
                .with_columns((pl.col("count")/len(long_df) * 100).round(2).alias("count_%"))

# Print the result in table format
print(state_pt)

# Plot the chorepleth
visual.draw_chorepleth(map_file_type="state",
                       df = long_df,
                       location = "state",
                       z="len",
                       featureidkey="state",
                       text=("state",),
                       colorbar_title="STR Population Density")

In [None]:
# To show distribution of code_parlimen
parlimen_pt = long_df.group_by("code_parlimen").len("count")\
                .with_columns((pl.col("count")/len(long_df) * 100).round(2).alias("count_%"))

# Print the result in table format
# print(parlimen_pt)

# Plot the chorepleth
visual.draw_chorepleth(map_file_type="parlimen",
                       df = long_df,
                       location = "code_parlimen",
                       z="len",
                       featureidkey="code_parlimen",
                       text=("parlimen","state"),
                       colorbar_title="STR Population Density")

In [None]:
from function.visualisation import visual
import polars as pl

gp_df = pl.read_excel(visual._gp_file)

visual.draw_chorepleth(map_file_type="district", df=gp_df, location="code_state_district", z="len", featureidkey="code_state_district")

In [None]:
visual.gp_scatter(query=None, text=None, color=("district"), mapbox_style="basic", height=600)

In [None]:
pl.read_excel(visual._gp_file).group_by("district").len("Number of GPs").to_pandas().sort_values("Number of GPs", ascending=False).to_clipboard(index=False)

In [None]:
px.density_mapbox(population, lat="Latitude", lon="Longitude", z="pov_headcount", mapbox_style="open-street-map", radius = 20, zoom=5, height=600)\
.add_trace(px.scatter_mapbox(clinic_data, lat="Latitude", lon="Longitude").data[0])

In [None]:
 # Import necessary packages
import json
import plotly.graph_objects as go

# use json to load the choropleth file
with open(visual.read_map_file(map_file_type="population", read_with_gpd=False)) as file:
    geojson_data = json.load(file)
# geojson_data["features"][0]

px.choropleth(population, geojson=geojson_data, locations="fid", color="pop2023p_adj_2km", 
              color_continuous_scale="Viridis", range_color=(0,12))
population

In [None]:
import polars as pl

pd.read_parquet("data/population/Household Income Districts.parquet")\
    .merge(visual.read_map_file("district", read_with_gpd=True), on="district")

In [None]:
import plotly.express as px
import pandas as pd
from function.visualisation import visual
from dotenv import load_dotenv
import os

# To set the environement
load_dotenv()
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

px.density_mapbox(pd.read_csv("data/map_file/mys_pd_2020_1km_ASCII_XYZ.csv"),
                  lat="Y", lon="X", z="Z", radius=5,
                  labels = {"Z":"Population density per 1km"},
                  mapbox_style="open-street-map", height = 600,
                  zoom=5)\
.add_trace(px.scatter_mapbox(clinic_data, lat="Latitude", lon="Longitude", color="district", mapbox_style="open-street-map").data[0])

In [None]:
import pandas as pd
import geopandas as gpd
import rasterio
from shapely.geometry import Point
import matplotlib.pyplot as plt
from function.visualisation import visual
import plotly.express as px

# , 
district_list = ['14_1', '10_8', '10_1', '10_5', '10_2', '8_3', '7_4', '1_2']

clinic_data = pd.read_excel(visual._gp_file)
clinic_data['geometry'] = clinic_data.apply(lambda row: Point(row['Longitude'], row['Latitude']), axis=1)
clinics = gpd.GeoDataFrame(clinic_data, geometry='geometry', crs='EPSG:4326')  # Assuming WGS84

# with rasterio.open("data/map_file/Malaysia Population Density 2020.tif") as src:
#     population_density = src.read(1)
#     transform = src.transform
#     crs = src.crs

population = visual.read_map_file("population", read_with_gpd=True).set_geometry("geometry")
# population["Latitude"] = population["geometry"].centroid.y
# population["Longitude"] = population["geometry"].centroid.x

def calculate_average_nearest_neighbor(clinic_data, population_data):
    # Convert clinic data to GeoPandas GeoDataFrame
    # clinic_gdf = gpd.GeoDataFrame(clinic_data, 
    #                               geometry=gpd.points_from_xy(clinic_data["Longitude"], clinic_data["Latitude"]))
    
    # Ensure both GeoDataFrames have the same CRS
    # clinic_gdf = clinic_gdf.to_crs('EPSG:4326')

    # Calculate nearest neighbor distances
    distances = []
    for index, clinic in clinic_data.iterrows():
        nearest_distance = clinic["geometry"].distance(population_data.geometry)
        distances.append(nearest_distance)

    # Calculate Average Nearest Neighbor Distance
    return sum(distances) / len(distances)

temp_df = pd.DataFrame()
for district in district_list:
    print(district)
    temp_population = population.query(f"code_state_district == '{district}'")
    ann_temp = calculate_average_nearest_neighbor(clinic_data=clinics.query(f"code_state_district == '{district}'"), 
                                                  population_data=temp_population)

    temp_population.loc[:,"ann"] = ann_temp

    temp_df = pd.concat([temp_df, temp_population.loc[:,("fid", "code_state_district", "ann")]])    

temp_df

In [None]:
import json
with open(visual.read_map_file(map_file_type="population", read_with_gpd=False)) as file:
    geojson_data = json.load(file)

kl_population = population.query("code_state_district == '14_1'")
kl_gp = clinic_data.query("state == 'W.P. Kuala Lumpur'")

kl_population["lat"] = kl_population["geometry"].centroid.y
kl_population["lon"] = kl_population["geometry"].centroid.x

# px.choropleth_mapbox(kl_population, geojson=geojson_data, featureidkey="fid", locations="fid", color="pov_headcount")
px.density_mapbox(kl_population, lat="lat", lon="lon", radius=20, z="pov_headcount", mapbox_style="open-street-map")

In [None]:
district_df = visual.read_map_file("district", read_with_gpd=True)

gpd.sjoin(clinics, district_df)["district_right"].unique()

In [None]:
# population.loc[:,"ann"] = ann_list
# px.histogram(population.query("code_state_district.isin(['1_2'])"), 
#              x="ann", nbins = 50)
population

In [None]:
pl.read_parquet("""data/population/Malaysia Population Table.parquet""")\
.filter(pl.col("date").cast(pl.String) == '2023-01-01')\
.with_columns((pl.col("population")/33379.5 * 100).alias("percentage")).head(7)

In [None]:
# No need to write for all?
# Concentrate on the population first, for 10 districts,
# Merge with parliment to get the parliment involved
# Divide the parliment str number by total population to get a percentage
# Use the percentage to times with the z value from ASCII for density

In [None]:
from function.file import file
from function.provider import gp
from function.population import population
import geopandas as gpd
import pandas as pd
import polars as pl

parlimen_population = pl.read_parquet("data/population/Population Parlimen.parquet")
code_parlimen_district = pl.from_pandas(gpd.read_file(file._map_parlimen).sjoin(
                              gpd.read_file(file._map_district).drop(columns = ["state", "code_state"]))\
                                    .loc[:,("code_parlimen", "district", "code_state_district")])

temp_df = parlimen_population.filter(pl.col("date").cast(pl.String) == "2022-01-01",
                                     pl.col("sex") == "both",
                                     pl.col("age") == "overall",
                                     pl.col("ethnicity") == "overall")\
                             .with_columns(pl.col("population") * 1000).to_pandas()
code_parlimen_df = gpd.read_file(file._map_parlimen).loc[:,("parlimen", "code_parlimen")]
temp_df = pl.from_pandas(temp_df.merge(code_parlimen_df, how="left", on="parlimen"))

long_df = population.convert_str_to_long(df = pl.read_parquet(file._file_spm_parquet))

percentage_df = temp_df.join(long_df.group_by("code_parlimen").len("str_count"),
                             how="left", on="code_parlimen")\
                       .select("parlimen", "code_parlimen", (pl.col("str_count")/pl.col("population")).alias("str_percentage"))

parlimen_population.join(percentage_df, how="left", on="parlimen")\
.join(code_parlimen_district, how="left", on="code_parlimen")\
.write_parquet("data/population/parlimen_district_str.parquet", use_pyarrow=True)

In [None]:
gpd.read_file(file._map_parlimen).sjoin(gpd.read_file(file._map_district).drop(columns = ["state", "code_state"]))\
.loc[:,("code_parlimen", "district", "code_state_district")]

In [None]:
pl.read_parquet(file._population_parlimen)

In [None]:
from function.file import file
from function.provider import gp
from function.population import population
from function.map import map
import geopandas as gpd
import pandas as pd
import polars as pl
import numpy as np
import plotly.express as px

# # To get percentage for population according to lat lon
# parlimen_population = pl.read_parquet(file._population_parlimen)
# parlimen_population_geo = gpd.GeoDataFrame(data=parlimen_population.to_pandas()\
#                                                  .merge(gpd.read_file(file._map_parlimen).drop(columns="state"), 
#                                                         how="left", on="parlimen"),
#                                            geometry="geometry", crs='EPSG:4326')
# # long_df = population.convert_str_to_long(df = pl.read_parquet(file._file_spm_parquet)).group_by("code_parlimen").len("str_count")
# long_df = pl.read_parquet(file._file_spm_parquet).unique(subset="id_ben").group_by("code_parlimen").len("str_count")

# # Prepare on ASCII file
# population_ascii = map.convert_pandas_geopandas(pd.read_csv(file._population_ascii), lat="Y", lon="X")
# population_ascii.loc[:,"ascii_population"] = population_ascii.loc[:,"Z"] * 32447100 /population_ascii.Z.sum() * 1.0287

# # Filter the parliment population date
# temp_df = pl.from_pandas(parlimen_population_geo.drop(columns="geometry"))\
#             .filter(pl.col("date").cast(pl.Date).cast(pl.String) == "2020-01-01",
#                                         pl.col("sex") == "both",
#                                         pl.col("age") == "overall",
#                                         pl.col("ethnicity") == "overall")\
#                             .with_columns(pl.col("population") * 1000)\
#             .group_by("code_parlimen").agg(pl.col("population").sum())

# percentage_df = temp_df.join(long_df, on="code_parlimen")\
#                         .with_columns((pl.col("str_count")/pl.col("population")).alias("str_percentage"))

# # Merge both ascii and parlimen_population
# df = pl.from_pandas(gpd.sjoin(parlimen_population_geo, population_ascii)\
#                     .loc[:,("date", "state", "sex", "age", "ethnicity", "population", "code_parlimen", "parlimen", "X", "Y", "ascii_population")])\
#        .with_columns(pl.col("date").cast(pl.Date))\
#        .join(percentage_df.select("code_parlimen", "str_percentage"), how="left", on ="code_parlimen")\
#        .with_columns((pl.col("ascii_population") * pl.col("str_percentage")).alias("str_ascii"))

# df = map.convert_pandas_geopandas(df.to_pandas(), lat="Y", lon="X")

# final_df = df.sjoin(gpd.read_file(file._map_district).drop(columns="state")).drop(columns = ["geometry", "index_right"])

# final_df = pl.from_pandas(final_df)\
#     .with_columns(pl.col("date").cast(pl.Date))

population = pl.read_parquet(file._population_str_ascii_households)
# geojson_data = map.read_geojson_file(file._map_district)
population = population.filter(pl.col("sex")=="both",
                  pl.col("age")=="overall",
                  pl.col("ethnicity")=="overall",
                  pl.col("date").cast(pl.String)=="2022-01-01")\
.select("X","Y","ascii_population","str_percentage","str_ascii").to_pandas()

In [None]:
px.density_mapbox(population,
                  lat="Y",lon="X", z="str_ascii", radius=10,
                  mapbox_style="open-street-map",
                  color_continuous_scale="rainbow",
                  center={"lat": 4.389059008652357, "lon": 108.65244272591418},
                  zoom=5
)

In [None]:
from function.file import file
from function.provider import gp
from function.population import population
from function.map import map
import geopandas as gpd
import pandas as pd
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from dotenv import load_dotenv
import os

# To set the environement
load_dotenv()
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

geojson_data = map.read_geojson_file(file._map_district)

str_parlimen = pl.read_parquet(file._file_spm_parquet)\
                 .unique(subset="id_ben")\
                 .group_by("code_parlimen").len("str_count").to_pandas()

district = gpd.read_file(file._map_district)
parlimen = gpd.read_file(file._map_parlimen)
# parlime_district = parlimen.sjoin(district.drop(columns="state")).drop(columns="index_right")
population_ascii = map.convert_pandas_geopandas(pd.read_csv(file._population_ascii),lat="Y",lon="X")
temp_population = population_ascii.sjoin(parlimen, how="inner")\
                                  .drop(columns=["state", "index_right", "code_state"])\
                                  .sjoin(district, how="inner")\
                                  .drop(columns=["index_right", "geometry"])
temp_population = temp_population.merge(temp_population.pivot_table(index="code_parlimen", values="Z", aggfunc=sum)\
                      .reset_index().rename(columns={"Z":"parlimen_z"}),
                      how="left", on="code_parlimen")

str_parlimen = pl.read_parquet(file._file_spm_parquet)\
                 .unique(subset="id_ben")\
                 .group_by("code_parlimen").len("str_count").to_pandas()

temp_population = temp_population.merge(str_parlimen, how="left", on="code_parlimen")

temp_population.loc[:,"estimated_str"] = temp_population.loc[:,"str_count"] * temp_population.loc[:,"Z"] / temp_population.loc[:,"parlimen_z"]

In [None]:
def match_ppvgp(add_1):
    address = ppvgp_fac_code.apply(lambda x: fuzz.token_sort_ratio(add_1, x["ALAMAT"]), axis = 1)
    return ppvgp_fac_code.loc[address.idxmax(), "FACILITY CODE"]

# # Solve Facility Code via nearest geometry code
# def dist(lat1, long1, lat2, long2):
#     # Replicating the same formula as mentioned in Wiki convert decimal degrees to radians 
#     lat1, long1, lat2, long2 = map(radians, [lat1, long1, lat2, long2])
#     # Haversine formula 
#     dlon = long2 - long1
#     dlat = lat2 - lat1 
#     a = sin(dlat/2)**2 + cos(lat1) * cos(lat2) * sin(dlon/2)**2
#     c = 2 * asin(sqrt(a)) 
#     # Radius of earth in kilometers is 6371km
#     km = 6371* c
#     return km

# def find_nearest(lat, long):
#     distances = gp_df.apply(lambda row: dist(lat, long, row["Latitude"], row["Longitude"]), axis=1)
#     return gp_df.loc[distances.idxmin(), "id"]


# temp_population["id"] = temp_population.apply(lambda row: find_nearest(row["lat"], row["lon"]), axis = 1)
# temp_population

In [None]:
from function.file import file
from function.provider import gp
from function.population import population
from function.map import map
import geopandas as gpd
import pandas as pd
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from dotenv import load_dotenv
import os

# To set the environement
load_dotenv()
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

temp_population = pd.read_parquet(file._population_str_ascii_households)
temp_population.to_hdf("data/population/str_ascii_household.h5", index=False, key="str")

In [None]:
fig = go.Figure(go.Densitymapbox(lat=temp_population.Y, lon=temp_population.X, z=temp_population.loc[:,"estimated_str"],
                                 radius=20,
                                 autocolorscale = False, 
                                 colorscale="rainbow",
                                 colorbar_title="STR Population",
                                 opacity=0.5,
                                 showlegend=False,
                                 text=None,
                                 zmin=temp_population.loc[:,"estimated_str"].min(), zmax=temp_population.loc[:,"estimated_str"].max()))
fig.update_layout(mapbox_style="basic", 
                  mapbox_accesstoken=os.getenv("MAPBOX_TOKEN"),
                  mapbox_zoom=5, 
                  mapbox_center={"lat": 4.389059008652357, "lon": 108.65244272591418})
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
fig = go.Figure(go.Choroplethmapbox(geojson=geojson_data,
                                            locations=temp_population.loc[:,"parlimen"], 
                                            z=temp_population.loc[:,"str_count"],
                                            featureidkey=f"properties.parlimen",
                                            autocolorscale = False, 
                                            colorscale="rainbow",
                                            colorbar_title="STR Population",
                                            showlegend=False,
                                            text=None,
                                            zmin=temp_population.loc[:,"str_count"].min(), zmax=temp_population.loc[:,"str_count"].max(), 
                                            marker_opacity=0.5, 
                                            marker_line_width=1))
fig.update_layout(mapbox_style="basic", 
                    mapbox_accesstoken=os.getenv("MAPBOX_TOKEN"),
                    mapbox_zoom=5, 
                    mapbox_center={"lat": 4.389059008652357, "lon": 108.65244272591418},
                    )
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
fig.show()

In [None]:
from math import radians, sin, cos, asin, sqrt
from function.file import file
from function.provider import gp
from function.population import population
from function.map import map
import geopandas as gpd
import pandas as pd
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go

from dotenv import load_dotenv
import os

# To set the environement
load_dotenv()
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

df = pd.read_parquet("/Users/wh0102/Downloads/github/mph/geo_project/streamlit/data/population/Population District.parquet")\
.pivot_table(index="district", columns="date", values="population", aggfunc=sum, margins=True)
df.columns

In [1]:
from math import radians, sin, cos, asin, sqrt
from scipy.stats import skew, kurtosis, shapiro, norm, spearmanr, iqr
from function.file import file
from function.provider import gp
from function.population import population
from function.map import map
import geopandas as gpd
import pandas as pd
import polars as pl
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import numpy as np
from dotenv import load_dotenv
import os

# To set the environement
load_dotenv()
px.set_mapbox_access_token(os.getenv("MAPBOX_TOKEN"))

geojson_data = map.read_geojson_file(file._map_district)

population = pd.read_parquet(file._population_str_ascii_gp_households).rename(columns={"X":'lon', "Y":"lat"})
target_population = population.query(f"code_state_district.isin({gp._district_code_list})")               
gp_df = pd.read_excel(file._gp_file)

In [3]:
target_population.loc[:,"observed_distance"] = target_population.loc[:,"distance"] * target_population.loc[:,"estimated_str"]
target_population

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  target_population.loc[:,"observed_distance"] = target_population.loc[:,"distance"] * target_population.loc[:,"estimated_str"]


Unnamed: 0,lon,lat,Z,parlimen,code_parlimen,state,district,code_state,code_district,code_state_district,parlimen_z,str_count,estimated_str,id,clinic_name,address,Latitude,Longitude,distance,observed_distance
17054,116.127916,6.128750,459.608429,P.171 Sepanggar,P.171,Sabah,Kota Kinabalu,12,7,12_7,492728.552631,47255,44.078623,940,MEDISINAR KLINIK & SURGERI,"LOT 34, GROUND FLOOR, BLOCK A-6, POLYTECHNIC C...",6.094047,116.158303,5.116395,225.523666
17368,116.094583,6.120417,195.142136,P.171 Sepanggar,P.171,Sabah,Kota Kinabalu,12,7,12_7,492728.552631,47255,18.715054,940,MEDISINAR KLINIK & SURGERI,"LOT 34, GROUND FLOOR, BLOCK A-6, POLYTECHNIC C...",6.094047,116.158303,7.630956,142.813753
17369,116.119583,6.120417,1060.403320,P.171 Sepanggar,P.171,Sabah,Kota Kinabalu,12,7,12_7,492728.552631,47255,101.697697,940,MEDISINAR KLINIK & SURGERI,"LOT 34, GROUND FLOOR, BLOCK A-6, POLYTECHNIC C...",6.094047,116.158303,5.188919,527.701119
17370,116.127916,6.120417,281.046906,P.171 Sepanggar,P.171,Sabah,Kota Kinabalu,12,7,12_7,492728.552631,47255,26.953728,940,MEDISINAR KLINIK & SURGERI,"LOT 34, GROUND FLOOR, BLOCK A-6, POLYTECHNIC C...",6.094047,116.158303,4.459260,120.193690
17690,116.094583,6.112084,217.092148,P.171 Sepanggar,P.171,Sabah,Kota Kinabalu,12,7,12_7,492728.552631,47255,20.820164,940,MEDISINAR KLINIK & SURGERI,"LOT 34, GROUND FLOOR, BLOCK A-6, POLYTECHNIC C...",6.094047,116.158303,7.325084,152.509445
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
381944,110.261250,1.070417,16.221607,P.198 Puncak Borneo,P.198,Sarawak,Kuching,13,1,13_1,140191.788817,16680,1.930045,911,17TH MILES FAMILY CLINIC,"GROUND FLOOR, LOT 1629 BLK 5 SSLD JALAN KUCHIN...",1.336466,110.407803,33.772913,65.183229
381945,110.269583,1.070417,15.162821,P.198 Puncak Borneo,P.198,Sarawak,Kuching,13,1,13_1,140191.788817,16680,1.804070,911,17TH MILES FAMILY CLINIC,"GROUND FLOOR, LOT 1629 BLK 5 SSLD JALAN KUCHIN...",1.336466,110.407803,33.335879,60.140271
381946,110.277916,1.070417,17.784443,P.198 Puncak Borneo,P.198,Sarawak,Kuching,13,1,13_1,140191.788817,16680,2.115991,911,17TH MILES FAMILY CLINIC,"GROUND FLOOR, LOT 1629 BLK 5 SSLD JALAN KUCHIN...",1.336466,110.407803,32.919120,69.656549
382128,110.261250,1.062084,16.893658,P.198 Puncak Borneo,P.198,Sarawak,Kuching,13,1,13_1,140191.788817,16680,2.010005,911,17TH MILES FAMILY CLINIC,"GROUND FLOOR, LOT 1629 BLK 5 SSLD JALAN KUCHIN...",1.336466,110.407803,34.587486,69.521023


In [46]:
class map:
    _summary_column_name = ["District Name", "Count of Points", "Mean", "Standard Deviation", "Min", "Max", "Median", "Inter-Quarter Range", "Skew", "Kurtosis", "shapiro"]
    _summary_function_list = [len, np.mean, np.std, min, max, np.median, iqr, skew, kurtosis, shapiro]

answer_dict = dict(zip(map._summary_column_name[1:-1],
                      [[formula(target_population["distance"])] for formula in map._summary_function_list[:-1]]))

# Count shapiro first
shapiro_value = shapiro(target_population["distance"])

answer_dict["Shapiro Stats"] = shapiro_value[0]
answer_dict["Shapiro p value"] = shapiro_value[1]



scipy.stats.shapiro: For N > 5000, computed p-value may not be accurate. Current N is 8510.



Unnamed: 0,Count of Points,Mean,Standard Deviation,Min,Max,Median,Inter-Quarter Range,Skew,Kurtosis,Shapiro Stats,Shapiro p value
10 District,8510,5.851097,6.085166,0.007844,34.587486,3.481661,7.634844,1.492881,2.033641,0.826765,5.640144e-70


In [32]:
px.histogram(target_population, x="distance", text_auto=True, marginal="box", 
             labels={'distance':'Distance in km'}, histnorm='probability density')

In [40]:
# for formula in [np.mean, np.std, min, max, np.median, iqr, skew, kurtosis, shapiro]:
    
# Define a custom aggregation function for the Shapiro-Wilk test
def shapiro_test(x):
    stat, p = shapiro(x)
    return (stat, p)

# Perform pivot table operation
pivot_table = target_population.pivot_table(
    index="district", 
    values="distance", 
    aggfunc=[np.mean, np.std, min, max, np.median, iqr, skew, kurtosis, shapiro]
).reset_index()

# Flatten the MultiIndex columns
# pivot_table.columns = ['_'.join(col).strip() if isinstance(col, tuple) else col for col in pivot_table.columns]
pivot_table.columns = ["district", "mean", "standard deviation", "min", "max", "median", "iqr", "skew", "kurtosis", "shapiro"]

# Separate the Shapiro-Wilk test results into two columns
# pivot_table[['shapiro_stat', 'shapiro_p']] = pd.DataFrame(pivot_table['shapiro'].tolist(), index=pivot_table.index)
for index, row in pivot_table.iterrows():
    # Some error cause by this
    pivot_table.loc[index, "Shapiro_stats"] = float(row["shapiro"][0])
    pivot_table.loc[index, "Shapiro_p_value"] = float(row["shapiro"][1])
# # Drop the original shapiro_test column
# pivot_table = pivot_table.drop(columns=['shapiro'])

pivot_table.drop(columns="shapiro")


The provided callable <function mean at 0x1049ba980> is currently using DataFrameGroupBy.mean. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "mean" instead.


The provided callable <function std at 0x1049baac0> is currently using DataFrameGroupBy.std. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "std" instead.


The provided callable <built-in function min> is currently using DataFrameGroupBy.min. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "min" instead.


The provided callable <built-in function max> is currently using DataFrameGroupBy.max. In a future version of pandas, the provided callable will be used directly. To keep current behavior pass the string "max" instead.


The provided callable <function median at 0x104b1b740> is currently using DataFrameGroupBy.median. I

Unnamed: 0,district,mean,standard deviation,min,max,median,iqr,skew,kurtosis,Shapiro_stats,Shapiro_p_value
0,Gombak,4.440584,3.606451,0.007844,14.894828,3.337327,4.937731,0.9694,-0.050979,0.889896,2.208564e-22
1,Johor Bahru,3.634853,3.347368,0.027453,16.032705,2.349377,4.219763,1.313299,1.108875,0.848968,2.783878e-32
2,Kinta,7.358301,6.557695,0.053119,24.73721,4.852479,9.592342,0.918674,-0.309057,0.873772,3.765693e-33
3,Klang,3.537567,3.720585,0.037973,14.996997,1.903555,4.121292,1.452885,1.156979,0.794781,3.93019e-29
4,Kota Kinabalu,4.419318,3.783152,0.073595,18.398427,3.181167,5.429778,1.163423,0.930961,0.880028,5.965255e-17
5,Kuching,10.25926,7.406708,0.072929,34.587486,8.741427,10.417349,0.861808,0.18456,0.931615,3.740258e-29
6,Petaling,0.906497,0.570048,0.041307,3.882908,0.796657,0.728455,1.190089,1.952003,0.922584,9.750883000000001e-17
7,Timur Laut,1.858716,1.204165,0.066443,4.654511,1.736024,1.968279,0.424346,-0.88937,0.949329,3.915816e-05
8,Ulu Langat,5.716903,5.115344,0.052333,18.704206,4.093679,8.314443,0.754187,-0.659912,0.882227,9.06718e-27
9,W.P. Kuala Lumpur,0.710739,0.421291,0.035236,2.425707,0.660009,0.583882,0.863324,0.947616,0.952678,5.91318e-08


In [13]:
px.histogram(target_population, x="distance", nbins=100)

In [2]:
import polars as pl

pl.read_excel("data/information/gp_list.xlsx").write_parquet("data/information/gp_list.parquet", use_pyarrow=True)

id,state,clinic_name,address,Latitude,Longitude,code_state_district,district
i64,str,str,str,f64,f64,str,str
1,"""W.P. Kuala Lumpur""","""QUALITAS HEALTH KLINIK NG DAN …","""NO.25, GROUND FLOOR, JALAN MET…",3.2147435,101.64014,"""14_1""","""W.P. Kuala Lumpur"""
2,"""W.P. Kuala Lumpur""","""POLIKLINIK REN AI BUKIT MALURI""","""23, JALAN BURUNG TIUNG TAMAN B…",3.2010499,101.632171,"""14_1""","""W.P. Kuala Lumpur"""
3,"""W.P. Kuala Lumpur""","""CARECLINICS KLINIK KWAN""","""1-0-12, DIAMOND SQUARE, BLOK C…",3.1956351,101.704895,"""14_1""","""W.P. Kuala Lumpur"""
4,"""W.P. Kuala Lumpur""","""AKTIV CARE CLINIC""","""NO 50A, JALAN DESA BAKTI OFF J…",3.1029965,101.685198,"""14_1""","""W.P. Kuala Lumpur"""
5,"""W.P. Kuala Lumpur""","""KLINIK TANAH AIR""","""NO.1, TINGKAT BAWAH JALAN 19/7…",3.1615818,101.648715,"""14_1""","""W.P. Kuala Lumpur"""
…,…,…,…,…,…,…,…
1253,"""Johor""","""KLINIK DR.ANIS""","""51(GF), 53(GF) , (TINGKAT BAWA…",1.5570161,103.712386,"""1_2""","""Johor Bahru"""
1254,"""Johor""","""KLINIK ANNU DAN SURGERI 24 JAM""","""185, JALAN PAHLAWAN SATU, TAMA…",1.520629,103.66223,"""1_2""","""Johor Bahru"""
1255,"""Johor""","""U.N.I KLINIK""","""NO. 97 (GROUND FLOOR), JALAN S…",1.454711,103.599425,"""1_2""","""Johor Bahru"""
1256,"""Johor""","""POLIKLINIK DR UMA""","""NO 27, (TINGKAT BAWAH), JALAN …",1.492267,103.640674,"""1_2""","""Johor Bahru"""
