In [5]:
import pandas as pd
import geopandas as gpd
import folium
from sklearn.cluster import KMeans


In [6]:
import pandas as pd
df = pd.read_csv('Indonesian Salary by Region (1997-2025).csv')
df.head()


FileNotFoundError: [Errno 2] No such file or directory: 'Indonesian Salary by Region (1997-2025).csv'

In [None]:
import geopandas as gpd
gdf = gpd.read_file('gadm41_IDN_1.json')
gdf.head()


In [None]:
df_5yr = df[(df['YEAR'] >= 2020) & (df['YEAR'] <= 2025)]


In [None]:
df_5yr['YEAR'].unique()


In [None]:
salary_prov = (
    df_5yr
    .groupby('REGION')['SALARY']
    .mean()
    .reset_index()
)


In [None]:
# ===============================
# 1️⃣ Import library
# ===============================
import pandas as pd
import geopandas as gpd
from sklearn.cluster import KMeans
import folium

# ===============================
# 2️⃣ Baca CSV dan GeoJSON
# ===============================
df = pd.read_csv("Indonesian Salary by Region (1997-2025).csv")
gdf = gpd.read_file("gadm41_IDN_1.json")

# Bersihkan nama provinsi
gdf['NAME_1'] = gdf['NAME_1'].str.strip()
df['REGION'] = df['REGION'].str.strip()

# ===============================
# 3️⃣ Mapping nama provinsi CSV → GeoJSON
# ===============================
mapping = {
    'Aceh':'Aceh',
    'Sumatera Utara':'Sumatera Utara',
    'Sumatera Barat':'Sumatera Barat',
    'Riau':'Riau',
    'Jambi':'Jambi',
    'Sumatera Selatan':'Sumatera Selatan',
    'Bengkulu':'Bengkulu',
    'Lampung':'Lampung',
    'Bangka Belitung':'Bangka Belitung',
    'Kepulauan Riau':'Kepulauan Riau',
    'DKI Jakarta':'Jakarta',
    'Jawa Barat':'Jawa Barat',
    'Jawa Tengah':'Jawa Tengah',
    'DI Yogyakarta':'Yogyakarta',
    'Jawa Timur':'Jawa Timur',
    'Banten':'Banten',
    'Bali':'Bali',
    'Nusa Tenggara Barat':'Nusa Tenggara Barat',
    'Nusa Tenggara Timur':'Nusa Tenggara Timur',
    'Kalimantan Barat':'Kalimantan Barat',
    'Kalimantan Tengah':'Kalimantan Tengah',
    'Kalimantan Selatan':'Kalimantan Selatan',
    'Kalimantan Timur':'Kalimantan Timur',
    'Kalimantan Utara':'Kalimantan Utara',
    'Sulawesi Utara':'Sulawesi Utara',
    'Sulawesi Tengah':'Sulawesi Tengah',
    'Sulawesi Selatan':'Sulawesi Selatan',
    'Sulawesi Tenggara':'Sulawesi Tenggara',
    'Sulawesi Barat':'Sulawesi Barat',
    'Maluku':'Maluku',
    'Maluku Utara':'Maluku Utara',
    'Papua':'Papua',
    'Papua Barat':'Papua Barat'
}
df['REGION'] = df['REGION'].str.strip().replace(mapping)
gdf['NAME_1'] = gdf['NAME_1'].str.strip()


# ===============================
# 4️⃣ Pilih tahun 2020–2025 dan buat cluster
# ===============================
years = [2020, 2021, 2022, 2023, 2024, 2025]
cluster_dfs = []

for y in years:
    df_year = df[df['YEAR']==y].copy()
    
    # Hanya ambil baris yang SALARY tidak NaN
    df_year = df_year[df_year['SALARY'].notna()]
    
    X = df_year[['SALARY']]
    
    kmeans = KMeans(n_clusters=3, random_state=42, n_init=10)
    df_year['cluster'] = kmeans.fit_predict(X)
    
    df_year['YEAR'] = y
    cluster_dfs.append(df_year[['REGION','YEAR','cluster']])

# Gabungkan semua tahun
df_cluster = pd.concat(cluster_dfs, ignore_index=True)

# ===============================
# 5️⃣ Merge dengan GeoJSON
# ===============================
# Contoh buat peta tahun 2020, bisa loop untuk tiap tahun
df_2020 = df_cluster[df_cluster['YEAR']==2020]
gdf_2020 = gdf.merge(
    df_2020[['REGION','cluster']],
    left_on='NAME_1',
    right_on='REGION',
    how='left'
)

# ===============================
# 6️⃣ Buat peta Folium
# ===============================
m = folium.Map(location=[-2, 118], zoom_start=5, tiles='cartodbpositron')

cluster_colors = {0:'#66c2a5', 1:'#fc8d62', 2:'#ffd92f'}

folium.GeoJson(
    gdf_2020.__geo_interface__,  # Embed GeoJSON agar HTML standalone
    style_function=lambda f: {
        'fillColor': cluster_colors.get(f['properties']['cluster'], '#cccccc'),
        'color':'black',
        'weight':0.5,
        'fillOpacity':0.8
    },
    tooltip=folium.GeoJsonTooltip(
        fields=['NAME_1','cluster'],
        aliases=['Provinsi','Cluster']
    )
).add_to(m)

# ===============================
# 7️⃣ Tambahkan legend
# ===============================
legend_html = """
<div style="
position: fixed;
top: 20px; right: 20px;
width: 220px;
background-color: white;
border:1.5px solid grey;
z-index:9999;
font-size:11px;
padding: 8px;
line-height:1.3;
">
<b>Cluster Upah Minimum Provinsi</b><br>(2020–2025)<br><br>

<i style="background:#66c2a5;width:14px;height:14px;float:left;margin-right:6px;"></i>
<b>Rendah</b><br>
<i>Rp1.919.745 – Rp2.821.295</i><br><br>

<i style="background:#fc8d62;width:14px;height:14px;float:left;margin-right:6px;"></i>
<b>Menengah</b><br>
<i>Rp2.935.736 – Rp3.795.024</i><br><br>

<i style="background:#ffd92f;width:14px;height:14px;float:left;margin-right:6px;"></i>
<b>Tinggi</b><br>
<i>Rp4.751.866</i>
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# ===============================
# 8️⃣ Simpan HTML
# ===============================
m.save("cluster_upah_minimum_2020.html")


In [None]:
# Nama provinsi di CSV
print(sorted(df['REGION'].unique()))

# Nama provinsi di GeoJSON
print(sorted(gdf['NAME_1'].unique()))


In [None]:
mapping = {
    'Yogyakarta': 'DI YOGYAKARTA',
    'JakartaRaya': 'DKI JAKARTA',
    'BangkaBelitung': 'KEP. BANGKA BELITUNG',
    'KepulauanRiau': 'KEP. RIAU',
    'NusaTenggaraBarat': 'NUSA TENGGARA BARAT',
    'NusaTenggaraTimur': 'NUSA TENGGARA TIMUR',
    'SulawesiBarat': 'SULAWESI BARAT',
    'SulawesiSelatan': 'SULAWESI SELATAN',
    'SulawesiTengah': 'SULAWESI TENGAH',
    'SulawesiTenggara': 'SULAWESI TENGGARA',
    'SulawesiUtara': 'SULAWESI UTARA',
    'SumateraBarat': 'SUMATERA BARAT',
    'SumateraSelatan': 'SUMATERA SELATAN',
    'SumateraUtara': 'SUMATERA UTARA',
    'Jakarta': 'DKI JAKARTA',  # tambahan jika ada
    'Aceh':'ACEH',
    'Bali':'BALI',
    'Banten':'BANTEN',
    'Bengkulu':'BENGKULU',
    'Gorontalo':'GORONTALO',
    'Jambi':'JAMBI',
    'JawaBarat':'JAWA BARAT',
    'JawaTengah':'JAWA TENGAH',
    'JawaTimur':'JAWA TIMUR',
    'KalimantanBarat':'KALIMANTAN BARAT',
    'KalimantanSelatan':'KALIMANTAN SELATAN',
    'KalimantanTengah':'KALIMANTAN TENGAH',
    'KalimantanTimur':'KALIMANTAN TIMUR',
    'KalimantanUtara':'KALIMANTAN UTARA',
    'Lampung':'LAMPUNG',
    'Maluku':'MALUKU',
    'MalukuUtara':'MALUKU UTARA',
    'Papua':'PAPUA',
    'PapuaBarat':'PAPUA BARAT',
    'Riau':'RIAU',
    'SumatraBarat':'SUMATRA BARAT',
    'SumatraSelatan':'SUMATRA SELATAN',
    'SumatraUtara':'SUMATRA UTARA',
    'Yogyakarta':'DI YOGYAKARTA'
}


In [None]:
# Terapkan mapping ke CSV
df['REGION'] = df['REGION'].replace(mapping)

# Merge dengan GeoJSON
gdf = gdf.merge(
    df[['REGION','YEAR','SALARY']],
    left_on='NAME_1',
    right_on='REGION',
    how='left'
)

# Cek hasil merge
print(gdf[['NAME_1','REGION','SALARY']])


In [20]:
df.columns


Index(['REGION', 'SALARY', 'YEAR'], dtype='object')

In [21]:
df_year = df[df['YEAR'] == 2025]


In [22]:
salary_prov = (
    df_year
    .groupby('REGION')['SALARY']
    .mean()
    .reset_index()
)


In [19]:
import pandas as pd
import geopandas as gpd

df = pd.read_csv('Indonesian Salary by Region (1997-2025).csv')
gdf_prov = gpd.read_file('gadm41_IDN_1.json')

In [23]:
gdf_prov['province'] = gdf_prov['NAME_1'].str.upper()
salary_prov['REGION'] = salary_prov['REGION'].str.upper()


In [25]:
gdf = gdf_prov.merge(salary_prov, left_on='province', right_on='REGION')
gdf.head()

Unnamed: 0,GID_1,GID_0,COUNTRY,NAME_1,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,ISO_1,geometry,province,REGION,SALARY
0,IDN.1_1,IDN,Indonesia,Aceh,,,Propinisi,Province,11,ID.AC,ID-AC,"MULTIPOLYGON (((96.68130 2.09740, 96.68720 2.0...",ACEH,ACEH,3685615.0
1,IDN.2_1,IDN,Indonesia,Bali,,,Propinisi,Province,51,ID.BA,ID-BA,"MULTIPOLYGON (((115.52570 -8.80450, 115.52540 ...",BALI,BALI,2996560.0
2,IDN.4_1,IDN,Indonesia,Banten,,,Propinisi,Province,36,ID.BT,ID-BT,"MULTIPOLYGON (((106.38680 -6.99070, 106.38500 ...",BANTEN,BANTEN,2905119.0
3,IDN.5_1,IDN,Indonesia,Bengkulu,,,Propinisi,Province,17,ID.BE,ID-BE,"MULTIPOLYGON (((103.57300 -4.92300, 103.57130 ...",BENGKULU,BENGKULU,2670039.0
4,IDN.6_1,IDN,Indonesia,Gorontalo,,,Propinisi,Province,75,ID.GO,ID-GO,"MULTIPOLYGON (((123.54910 0.43250, 123.53990 0...",GORONTALO,GORONTALO,3221731.0


In [26]:
from sklearn.cluster import KMeans

X = gdf[['SALARY']]

kmeans = KMeans(n_clusters=3, random_state=42)
gdf['cluster'] = kmeans.fit_predict(X)


  super()._check_params_vs_input(X, default_n_init=10)


In [27]:
from sklearn.cluster import KMeans

X = gdf[['SALARY']]

kmeans = KMeans(
    n_clusters=3,
    random_state=42,
    n_init=10
)

gdf['cluster'] = kmeans.fit_predict(X)




In [9]:
import os
os.environ["OMP_NUM_THREADS"] = "1"


In [28]:
import folium

m = folium.Map(location=[-2, 118], zoom_start=5)

folium.Choropleth(
    geo_data=gdf,
    data=gdf,
    columns=['REGION', 'cluster'],
    key_on='feature.properties.REGION',
    fill_color='YlOrRd',
    legend_name='Cluster Upah Minimum Provinsi Indonesia'
).add_to(m)

m


In [29]:
df_5yr = df[(df['YEAR'] >= 2020) & (df['YEAR'] <= 2025)]


In [30]:
df_5yr['YEAR'].unique()

array([2020, 2021, 2022, 2023, 2024, 2025], dtype=int64)

In [31]:
salary_prov = (
    df_5yr
    .groupby('REGION')['SALARY']
    .mean()
    .reset_index()
)


In [32]:
gdf = gdf_prov.merge(salary_prov, left_on='province', right_on='REGION')
gdf.head()

Unnamed: 0,GID_1,GID_0,COUNTRY,NAME_1,VARNAME_1,NL_NAME_1,TYPE_1,ENGTYPE_1,CC_1,HASC_1,ISO_1,geometry,province,REGION,SALARY
0,IDN.1_1,IDN,Indonesia,Aceh,,,Propinisi,Province,11,ID.AC,ID-AC,"MULTIPOLYGON (((96.68130 2.09740, 96.68720 2.0...",ACEH,ACEH,3342746.0
1,IDN.2_1,IDN,Indonesia,Bali,,,Propinisi,Province,51,ID.BA,ID-BA,"MULTIPOLYGON (((115.52570 -8.80450, 115.52540 ...",BALI,BALI,2671479.0
2,IDN.4_1,IDN,Indonesia,Banten,,,Propinisi,Province,36,ID.BT,ID-BT,"MULTIPOLYGON (((106.38680 -6.99070, 106.38500 ...",BANTEN,BANTEN,2619568.0
3,IDN.5_1,IDN,Indonesia,Bengkulu,,,Propinisi,Province,17,ID.BE,ID-BE,"MULTIPOLYGON (((103.57300 -4.92300, 103.57130 ...",BENGKULU,BENGKULU,2489277.0
4,IDN.6_1,IDN,Indonesia,Gorontalo,,,Propinisi,Province,75,ID.GO,ID-GO,"MULTIPOLYGON (((123.54910 0.43250, 123.53990 0...",GORONTALO,GORONTALO,2935736.0


In [33]:
from sklearn.cluster import KMeans

X = gdf[['SALARY']]

kmeans = KMeans(
    n_clusters=3,
    random_state=42,
    n_init=10
)

gdf['cluster'] = kmeans.fit_predict(X)




In [34]:
gdf[['REGION', 'SALARY', 'cluster']].head()


Unnamed: 0,REGION,SALARY,cluster
0,ACEH,3342746.0,1
1,BALI,2671479.0,2
2,BANTEN,2619568.0,2
3,BENGKULU,2489277.0,2
4,GORONTALO,2935736.0,1


In [35]:
print("CSV:", salary_prov['REGION'].nunique())
print("GEOJSON:", gdf_prov['NAME_1'].nunique())


CSV: 42
GEOJSON: 34


In [36]:
gdf_prov.columns


Index(['GID_1', 'GID_0', 'COUNTRY', 'NAME_1', 'VARNAME_1', 'NL_NAME_1',
       'TYPE_1', 'ENGTYPE_1', 'CC_1', 'HASC_1', 'ISO_1', 'geometry',
       'province'],
      dtype='object')

In [37]:
salary_prov.columns


Index(['REGION', 'SALARY'], dtype='object')

In [38]:
gdf_prov = gdf_prov.drop(columns=['REGION', 'province'], errors='ignore')


In [39]:
gdf_prov.columns


Index(['GID_1', 'GID_0', 'COUNTRY', 'NAME_1', 'VARNAME_1', 'NL_NAME_1',
       'TYPE_1', 'ENGTYPE_1', 'CC_1', 'HASC_1', 'ISO_1', 'geometry'],
      dtype='object')

In [40]:
gdf_prov['NAME_1'].head(34)

0                  Aceh
1                  Bali
2        BangkaBelitung
3                Banten
4              Bengkulu
5             Gorontalo
6           JakartaRaya
7                 Jambi
8             JawaBarat
9            JawaTengah
10            JawaTimur
11      KalimantanBarat
12    KalimantanSelatan
13     KalimantanTengah
14      KalimantanTimur
15      KalimantanUtara
16        KepulauanRiau
17              Lampung
18               Maluku
19          MalukuUtara
20    NusaTenggaraBarat
21    NusaTenggaraTimur
22                Papua
23           PapuaBarat
24                 Riau
25        SulawesiBarat
26      SulawesiSelatan
27       SulawesiTengah
28     SulawesiTenggara
29        SulawesiUtara
30        SumateraBarat
31      SumateraSelatan
32        SumateraUtara
33           Yogyakarta
Name: NAME_1, dtype: object

In [41]:
gdf_prov['REGION'] = gdf_prov['NAME_1'].str.upper().str.strip()
salary_prov['REGION'] = salary_prov['REGION'].str.upper().str.strip()

In [42]:
cek = gdf_prov.merge(
    salary_prov,
    on='REGION',
    how='left'
)

cek[['NAME_1', 'REGION', 'SALARY']].head(34)

Unnamed: 0,NAME_1,REGION,SALARY
0,Aceh,ACEH,3342746.0
1,Bali,BALI,2671479.0
2,BangkaBelitung,BANGKABELITUNG,
3,Banten,BANTEN,2619568.0
4,Bengkulu,BENGKULU,2489277.0
5,Gorontalo,GORONTALO,2935736.0
6,JakartaRaya,JAKARTARAYA,
7,Jambi,JAMBI,2953984.0
8,JawaBarat,JAWABARAT,
9,JawaTengah,JAWATENGAH,


In [43]:
cek[cek['SALARY'].isna()][['NAME_1']]

Unnamed: 0,NAME_1
2,BangkaBelitung
6,JakartaRaya
8,JawaBarat
9,JawaTengah
10,JawaTimur
11,KalimantanBarat
12,KalimantanSelatan
13,KalimantanTengah
14,KalimantanTimur
15,KalimantanUtara


In [44]:
mapping = {
    'DKI JAKARTA': 'JAKARTA RAYA',
    'DI YOGYAKARTA': 'YOGYAKARTA',
    'BANGKA BELITUNG': 'KEP. BANGKA BELITUNG',
    'NUSA TENGGARA BARAT': 'WEST NUSA TENGGARA',
    'NUSA TENGGARA TIMUR': 'EAST NUSA TENGGARA',
    'PAPUA BARAT': 'WEST PAPUA',
    'PAPUA': 'PAPUA'
}

salary_prov['REGION'] = salary_prov['REGION'].replace(mapping)


In [45]:
gdf = gdf_prov.merge(
    salary_prov,
    on='REGION',
    how='inner'
)


In [46]:
gdf['REGION'].nunique()


11

In [47]:
cek = gdf_prov.merge(
    salary_prov,
    on='REGION',
    how='left'
)

cek_na = cek[cek['SALARY'].isna()][['NAME_1', 'REGION']]
cek_na


Unnamed: 0,NAME_1,REGION
2,BangkaBelitung,BANGKABELITUNG
6,JakartaRaya,JAKARTARAYA
8,JawaBarat,JAWABARAT
9,JawaTengah,JAWATENGAH
10,JawaTimur,JAWATIMUR
11,KalimantanBarat,KALIMANTANBARAT
12,KalimantanSelatan,KALIMANTANSELATAN
13,KalimantanTengah,KALIMANTANTENGAH
14,KalimantanTimur,KALIMANTANTIMUR
15,KalimantanUtara,KALIMANTANUTARA


In [48]:
gdf_prov['REGION_FIX'] = (
    gdf_prov['NAME_1']
    .str.upper()
    .str.replace(' ', '', regex=False)
)


In [49]:
salary_prov['REGION_FIX'] = (
    salary_prov['REGION']
    .str.upper()
    .str.replace(' ', '', regex=False)
)


In [50]:
gdf = gdf_prov.merge(
    salary_prov,
    on='REGION_FIX',
    how='inner'
)


In [51]:
gdf['REGION_FIX'].nunique()

29

In [52]:
from sklearn.cluster import KMeans

X = gdf[['SALARY']]

kmeans = KMeans(
    n_clusters=3,
    random_state=42,
    n_init=10
)

gdf['cluster'] = kmeans.fit_predict(X)




In [53]:
gdf[['NAME_1', 'SALARY', 'cluster']].head()


Unnamed: 0,NAME_1,SALARY,cluster
0,Aceh,3342746.0,1
1,Bali,2671479.0,0
2,Banten,2619568.0,0
3,Bengkulu,2489277.0,0
4,Gorontalo,2935736.0,1


In [56]:
import folium

# Buat peta dasar
m = folium.Map(location=[-2, 118], zoom_start=5, tiles='cartodbpositron')

# Warna diskrit per cluster
cluster_colors = {
    0: '#66c2a5',  # hijau
    1: '#fc8d62',  # pink
    2: '#ffd92f'   # kuning
}

# GeoJSON harus embed langsung, pakai __geo_interface__
folium.GeoJson(
    gdf.__geo_interface__,  # embed langsung
    style_function=lambda feature: {
        'fillColor': cluster_colors.get(feature['properties']['cluster'], '#cccccc'),
        'color': 'black',
        'weight': 0.5,
        'fillOpacity': 0.8
    },
    tooltip=folium.GeoJsonTooltip(
        fields=['NAME_1', 'cluster'],  # sesuaikan dengan kolom gdf
        aliases=['Provinsi', 'Cluster'],
        localize=True
    )
).add_to(m)

# Tambahkan legend
legend_html = """
<div style="
position: fixed;
top: 20px; right: 20px;
width: 200px;
background-color: white;
border:1.5px solid grey;
z-index:9999;
font-size:11px;
padding: 8px;
line-height:1.3;">
<b>Cluster Gaji per Provinsi</b><br>
Tahun (2020–2025)<br><br>

<i style="background:#66c2a5;width:14px;height:14px;float:left;margin-right:6px;"></i>
<b>Rendah</b><br>
Rp1.919.745 – Rp2.821.295<br>
(15 Provinsi)<br><br>

<i style="background:#fc8d62;width:14px;height:14px;float:left;margin-right:6px;"></i>
<b>Menengah</b><br>
Rp2.935.736 – Rp3.795.024<br>
(13 Provinsi)<br><br>

<i style="background:#ffd92f;width:14px;height:14px;float:left;margin-right:6px;"></i>
<b>Tinggi</b><br>
Rp4.751.866<br>
(1 Provinsi)
</div>
"""
m.get_root().html.add_child(folium.Element(legend_html))

# Simpan HTML
m.save("index.html")


In [2]:
# ke CSV
gdf.to_csv('gdf_merged.csv', index=False)

# atau ke GeoJSON (jika ada geometry)
gdf.to_file('gdf_merged.geojson', driver='GeoJSON')


NameError: name 'gdf' is not defined

In [86]:
!git status
!git add gdf_merged.csv
!git commit -m "Add merged GeoDataFrame"
!git push origin main


'git' is not recognized as an internal or external command,
operable program or batch file.
'git' is not recognized as an internal or external command,
operable program or batch file.
'git' is not recognized as an internal or external command,
operable program or batch file.
'git' is not recognized as an internal or external command,
operable program or batch file.


In [87]:
import os
os.getcwd()

'D:\\SMST 5\\PROPAS\\Anaconda\\UAS'

In [88]:
import os
os.getcwd()  # harus D:\SMST 5\PROPAS\Anaconda\UAS


'D:\\SMST 5\\PROPAS\\Anaconda\\UAS'

In [89]:
os.chdir(r'D:\SMST 5\PROPAS\Anaconda\UAS')


In [90]:
# CSV
gdf.to_csv('gdf_merged.csv', index=False)

# Atau GeoJSON
gdf.to_file('gdf_merged.geojson', driver='GeoJSON')


In [92]:
# CSV
gdf.to_csv('gdf_merged.csv', index=False)

# Atau GeoJSON
gdf.to_file('gdf_merged.geojson', driver='GeoJSON')
