In [4]:
import pandas as pd
import re

In [5]:
# Load the source CSV (path is relative to the notebook's working directory)
df = pd.read_csv("gunung_indonesia.csv")

In [6]:
# Inspect the raw schema: column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 12 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Nama                207 non-null    object 
 1   Provinsi            207 non-null    object 
 2   Kabupaten           207 non-null    object 
 3   Kecamatan           207 non-null    object 
 4   Koordinat           207 non-null    object 
 5   Ketinggian (dpl)    207 non-null    int64  
 6   Jenis Gunung        207 non-null    object 
 7   Status              207 non-null    object 
 8   Akses               207 non-null    object 
 9   Jarak (km)          207 non-null    float64
 10  Jarak (m)           207 non-null    int64  
 11  Elevation gain (m)  207 non-null    int64  
dtypes: float64(1), int64(3), object(8)
memory usage: 19.5+ KB


In [7]:
# The distance is stored twice (km and m); keep only the metre column so the
# units stay consistent with the other metre-based columns.
df = df.drop(columns=['Jarak (km)'])

In [8]:
def clean_column(col):
    """Normalise a column label.

    Every parenthesised group (together with the whitespace in front of it)
    is removed, surrounding whitespace is trimmed and the result is
    lowercased, e.g. ``"Ketinggian (dpl)"`` -> ``"ketinggian"``.

    Args:
        col: Original column label as a string.

    Returns:
        The cleaned, lowercase label.
    """
    without_parens = re.sub(r"\s*\([^)]*\)", "", col)
    return without_parens.strip().lower()

# Apply the label clean-up to every column name.
df.columns = df.columns.map(clean_column)

In [11]:
# Parse coordinate into latitude and longitude
def parse_koordinat(k):
    """Convert a coordinate string to signed decimal latitude/longitude.

    The expected layout is ``"<lat> LU|LS, <lon> BT|BB"``, for example
    ``"3.74994 LU, 97.78162 BT"``. ``LU`` keeps the latitude positive and
    ``LS`` negates it; ``BT`` keeps the longitude positive and ``BB``
    negates it.

    Args:
        k: Raw coordinate cell. Anything that is not a string (e.g. NaN or
            None from an empty CSV field) is treated as unparseable.

    Returns:
        A two-element ``pd.Series`` ``[latitude, longitude]``. Both entries
        are ``None`` when the value is missing, does not match the expected
        layout, or contains a numeric part that ``float`` cannot convert.
    """
    # Non-string cells would make re.match raise TypeError; report them the
    # same way as any other unparseable value.
    if not isinstance(k, str):
        return pd.Series([None, None])

    # re.match anchors at the start, so tolerate leading whitespace.
    match = re.match(r"([\d\.\-]+)\s*(LU|LS),\s*([\d\.\-]+)\s*(BT|BB)", k.strip())
    if not match:
        return pd.Series([None, None])

    lat_text, lat_dir, lon_text, lon_dir = match.groups()
    try:
        lat = float(lat_text)
        lon = float(lon_text)
    except ValueError:
        # The character class accepts tokens such as "1.2.3" or "-" that are
        # not valid numbers.
        return pd.Series([None, None])

    lat *= 1 if lat_dir == "LU" else -1
    lon *= 1 if lon_dir == "BT" else -1
    return pd.Series([lat, lon])

# Parse every coordinate string, store the two parts as new columns and
# discard the original text column.
parsed_coords = df['koordinat'].apply(parse_koordinat)
df[['latitude', 'longitude']] = parsed_coords
df = df.drop(columns='koordinat')

In [12]:
# Re-inspect the schema after the column renaming and coordinate split above
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 207 entries, 0 to 206
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   nama            207 non-null    object 
 1   provinsi        207 non-null    object 
 2   kabupaten       207 non-null    object 
 3   kecamatan       207 non-null    object 
 4   ketinggian      207 non-null    int64  
 5   jenis gunung    207 non-null    object 
 6   status          207 non-null    object 
 7   akses           207 non-null    object 
 8   jarak           207 non-null    int64  
 9   elevation gain  207 non-null    int64  
 10  latitude        207 non-null    float64
 11  longitude       207 non-null    float64
dtypes: float64(2), int64(3), object(7)
memory usage: 19.5+ KB


In [13]:
# Lowercase the text values: object-dtype columns go through `.str.lower()`,
# every other column is passed through unchanged.
# NOTE(review): a column held in a dedicated pandas string dtype would not
# equal "object" and would be skipped - confirm against the pandas version in use.
df = df.apply(lambda column: column if column.dtype != "object" else column.str.lower())

In [15]:
# Preview the first rows of the frame after cleaning
df.head()

Unnamed: 0,nama,provinsi,kabupaten,kecamatan,ketinggian,jenis gunung,status,akses,jarak,elevation gain,latitude,longitude
0,gunung bandahara,nanggroe aceh darussalam,gayo lues,ketambe,3030,stratovolcano,tidak aktif,buka,23500,1900,3.74994,97.78162
1,gunung kemiri,nanggroe aceh darussalam,gayo lues,puteri betung,3315,non volcanic,tidak aktif,buka,20000,2100,3.762063,97.483192
2,gunung kurik,nanggroe aceh darussalam,aceh timur,serba jadi,3085,non volcanic,tidak aktif,buka,16000,1300,4.252266,97.419957
3,gunung mugajah,nanggroe aceh darussalam,aceh timur,serba jadi,2055,non volcanic,tidak aktif,buka,17500,1400,4.252522,97.420833
4,gunung burni telong,nanggroe aceh darussalam,bener meriah,timang gajah,2624,stratovolcano,aktif,buka,5100,989,4.817674,96.818949


In [None]:
# Save file
# Write the cleaned data to a separate file instead of back over the CSV that
# the load cell reads. Overwriting the input replaces the raw 'Jarak (km)' and
# 'Koordinat' columns, so a later Restart & Run All would fail at the drop and
# coordinate-parsing cells.
OUTPUT_PATH = 'gunung_indonesia_clean.csv'
df.to_csv(OUTPUT_PATH, index=False)
print(f"[DONE] File saved to '{OUTPUT_PATH}'")

[DONE] File saved to 'gunung_indonesia.csv'
