In [7]:
import csv
import glob

import joblib
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder

In [8]:
# Load every per-country CSV found in the working directory.
# - "populasi.csv" is the merged file this notebook itself writes further down;
#   it matches "*.csv" too, so it is skipped here. Otherwise a second
#   Restart & Run All would ingest its own output and duplicate the rows.
# - glob returns matches in arbitrary, OS-dependent order; sorting keeps the row
#   order of the merged frame (and therefore the seeded train/test split) stable.
csv_files = sorted(path for path in glob.glob("*.csv") if path != "populasi.csv")
data_frames = []
for file in csv_files:
    df = pd.read_csv(file)
    data_frames.append(df)
data_frames

[   Tahun Populasi(01 Jan) Perubahan Alam Tingkat pertumbuhan(%)  \
 0   2024      115.371.047        941.428                 0,816%   
 1   2023      114.411.351        955.335                 0,835%   
 2   2022      113.517.326        889.976                 0,784%   
 3   2021      112.684.574        829.358                 0,736%   
 4   2020      111.477.954      1.200.618                 1,077%   
 5   2019      110.131.413      1.338.097                 1,215%   
 6   2018      108.799.160      1.324.086                 1,217%   
 7   2017      107.440.227      1.350.524                 1,257%   
 8   2016      106.031.211      1.399.612                  1,32%   
 9   2015      104.594.772      1.426.673                 1,364%   
 
   Jumlah penduduk(31 December) Nama Negara  
 0                  116.312.475    Filipina  
 1                  115.366.686    Filipina  
 2                  114.407.302    Filipina  
 3                  113.513.932    Filipina  
 4                  

In [9]:
# Stack the per-file tables into one frame with a fresh 0..n-1 index, then
# shorten the population header to the name used by the following cells.
# rename() skips labels that are not present, so no existence check is needed.
merged_df = pd.concat(data_frames, ignore_index=True).rename(
    columns={'Populasi(01 Jan)': 'Populasi'}
)

In [10]:
# Write the combined table (values still in their raw text form) to the working
# directory without the row index, then preview the first five rows.
merged_csv_filename = "populasi.csv"
merged_df.to_csv(path_or_buf=merged_csv_filename, index=False)
merged_df.head(n=5)

Unnamed: 0,Tahun,Populasi,Perubahan Alam,Tingkat pertumbuhan(%),Jumlah penduduk(31 December),Nama Negara
0,2024,115.371.047,941.428,"0,816%",116.312.475,Filipina
1,2023,114.411.351,955.335,"0,835%",115.366.686,Filipina
2,2022,113.517.326,889.976,"0,784%",114.407.302,Filipina
3,2021,112.684.574,829.358,"0,736%",113.513.932,Filipina
4,2020,111.477.954,1.200.618,"1,077%",112.678.572,Filipina


In [11]:
# Keep only the columns used for modelling, discard rows with missing values,
# and store the year as an integer; then preview the last rows.
merged_df = (
    merged_df[['Tahun', 'Populasi', 'Nama Negara']]
    .dropna()
    .astype({'Tahun': int})
)
merged_df.tail()

Unnamed: 0,Tahun,Populasi,Nama Negara
45,2019,33.184.239,Malaysia
46,2018,32.637.695,Malaysia
47,2017,32.073.594,Malaysia
48,2016,31.505.777,Malaysia
49,2015,30.959.818,Malaysia


In [12]:
# Convert the population column from text such as "115.371.047" to an integer.
# NOTE(review): assumes '.' is only ever a thousands separator in this column,
# as in the previews above -- every non-digit character is simply removed.
populasi_digits = (
    merged_df['Populasi'].astype(str).str.replace(r'[^0-9]', '', regex=True)
)
# Rows with no digits at all end up as empty strings; isnumeric() is False for
# those, so they are dropped.
has_digits = populasi_digits.str.isnumeric()
# Build the cleaned frame in one step with .loc + .assign instead of assigning a
# column onto a filtered frame, which raises SettingWithCopyWarning on pandas < 3
# as soon as the filter removes a row. int64 is spelled out because plain `int`
# can map to a 32-bit integer on some platforms.
merged_df = merged_df.loc[has_digits].assign(
    Populasi=populasi_digits.loc[has_digits].astype('int64')
)
merged_df.head()

Unnamed: 0,Tahun,Populasi,Nama Negara
0,2024,115371047,Filipina
1,2023,114411351,Filipina
2,2022,113517326,Filipina
3,2021,112684574,Filipina
4,2020,111477954,Filipina


In [13]:
# One-hot encode the country name. drop_first=True omits the indicator column of
# the first category, so that country is encoded as "all indicators False".
merged_df = pd.get_dummies(
    data=merged_df,
    columns=['Nama Negara'],
    drop_first=True,
)

In [14]:
# Target is the population column; the features are every remaining column.
y = merged_df['Populasi']
X = merged_df.drop('Populasi', axis='columns')
X, y

(    Tahun  Nama Negara_Indonesia  Nama Negara_Jepang  Nama Negara_Korea  \
 0    2024                  False               False              False   
 1    2023                  False               False              False   
 2    2022                  False               False              False   
 3    2021                  False               False              False   
 4    2020                  False               False              False   
 5    2019                  False               False              False   
 6    2018                  False               False              False   
 7    2017                  False               False              False   
 8    2016                  False               False              False   
 9    2015                  False               False              False   
 10   2024                   True               False              False   
 11   2023                   True               False              False   
 12   2022  

In [16]:
# Hold out 20% of the rows for evaluation (fixed seed so the split is
# repeatable) and fit an ordinary least-squares model on the rest.
model = LinearRegression()
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)
model.fit(X_train, y_train)

In [17]:
# Score the fitted model on the held-out test split.
y_pred = model.predict(X_test)
mae = mean_absolute_error(y_test, y_pred)
mse = mean_squared_error(y_test, y_pred)  # squared target units, hence the very large magnitude
r2 = r2_score(y_test, y_pred)  # coefficient of determination (R^2)

# NOTE(review): the last label reads "R Score" but the value printed is R^2 from
# r2_score; the text is left as-is so it keeps matching the recorded output --
# confirm whether the label should say "R2".
print("Model Evaluation:")
print(f"Mean Absolute Error (MAE): {mae:,.2f}")
print(f"Mean Squared Error (MSE): {mse:,.2f}")
print(f"R Score: {r2:.4f}")

Model Evaluation:
Mean Absolute Error (MAE): 2,386,033.37
Mean Squared Error (MSE): 14,314,268,161,741.41
R Score: 0.9985


In [18]:
# Persist the fitted estimator, and separately the feature column labels of X,
# so the same column layout can be rebuilt when the model is loaded elsewhere.
joblib.dump(value=model, filename="model_populasi.pkl")
joblib.dump(value=X.columns, filename="model_features.pkl")

['model_features.pkl']