In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!mkdir ~/.kaggle

# copying api file
!cp drive/MyDrive/kaggle.json ~/.kaggle/
# modding permission
!chmod 600 ~/.kaggle/kaggle.json

In [None]:
# downloading data using api command from the site
!kaggle datasets download -d ardikasatria/datasettanamanpadisumatera
# unzip the downloaded file
!unzip datasettanamanpadisumatera.zip

Downloading datasettanamanpadisumatera.zip to /content
  0% 0.00/4.75k [00:00<?, ?B/s]
100% 4.75k/4.75k [00:00<00:00, 7.57MB/s]
Archive:  datasettanamanpadisumatera.zip
  inflating: Data_Tanaman_Padi_Sumatera_version_1.csv  


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

In [None]:
data = pd.read_csv('/content/Data_Tanaman_Padi_Sumatera_version_1.csv')
le = LabelEncoder()
data['Provinsi'] = le.fit_transform(data['Provinsi'])
data.head()

Unnamed: 0,Provinsi,Tahun,Produksi,Luas Panen,Curah hujan,Kelembapan,Suhu rata-rata
0,0,1993,1329536.0,323589.0,1627.0,82.0,26.06
1,0,1994,1299699.0,329041.0,1521.0,82.12,26.92
2,0,1995,1382905.0,339253.0,1476.0,82.72,26.27
3,0,1996,1419128.0,348223.0,1557.0,83.0,26.08
4,0,1997,1368074.0,337561.0,1339.0,82.46,26.31


In [None]:
x = data[['Provinsi','Tahun','Luas Panen','Curah hujan','Kelembapan','Suhu rata-rata']]
y = data[['Produksi']]
column = x.columns

In [None]:
x

Unnamed: 0,Provinsi,Tahun,Luas Panen,Curah hujan,Kelembapan,Suhu rata-rata
0,0,1993,323589.00,1627.0,82.00,26.06
1,0,1994,329041.00,1521.0,82.12,26.92
2,0,1995,339253.00,1476.0,82.72,26.27
3,0,1996,348223.00,1557.0,83.00,26.08
4,0,1997,337561.00,1339.0,82.46,26.31
...,...,...,...,...,...,...
219,3,2016,390799.00,2317.6,79.40,26.45
220,3,2017,396559.00,1825.1,77.04,26.36
221,3,2018,511940.93,1385.8,76.05,25.50
222,3,2019,464103.42,1706.4,78.03,27.23


Base Accuracy

In [79]:
scaler = MinMaxScaler()
mlr = LinearRegression()

pipeline = Pipeline([('scaler', scaler), ('model', mlr)])

cv = KFold(n_splits =5, shuffle=True, random_state=100)
R2_scores = cross_val_score(pipeline, x, y, cv= cv)
MAE_scores = cross_val_score(pipeline, x, y, cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")

print("")

print(f"R2 : {R2_scores.mean()*100}")
print(f"MAE : {MAE_scores.mean()}")
print(f"MSE : {MSE_scores.mean()}")
print(f"RMSE : {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.853297097481976, MAE = -259025.2947148299, MSE = -213020576030.01724, RMSE = -461541.52145827276
Iteration 2:
Score: R2 0.7996535199685146, MAE = -233138.15386785477, MSE = -202719459801.82156, RMSE = -450243.7781933489
Iteration 3:
Score: R2 0.8856565929620946, MAE = -230493.51833843614, MSE = -170990746500.56268, RMSE = -413510.27375454974
Iteration 4:
Score: R2 0.8846085449867995, MAE = -239998.23555130404, MSE = -176579504387.40225, RMSE = -420213.64136282186
Iteration 5:
Score: R2 0.8674402317203311, MAE = -240957.84855521505, MSE = -126261133398.86568, RMSE = -355332.42660762847

R2 : 85.81311974239432
MAE : -240722.610205528
MSE : -177914284023.7339
RMSE : -420168.3282753243


Weather Variables

In [80]:
scaler = MinMaxScaler()
mlr = LinearRegression()
pca = PCA(n_components=1)

pipeline = Pipeline([('scaler', scaler), ('ct', ColumnTransformer([("pca", pca, [3, 4, 5]), ("pass", "passthrough", [0, 1, 2])])), ('model', mlr)])

cv = KFold(n_splits =5, shuffle=True, random_state=100)
R2_scores = cross_val_score(pipeline, x, y, cv= cv)
MAE_scores = cross_val_score(pipeline, x, y, cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")

print("")

print(f"R2 : {R2_scores.mean()*100}")
print(f"MAE : {MAE_scores.mean()}")
print(f"MSE : {MSE_scores.mean()}")
print(f"RMSE : {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.8690497506757265, MAE = -249611.0956142631, MSE = -190146868695.4159, RMSE = -436058.3317578233
Iteration 2:
Score: R2 0.7950951735016023, MAE = -235373.8231995574, MSE = -207331797054.84845, RMSE = -455337.0148086453
Iteration 3:
Score: R2 0.887324020477758, MAE = -233069.39730666412, MSE = -168497251833.70932, RMSE = -410484.1675798341
Iteration 4:
Score: R2 0.8901433873139691, MAE = -235948.56649265028, MSE = -168109728918.4803, RMSE = -410011.8643630697
Iteration 5:
Score: R2 0.8680924532049606, MAE = -237787.84526411025, MSE = -125639902500.94585, RMSE = -354457.1941729295

R2 : 86.19409570348033
MAE : -238358.14557544905
MSE : -171945109800.67993
RMSE : -413269.7145364604


All Variables

In [81]:
scaler = MinMaxScaler()
mlr = LinearRegression()
pca = PCA(n_components=1)

pipeline = Pipeline([('scaler', scaler), ("pca", pca), ('model', mlr)])

cv = KFold(n_splits=5, shuffle=True, random_state=100)
R2_scores = cross_val_score(pipeline, x, y, cv= cv)
MAE_scores = cross_val_score(pipeline, x, y, cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")

print("")

print(f"R2 : {R2_scores.mean()*100}")
print(f"MAE : {MAE_scores.mean()}")
print(f"MSE : {MSE_scores.mean()}")
print(f"RMSE : {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.672628230949819, MAE = -502944.2381922432, MSE = -475361575143.1189, RMSE = -689464.7018833661
Iteration 2:
Score: R2 0.49040607595337293, MAE = -499307.47068228805, MSE = -515629747948.5937, RMSE = -718073.6368566902
Iteration 3:
Score: R2 0.6568573442800998, MAE = -593737.1332696192, MSE = -513140375800.42145, RMSE = -716338.1713970165
Iteration 4:
Score: R2 0.6063864259768729, MAE = -583480.7649703007, MSE = -602333074084.2709, RMSE = -776101.2009295379
Iteration 5:
Score: R2 0.45997031530057453, MAE = -610008.9069669197, MSE = -514369940020.7781, RMSE = -717195.8867846205

R2 : 57.72496784921477
MAE : -557895.7028162741
MSE : -524166942599.4365
RMSE : -723434.7195702463
