In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!mkdir ~/.kaggle

# copying api file
!cp drive/MyDrive/kaggle.json ~/.kaggle/
# modding permission
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# downloading data using api command from the site
!kaggle datasets download -d ardikasatria/datasettanamanpadisumatera
# unzip the downloaded file
!unzip datasettanamanpadisumatera.zip

Downloading datasettanamanpadisumatera.zip to /content
  0% 0.00/4.75k [00:00<?, ?B/s]
100% 4.75k/4.75k [00:00<00:00, 11.3MB/s]
Archive:  datasettanamanpadisumatera.zip
  inflating: Data_Tanaman_Padi_Sumatera_version_1.csv  


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.datasets import make_regression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

In [5]:
data = pd.read_csv('/content/Data_Tanaman_Padi_Sumatera_version_1.csv')
le = LabelEncoder()
data['Provinsi'] = le.fit_transform(data['Provinsi'])
data.head()

Unnamed: 0,Provinsi,Tahun,Produksi,Luas Panen,Curah hujan,Kelembapan,Suhu rata-rata
0,0,1993,1329536.0,323589.0,1627.0,82.0,26.06
1,0,1994,1299699.0,329041.0,1521.0,82.12,26.92
2,0,1995,1382905.0,339253.0,1476.0,82.72,26.27
3,0,1996,1419128.0,348223.0,1557.0,83.0,26.08
4,0,1997,1368074.0,337561.0,1339.0,82.46,26.31


In [6]:
x = data[['Provinsi','Tahun','Luas Panen','Curah hujan','Kelembapan','Suhu rata-rata']]
y = data[['Produksi']]
column = x.columns

In [7]:
x

Unnamed: 0,Provinsi,Tahun,Luas Panen,Curah hujan,Kelembapan,Suhu rata-rata
0,0,1993,323589.00,1627.0,82.00,26.06
1,0,1994,329041.00,1521.0,82.12,26.92
2,0,1995,339253.00,1476.0,82.72,26.27
3,0,1996,348223.00,1557.0,83.00,26.08
4,0,1997,337561.00,1339.0,82.46,26.31
...,...,...,...,...,...,...
219,3,2016,390799.00,2317.6,79.40,26.45
220,3,2017,396559.00,1825.1,77.04,26.36
221,3,2018,511940.93,1385.8,76.05,25.50
222,3,2019,464103.42,1706.4,78.03,27.23


Model and Standarization

In [8]:
scaler = MinMaxScaler()
cv = KFold(n_splits =5, shuffle=True, random_state=100)
mlr = RandomForestRegressor(n_jobs=1, n_estimators=150, max_depth=None)

In [9]:
pipeline = Pipeline([('scaler', scaler), ('model', mlr)])
R2_scores = cross_val_score(pipeline, x, y.values.ravel(), cv= cv)
print(f"R2 = {R2_scores.mean()}")

R2 = 0.8509748821824811


Base Accuracy

In [10]:
pipeline = Pipeline([('scaler', scaler), ('model', mlr)])

R2_scores = cross_val_score(pipeline, x, y.values.ravel(), cv= cv)
MAE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")
print("")
print("Final Scores:")
print(f"R2 = {R2_scores.mean()}")
print(f"MAE = {MAE_scores.mean()}")
print(f"MSE = {MSE_scores.mean()}")
print(f"RMSE = {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.9299728431114056, MAE = -146191.7739274073, MSE = -97135350813.81395, RMSE = -327249.9288934238
Iteration 2:
Score: R2 0.797663844681483, MAE = -200569.17666074078, MSE = -202114002838.44522, RMSE = -454691.32917257
Iteration 3:
Score: R2 0.9431190924922809, MAE = -192734.23156592596, MSE = -85392321040.87807, RMSE = -305969.7870887715
Iteration 4:
Score: R2 0.8705895727541355, MAE = -186031.61860740744, MSE = -175803576482.82602, RMSE = -432497.60050196416
Iteration 5:
Score: R2 0.7110587650751377, MAE = -228066.860810606, MSE = -288666353414.2794, RMSE = -528924.87701707

Final Scores:
R2 = 0.8504808236228886
MAE = -190718.73231441752
MSE = -169822320918.04852
RMSE = -409866.7045347599


Weather Variables

In [12]:
pca = PCA(n_components=1)

pipeline = Pipeline([('scaler', scaler), ('ct', ColumnTransformer([("pca", pca, [3, 4, 5]), ("pass", "passthrough", [0, 1, 2])])), ('model', mlr)])

R2_scores = cross_val_score(pipeline, x, y.values.ravel(), cv= cv)
MAE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")

print("")
print("Final Scores:")
print(f"R2 = {R2_scores.mean()}")
print(f"MAE = {MAE_scores.mean()}")
print(f"MSE = {MSE_scores.mean()}")
print(f"RMSE = {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.9050783220882905, MAE = -180040.4054459258, MSE = -134066718547.19803, RMSE = -382585.27885057515
Iteration 2:
Score: R2 0.7986544934008413, MAE = -177340.57950518528, MSE = -201806152963.0957, RMSE = -447831.79913294385
Iteration 3:
Score: R2 0.9492899634017194, MAE = -168357.80975259264, MSE = -80272341336.70891, RMSE = -286309.8301900002
Iteration 4:
Score: R2 0.9065769193984878, MAE = -168599.62765629622, MSE = -158432600855.06104, RMSE = -392870.0158557025
Iteration 5:
Score: R2 0.7026507369592461, MAE = -230196.89050454533, MSE = -295469445695.40375, RMSE = -529912.4312883064

Final Scores:
R2 = 0.8524500870497171
MAE = -184907.06257290905
MSE = -174009451879.4935
RMSE = -407901.87106350565


All Variables

In [13]:
pca = PCA(n_components=1)

pipeline = Pipeline([('scaler', scaler), ("pca", pca), ('model', mlr)])

R2_scores = cross_val_score(pipeline, x, y.values.ravel(), cv= cv)
MAE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y.values.ravel(), cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")
print("")
print("Final Scores:")
print(f"R2 = {R2_scores.mean()}")
print(f"MAE = {MAE_scores.mean()}")
print(f"MSE = {MSE_scores.mean()}")
print(f"RMSE = {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.700022251290545, MAE = -433540.62780592625, MSE = -435919340218.7634, RMSE = -658811.6658689367
Iteration 2:
Score: R2 0.3667490597886003, MAE = -515396.27492888883, MSE = -609207620259.3997, RMSE = -781933.5353463787
Iteration 3:
Score: R2 0.6528262174175993, MAE = -470827.09865777765, MSE = -519935080301.50305, RMSE = -718689.4868471078
Iteration 4:
Score: R2 0.4628297754499535, MAE = -469462.7993229631, MSE = -815399485696.6329, RMSE = -908879.5383986863
Iteration 5:
Score: R2 0.8057532531266167, MAE = -214050.8303227273, MSE = -190269273914.7413, RMSE = -429771.5344263294

Final Scores:
R2 = 0.5976361114146629
MAE = -420655.5262076566
MSE = -514146160078.208
RMSE = -699617.1521774877
