In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!mkdir ~/.kaggle

# copying api file
!cp drive/MyDrive/kaggle.json ~/.kaggle/
# modding permission
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
# downloading data using api command from the site
!kaggle datasets download -d ardikasatria/datasettanamanpadisumatera
# unzip the downloaded file
!unzip datasettanamanpadisumatera.zip

Downloading datasettanamanpadisumatera.zip to /content
  0% 0.00/4.75k [00:00<?, ?B/s]
100% 4.75k/4.75k [00:00<00:00, 17.8MB/s]
Archive:  datasettanamanpadisumatera.zip
  inflating: Data_Tanaman_Padi_Sumatera_version_1.csv  


In [4]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.decomposition import PCA
from sklearn.compose import ColumnTransformer

In [5]:
data = pd.read_csv('/content/Data_Tanaman_Padi_Sumatera_version_1.csv')
le = LabelEncoder()
data['Provinsi'] = le.fit_transform(data['Provinsi'])
data.head()

Unnamed: 0,Provinsi,Tahun,Produksi,Luas Panen,Curah hujan,Kelembapan,Suhu rata-rata
0,0,1993,1329536.0,323589.0,1627.0,82.0,26.06
1,0,1994,1299699.0,329041.0,1521.0,82.12,26.92
2,0,1995,1382905.0,339253.0,1476.0,82.72,26.27
3,0,1996,1419128.0,348223.0,1557.0,83.0,26.08
4,0,1997,1368074.0,337561.0,1339.0,82.46,26.31


In [6]:
x = data[['Provinsi','Tahun','Luas Panen','Curah hujan','Kelembapan','Suhu rata-rata']]
y = data[['Produksi']]
column = x.columns

In [None]:
x

Unnamed: 0,Provinsi,Tahun,Luas Panen,Curah hujan,Kelembapan,Suhu rata-rata
0,0,1993,323589.00,1627.0,82.00,26.06
1,0,1994,329041.00,1521.0,82.12,26.92
2,0,1995,339253.00,1476.0,82.72,26.27
3,0,1996,348223.00,1557.0,83.00,26.08
4,0,1997,337561.00,1339.0,82.46,26.31
...,...,...,...,...,...,...
219,3,2016,390799.00,2317.6,79.40,26.45
220,3,2017,396559.00,1825.1,77.04,26.36
221,3,2018,511940.93,1385.8,76.05,25.50
222,3,2019,464103.42,1706.4,78.03,27.23


Base Accuracy

In [12]:
scaler = MinMaxScaler()
mlr = LinearRegression()

pipeline = Pipeline([('scaler', scaler), ('model', mlr)])

cv = KFold(n_splits =5, shuffle=True, random_state=65)
R2_scores = cross_val_score(pipeline, x, y, cv= cv)
MAE_scores = cross_val_score(pipeline, x, y, cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")

print("")

print(f"R2 : {R2_scores.mean()*100}")
print(f"MAE : {MAE_scores.mean()}")
print(f"MSE : {MSE_scores.mean()}")
print(f"RMSE : {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.770667419545164, MAE = -275668.263195625, MSE = -302530133489.5125, RMSE = -550027.3933991947
Iteration 2:
Score: R2 0.9448777843847342, MAE = -193516.80833341397, MSE = -53577247642.12107, RMSE = -231467.5952312139
Iteration 3:
Score: R2 0.9480093733783213, MAE = -199440.15252271827, MSE = -59772967145.49365, RMSE = -244485.1061833699
Iteration 4:
Score: R2 0.7765525265983151, MAE = -351043.5135487369, MSE = -432043476267.9038, RMSE = -657300.1416916809
Iteration 5:
Score: R2 0.9446662248361716, MAE = -208637.4291088408, MSE = -66852101489.625, RMSE = -258557.73337810842

R2 : 87.69546657485412
MAE : -245661.23334186702
MSE : -182955185206.9312
RMSE : -388367.5939767136


Weather Variables

In [11]:
scaler = MinMaxScaler()
mlr = LinearRegression()
pca = PCA(n_components=1)

pipeline = Pipeline([('scaler', scaler), ('ct', ColumnTransformer([("pca", pca, [3, 4, 5]), ("pass", "passthrough", [0, 1, 2])])), ('model', mlr)])

cv = KFold(n_splits =5, shuffle=True, random_state=65)
R2_scores = cross_val_score(pipeline, x, y, cv= cv)
MAE_scores = cross_val_score(pipeline, x, y, cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")

print("")

print(f"R2 : {R2_scores.mean()*100}")
print(f"MAE : {MAE_scores.mean()}")
print(f"MSE : {MSE_scores.mean()}")
print(f"RMSE : {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.7707538093864055, MAE = -280577.1693060669, MSE = -302416170047.6366, RMSE = -549923.7856718298
Iteration 2:
Score: R2 0.9429927296932636, MAE = -196262.59835915078, MSE = -55409467934.73014, RMSE = -235392.15775962066
Iteration 3:
Score: R2 0.9460020522941373, MAE = -202334.40393817896, MSE = -62080758857.420334, RMSE = -249160.10687391416
Iteration 4:
Score: R2 0.7735502144978184, MAE = -349792.3394485945, MSE = -437848551335.403, RMSE = -661701.2553527483
Iteration 5:
Score: R2 0.9438514631182556, MAE = -205566.194477872, MSE = -67836464708.91939, RMSE = -260454.3428490287

R2 : 87.5430053797976
MAE : -246906.5411059726
MSE : -185118282576.82193
RMSE : -391326.32970142836


All Variables

In [10]:
scaler = MinMaxScaler()
mlr = LinearRegression()
pca = PCA(n_components=1)

pipeline = Pipeline([('scaler', scaler), ("pca", pca), ('model', mlr)])

cv = KFold(n_splits=5, shuffle=True, random_state=65)
R2_scores = cross_val_score(pipeline, x, y, cv= cv)
MAE_scores = cross_val_score(pipeline, x, y, cv = cv,scoring="neg_mean_absolute_error")
MSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_mean_squared_error")
RMSE_scores = cross_val_score(pipeline, x, y, cv = cv, scoring="neg_root_mean_squared_error")

for i, (train_index, test_index) in enumerate(cv.split(x)):
  print(f"Iteration {i+1}:")
  print(f"Score: R2 {R2_scores[i]}, MAE = {MAE_scores[i]}, MSE = {MSE_scores[i]}, RMSE = {RMSE_scores[i]}")

print("")

print(f"R2 : {R2_scores.mean()*100}")
print(f"MAE : {MAE_scores.mean()}")
print(f"MSE : {MSE_scores.mean()}")
print(f"RMSE : {RMSE_scores.mean()}")

Iteration 1:
Score: R2 0.47243679740114786, MAE = -636443.6216498532, MSE = -695948939264.7257, RMSE = -834235.5418373912
Iteration 2:
Score: R2 0.6446542397591752, MAE = -489668.6119411905, MSE = -345386113067.0494, RMSE = -587695.595582483
Iteration 3:
Score: R2 0.7363712491362864, MAE = -421653.0138607513, MSE = -303090647063.17004, RMSE = -550536.6900245342
Iteration 4:
Score: R2 0.5105536503918616, MAE = -727538.5574321877, MSE = -946361572642.1606, RMSE = -972811.1700850071
Iteration 5:
Score: R2 0.6907691102765381, MAE = -515336.2345254831, MSE = -373600658229.3274, RMSE = -611228.810045246

R2 : 61.09570093930018
MAE : -558128.0078818931
MSE : -532877586053.28674
RMSE : -711301.5615149323
