In [1]:

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.metrics import mean_squared_error, accuracy_score, classification_report
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from joblib import dump

# Load the crop dataset; the cells below read its N, P, K, temperature, humidity,
# ph and rainfall columns.
DATA_PATH = 'Crop_recommendation.csv'
df = pd.read_csv(DATA_PATH)

def compute_soil_score(row):
    """Combine soil and climate readings into one score limited to the range 0..100.

    Parameters
    ----------
    row : mapping-like
        Must support ``row[key]`` for 'N', 'P', 'K', 'ph', 'temperature',
        'humidity' and 'rainfall'.

    Returns
    -------
    The weighted sum of the seven terms below, passed through ``np.clip(…, 0, 100)``.
    """
    keys = ('N', 'P', 'K', 'ph', 'temperature', 'humidity', 'rainfall')
    nitrogen, phosphorus, potassium, ph, temperature, humidity, rainfall = (
        row[key] for key in keys
    )

    # Largest at pH 7; every pH unit away from 7 removes 10 points before weighting.
    ph_term = 100 - abs(ph - 7) * 10
    # Grows as the temperature reading gets lower.
    temperature_term = 100 - temperature

    # Nutrients and rainfall are weighted 0.15 each, the remaining terms 0.1 each.
    weighted_sum = (
        nitrogen * 0.15
        + phosphorus * 0.15
        + potassium * 0.15
        + ph_term * 0.1
        + temperature_term * 0.1
        + humidity * 0.1
        + rainfall * 0.15
    )
    return np.clip(weighted_sum, 0, 100)

# compute_soil_score uses only element-wise arithmetic, abs() and np.clip, so it can be
# given the whole frame at once: each row[key] lookup yields a column and the result is
# one score per row, without a Python-level call for every row.
df['soil_score'] = compute_soil_score(df)

def classify_fertility(score):
    """Map a soil score to a fertility label.

    Returns 'subur' for a score of at least 70, 'kurang_subur' for a score of at
    least 40, and 'tidak_subur' for anything else.
    """
    # Floors are checked from the highest down; the first one the score reaches wins.
    for floor, label in ((70, 'subur'), (40, 'kurang_subur')):
        if score >= floor:
            return label
    return 'tidak_subur'

from math import sqrt

# Label every row from its soil score.
df['fertility'] = df['soil_score'].map(classify_fertility)

# Phase 1: regress soil_score on the seven raw measurements.
X1 = df.loc[:, ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']]
y1 = df.loc[:, 'soil_score']

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
X1_train, X1_test, y1_train, y1_test = train_test_split(
    X1, y1, test_size=0.2, random_state=42
)

# fit() returns the estimator itself, so construction and fitting can be chained.
reg = RandomForestRegressor(n_estimators=200, random_state=42).fit(X1_train, y1_train)
pred1 = reg.predict(X1_test)

# Root-mean-squared error on the held-out rows.
rmse = sqrt(mean_squared_error(y1_test, pred1))


# `location_detail` is synthetic: every row is assigned one of five land-use types at
# random, independently of the soil measurements. The draw comes from a seeded generator
# so that re-running the notebook rebuilds the same column, and with it the same
# train/test rows, saved classifier and reported metrics.
location_types = ['pekarangan','sawah-irigasi','sawah-tadah_hujan','kebun-dataran-tinggi','kebun-dataran-rendah']
df['location_detail'] = np.random.default_rng(42).choice(location_types, size=len(df))

# One-hot encode the location; dense output so it can be stacked next to the score.
enc = OneHotEncoder(sparse_output=False)
loc_encoded = enc.fit_transform(df[['location_detail']])

# Classifier input: soil_score in column 0, followed by the one-hot location columns.
# NOTE(review): `fertility` is produced from `soil_score` by fixed thresholds, so the
# label is recoverable from column 0 alone; accuracy close to 1.0 is expected by
# construction and is not evidence that the location adds information.
X2 = np.column_stack([df['soil_score'], loc_encoded])
y2 = df['fertility']

# Hold out 20% of the rows with a fixed random_state.
X2_train,X2_test,y2_train,y2_test = train_test_split(X2,y2,test_size=0.2,random_state=42)

clf = RandomForestClassifier(n_estimators=200,random_state=42)
clf.fit(X2_train,y2_train)
pred2 = clf.predict(X2_test)

# Held-out accuracy plus the per-class precision/recall table.
accuracy = accuracy_score(y2_test,pred2)
report = classification_report(y2_test,pred2)

# Persist the fitted objects under the file names that the inference cell loads.
for fitted_object, file_name in (
    (reg, 'model_phase1_rf.joblib'),
    (enc, 'encoder_loc.joblib'),
    (clf, 'model_phase2_rf.joblib'),
):
    dump(fitted_object, file_name)

# Report the held-out metrics of both phases.
print("Phase1 RMSE:", rmse)
print("Phase2 Accuracy:", accuracy)
print(report)


# Number of rows in each fertility class.
print("\nJumlah masing-masing kategori kesuburan:")
print(df['fertility'].value_counts())

# First five rows of each fertility class, printed in the order subur,
# kurang_subur, tidak_subur.
print("\nContoh data tanah:")
for heading, fertility_label in (
    ("\nContoh subur:", 'subur'),
    ("\nContoh kurang subur:", 'kurang_subur'),
    ("\nContoh tidak subur:", 'tidak_subur'),
):
    print(heading)
    print(df[df['fertility'] == fertility_label].head())


Phase1 RMSE: 1.3311200205096092
Phase2 Accuracy: 0.9977272727272727
              precision    recall  f1-score   support

kurang_subur       1.00      1.00      1.00       311
       subur       1.00      1.00      1.00       122
 tidak_subur       1.00      0.86      0.92         7

    accuracy                           1.00       440
   macro avg       1.00      0.95      0.97       440
weighted avg       1.00      1.00      1.00       440


Jumlah masing-masing kategori kesuburan:
fertility
kurang_subur    1560
subur            613
tidak_subur       27
Name: count, dtype: int64

Contoh data tanah:

Contoh subur:
    N   P   K  temperature   humidity        ph    rainfall label  soil_score  \
0  90  42  43    20.879744  82.002744  6.502985  202.935536  rice   82.305616   
1  85  58  41    21.770462  80.319644  7.038096  226.655537  rice   87.415152   
2  60  55  44    23.004459  82.320763  7.840207  263.964248  rice   88.536060   
3  74  35  40    26.491096  80.158363  6.980401  24

In [2]:
from joblib import load
import numpy as np

# Load the saved objects in this order: phase-1 regressor, location encoder,
# phase-2 classifier.
# NOTE: joblib files are pickle-based; only load files produced by a trusted run.
reg, enc, clf = [
    load(file_name)
    for file_name in (
        'model_phase1_rf.joblib',
        'encoder_loc.joblib',
        'model_phase2_rf.joblib',
    )
]

# Crop recommendations keyed by fertility label.
recommendations = {
    "subur": [
        "Padi", "Jagung", "Kedelai", "Kopi", "Teh", "Pisang",
    ],
    "kurang_subur": [
        "Singkong", "Ubi Jalar", "Sorgum", "Kacang Hijau",
    ],
    "tidak_subur": [
        "Lidah buaya", "Tebu", "Jarak pagar", "Agave",
    ],
}

def prediksi_tanah(input_data):
    """Predict the soil score, fertility class and recommended crops for one sample.

    Parameters
    ----------
    input_data : mapping
        Must provide 'N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall'
        and 'location_detail'.

    Returns
    -------
    tuple
        ``(score, fertility_class, tanaman)``: the score predicted by ``reg``, the
        label predicted by ``clf``, and the ``recommendations`` entry for that label.

    Raises
    ------
    KeyError
        If ``input_data`` lacks one of the keys above, or if the predicted label
        has no entry in ``recommendations``.

    Notes
    -----
    Reads the module-level ``reg``, ``enc``, ``clf`` and ``recommendations``.
    """
    # Local import: the imports of this cell do not include pandas.
    import pandas as pd

    # Phase 1: the regressor was fitted on a DataFrame, so it is given a one-row
    # DataFrame with the same column names in the same order. A bare array makes
    # scikit-learn warn about missing feature names and skips its column-name check.
    feature_names = ['N', 'P', 'K', 'temperature', 'humidity', 'ph', 'rainfall']
    X1 = pd.DataFrame(
        [[input_data[name] for name in feature_names]],
        columns=feature_names,
    )
    score = reg.predict(X1)[0]

    # Encode the location; the encoder was also fitted on a DataFrame whose single
    # column is named 'location_detail'.
    loc = enc.transform(
        pd.DataFrame({'location_detail': [input_data['location_detail']]})
    )

    # Phase 2: the classifier was fitted on a plain array laid out as
    # [score, one-hot location columns...], so the same positional layout is used here.
    X2 = np.column_stack([score, loc])
    fertility_class = clf.predict(X2)[0]

    # Look up the crops recommended for the predicted class.
    tanaman = recommendations[fertility_class]

    return score, fertility_class, tanaman


In [3]:
# Example input for a single prediction.
data_saya = {
    'N': 71, 'P': 54, 'K': 16,
    'temperature': 28, 'humidity': 70, 'ph': 6.3, 'rainfall': 150,
    'location_detail': 'sawah-tadah_hujan',
}

score, kategori, rekomendasi = prediksi_tanah(data_saya)

# Print the score rounded to two decimals, then the class and its crop list.
for caption, value in (
    ("Skor Kesuburan :", round(score,2)),
    ("Kategori Tanah :", kategori),
    ("Tanaman Rekomendasi :", rekomendasi),
):
    print(caption, value)


Skor Kesuburan : 69.78
Kategori Tanah : kurang_subur
Tanaman Rekomendasi : ['Singkong', 'Ubi Jalar', 'Sorgum', 'Kacang Hijau']


