In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error
from xgboost import XGBRegressor
import pickle

# Read the cleaned dataset from its CSV file
data = pd.read_csv('../data/cleaned_data.csv')

# Preview the first rows as loaded, before any type coercion
print("First few rows of the dataset:")
print(data.head())

# Coerce every column to a numeric dtype (values that cannot be parsed become
# NaN), then discard each row that contains at least one NaN
data = data.apply(pd.to_numeric, errors='coerce').dropna()

# Input columns used for training; the spellings match the dataset's own headers
features = [
    'Rain Fall (mm)',
    'Fertilizer',
    'Temperatue',
    'Nitrogen (N)',
    'Phosphorus (P)',
    'Potassium (K)',
]
# Column the models learn to predict
target = 'Yeild (Q/acre)'

# Separate the inputs from the value to predict
X, y = data[features], data[target]

# Keep 30% of the rows aside for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Standardize the features: the scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Fit a 100-tree random forest (seeded) on the scaled training features
rf_model = RandomForestRegressor(n_estimators=100, random_state=42).fit(
    X_train_scaled, y_train
)

# Fit an XGBoost regressor with the squared-error objective on the same data
xgb_model = XGBRegressor(objective='reg:squarederror').fit(X_train_scaled, y_train)

# Produce predictions for the scaled test features with each fitted model
y_pred_rf, y_pred_xgb = (
    rf_model.predict(X_test_scaled),
    xgb_model.predict(X_test_scaled),
)

# Evaluate both models against y_test and print MAE, MSE and RMSE for each.
#
# RMSE is computed as the square root of the MSE instead of calling
# mean_squared_error(..., squared=False): the `squared` keyword was deprecated
# in scikit-learn 1.4 and removed in 1.6, where passing it raises a TypeError.
# Taking the root ourselves works on every scikit-learn version and avoids
# computing the MSE twice per model.
for _heading, _y_pred in (
    ("RandomForestRegressor Metrics:", y_pred_rf),
    ("\nXGBRegressor Metrics:", y_pred_xgb),
):
    print(_heading)
    print("Mean Absolute Error:", mean_absolute_error(y_test, _y_pred))
    _mse = mean_squared_error(y_test, _y_pred)
    print("Mean Squared Error:", _mse)
    print("Root Mean Squared Error:", _mse ** 0.5)

# Persist the fitted scaler and both models as pickle files so they can be
# reloaded later without retraining
for artifact_path, artifact in (
    ('scaler.pkl', scaler),
    ('rf_model.pkl', rf_model),
    ('xgb_model.pkl', xgb_model),
):
    with open(artifact_path, 'wb') as f:
        pickle.dump(artifact, f)

def predict_yield(rain_fall, fertilizer, temperature, nitrogen, phosphorus, potassium):
    """Predict yield for one set of inputs using the pickled scaler and models.

    On every call, 'scaler.pkl', 'rf_model.pkl' and 'xgb_model.pkl' are read
    from the current working directory. The six arguments are placed into a
    one-row DataFrame whose columns come from the module-level ``features``
    list (arguments must therefore be given in that column order), the row is
    passed through the loaded scaler, and both loaded models predict on the
    scaled row.

    Returns:
        A 2-tuple ``(rf_prediction, xgb_prediction)`` holding the first (and
        only) element of each model's prediction array.
    """
    # Unpickle the three artifacts in the order: scaler, random forest, XGBoost.
    # NOTE: pickle.load executes code embedded in the file, so these files
    # must come from a trusted source.
    loaded = []
    for artifact_path in ('scaler.pkl', 'rf_model.pkl', 'xgb_model.pkl'):
        with open(artifact_path, 'rb') as fh:
            loaded.append(pickle.load(fh))
    fitted_scaler, forest, booster = loaded

    # Build a single-row frame labelled with the training feature names
    row = pd.DataFrame(
        [[rain_fall, fertilizer, temperature, nitrogen, phosphorus, potassium]],
        columns=features,
    )

    # Apply the loaded scaler's transform to the row
    row_scaled = fitted_scaler.transform(row)

    # Query both models on the scaled row
    forest_pred = forest.predict(row_scaled)
    booster_pred = booster.predict(row_scaled)

    return forest_pred[0], booster_pred[0]

# Example usage: one sample set of inputs for a single prediction
rain_fall, fertilizer, temperature = 1230.0, 80.0, 28
nitrogen, phosphorus, potassium = 80.0, 24.0, 20.0

# Run the sample through both saved models and report each prediction
rf_yield, xgb_yield = predict_yield(
    rain_fall=rain_fall,
    fertilizer=fertilizer,
    temperature=temperature,
    nitrogen=nitrogen,
    phosphorus=phosphorus,
    potassium=potassium,
)
print(f"Predicted Yield using RandomForestRegressor: {rf_yield}")
print(f"Predicted Yield using XGBRegressor: {xgb_yield}")

First few rows of the dataset:
   Rain Fall (mm)  Fertilizer  Temperatue  Nitrogen (N)  Phosphorus (P)  \
0          1230.0        80.0          28          80.0            24.0   
1           480.0        60.0          36          70.0            20.0   
2          1250.0        75.0          29          78.0            22.0   
3           450.0        65.0          35          70.0            19.0   
4          1200.0        80.0          27          79.0            22.0   

   Potassium (K)  Yeild (Q/acre)  
0           20.0            12.0  
1           18.0             8.0  
2           19.0            11.0  
3           18.0             9.0  
4           19.0            11.0  
RandomForestRegressor Metrics:
Mean Absolute Error: 0.5895
Mean Squared Error: 0.5584758333333332
Root Mean Squared Error: 0.7473124067840258

XGBRegressor Metrics:
Mean Absolute Error: 0.6400861422220866
Mean Squared Error: 0.6419318422544014
Root Mean Squared Error: 0.8012064916451946
Predicted Yield usin

