In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

# Crop-yield regression: load the dataset, one-hot encode the categorical
# columns, fit a random forest, report test RMSE, and predict one new sample.

# Load the dataset from the zipped CSV.
data = pd.read_csv('archive.zip', compression='zip')

# Preview the data
print(data.head())

# Normalise column names: trim whitespace, lower-case, spaces -> underscores.
# Only the column NAMES change; cell values are left exactly as loaded.
data.rename(columns=lambda x: x.strip().lower().replace(' ', '_'), inplace=True)

# Feature groups are defined once so that training and prediction cannot drift
# apart.
categorical_columns = ['region', 'soil_type', 'crop']
numeric_features = ['rainfall_mm', 'temperature_celsius']

# One-hot encode the categorical columns; each becomes '<column>_<value>'.
data_encoded = pd.get_dummies(data, columns=categorical_columns)

# Collect the generated dummy columns in the order they appear in the frame.
dummy_prefixes = tuple(f'{name}_' for name in categorical_columns)
dummy_features = [col for col in data_encoded.columns if col.startswith(dummy_prefixes)]

# Define the input features and the target variable (crop yield)
X = data_encoded[numeric_features + dummy_features]
y = data_encoded['yield_tons_per_hectare']

# Split the data into training and testing sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestRegressor model (fixed seed for reproducibility)
model = RandomForestRegressor(n_estimators=100, random_state=42)

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test set
y_pred = model.predict(X_test)

# Evaluate the model's performance
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
print(f"Root Mean Squared Error (RMSE): {rmse}")

# Predict crop yield for new data (replace with new input values).
# Categorical values must be spelled exactly as they appear in the dataset.
new_sample = {
    'rainfall_mm': 150.0,
    'temperature_celsius': 25.0,
    'region': 'West',
    'soil_type': 'Sandy',
    'crop': 'Cotton',
}

# Build the row with the same columns, in the same order, as the training
# matrix. Passing a bare NumPy array would drop the feature names the model was
# fitted with, and an all-zero one-hot block describes no real category.
new_row = dict.fromkeys(X.columns, 0.0)
for name in numeric_features:
    new_row[name] = new_sample[name]
for name in categorical_columns:
    dummy_name = f"{name}_{new_sample[name]}"
    if dummy_name not in new_row:
        # A category never seen in the data has no column to switch on.
        raise ValueError(
            f"Unknown {name} value {new_sample[name]!r}: "
            f"no '{dummy_name}' column in the training features"
        )
    new_row[dummy_name] = 1.0

new_data = pd.DataFrame([new_row], columns=X.columns)
predicted_yield = model.predict(new_data)
print(f"Predicted Crop Yield: {predicted_yield[0]}")


  Region Soil_Type     Crop  Rainfall_mm  Temperature_Celsius  \
0   West     Sandy   Cotton   897.077239            27.676966   
1  South      Clay     Rice   992.673282            18.026142   
2  North      Loam   Barley   147.998025            29.794042   
3  North     Sandy  Soybean   986.866331            16.644190   
4  South      Silt    Wheat   730.379174            31.620687   

   Fertilizer_Used  Irrigation_Used Weather_Condition  Days_to_Harvest  \
0            False             True            Cloudy              122   
1             True             True             Rainy              140   
2            False            False             Sunny              106   
3            False             True             Rainy              146   
4             True             True            Cloudy              110   

   Yield_tons_per_hectare  
0                6.555816  
1                8.527341  
2                1.127443  
3                6.517573  
4                7.24825