In [None]:
# Step 1: Install Dependencies (if needed)
!pip install xgboost joblib scikit-learn pandas numpy

# Step 2: Import Libraries
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Step 3: Load Dataset
df = pd.read_csv("/content/cleaned_data (1).csv")

# Step 4: Handle Missing Values (if any)
# Rebind instead of mutating in place so the cell reads as a plain pipeline.
df = df.dropna()

# Step 5: Name the feature groups and the target once
categorical_cols = ["State_Name", "District_Name", "Season", "Crop"]
target_col = "Production"

# Step 6: Split rows BEFORE fitting any preprocessing.
# Fitting the encoder/scaler on the full frame would let test rows shape the
# fitted (and later persisted) preprocessing objects. Same test_size and
# random_state as the array-based split this replaces.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
y_train = train_df[target_col]
y_test = test_df[target_col]

# Step 7: Encode Categorical Features (fit on training rows only).
# handle_unknown="ignore" encodes categories unseen during fit as all zeros.
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_train = onehot_encoder.fit_transform(train_df[categorical_cols])
encoded_test = onehot_encoder.transform(test_df[categorical_cols])

# Step 8: Scale Numerical Features (Area) using training statistics only
scaler = StandardScaler()
area_train = scaler.fit_transform(train_df[["Area"]])
area_test = scaler.transform(test_df[["Area"]])

# Step 9: Assemble feature matrices.
# Column order: one-hot columns, Crop_Year, scaled Area.
X_train = np.hstack((encoded_train, train_df[["Crop_Year"]].values, area_train))
X_test = np.hstack((encoded_test, test_df[["Crop_Year"]].values, area_test))

# Step 10: Train XGBoost Model
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=100, learning_rate=0.1, max_depth=6)
xgb_reg.fit(X_train, y_train)

# Step 11: Model Evaluation on the held-out rows
y_pred = xgb_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

# Step 12: Save the Model and Preprocessing Objects
# All three are needed at inference time: raw inputs must pass through the
# same encoder and scaler before reaching the model.
joblib.dump(xgb_reg, "xgboost_production_model2.pkl")  # Save trained model
joblib.dump(onehot_encoder, "onehot_encoder.pkl")  # Save OneHotEncoder
joblib.dump(scaler, "scaler.pkl")  # Save StandardScaler

print("✅ Model and encoders saved successfully!")


Model Performance:
RMSE: 5448071.57
R² Score: 0.8140
✅ Model and encoders saved successfully!


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score

# Load Dataset
df = pd.read_csv("/content/cleaned_data (1).csv")

# Set to True to write the trained model and encoder to disk.
# Off by default: the dump calls in this cell were previously disabled, yet the
# final message still claimed the files had been saved.
SAVE_ARTIFACTS = False

# Log-transform Area: log1p(x) = log(1 + x), so zero areas map to zero
df["Log_Area"] = np.log1p(df["Area"])

# Encode Categorical Features with OneHotEncoder
categorical_cols = ["State_Name", "District_Name", "Season", "Crop"]
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_categorical = onehot_encoder.fit_transform(df[categorical_cols])

# Define Features & Target.
# Column order: one-hot columns, Crop_Year, Log_Area.
X = np.hstack((encoded_categorical, df[["Crop_Year", "Log_Area"]].values))
y = df["Production"]

# Train-Test Split (80-20)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost Model
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=200, learning_rate=0.1, max_depth=6)
xgb_reg.fit(X_train, y_train)

# Model Evaluation
y_pred = xgb_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

# Save Model and Encoder for Deployment (only when explicitly enabled).
# No scaler is saved here: this cell does not create one.
if SAVE_ARTIFACTS:
    joblib.dump(xgb_reg, "xgboost_production_model.pkl")
    joblib.dump(onehot_encoder, "onehot_encoder.pkl")
    print("🎯 Model training complete! Files saved: xgboost_production_model.pkl, onehot_encoder.pkl")
else:
    print("🎯 Model training complete! Nothing was written to disk (SAVE_ARTIFACTS is False).")

Model Performance:
RMSE: 5359832.49
R² Score: 0.8200
🎯 Model training complete! Files saved: xgboost_production_model.pkl, onehot_encoder.pkl


In [None]:
import pandas as pd
import numpy as np

# Read the cleaned dataset
df = pd.read_csv("/content/cleaned_data (1).csv")

# Pairwise correlation matrix of the two columns of interest
# (DataFrame.corr defaults to the Pearson coefficient).
columns_of_interest = ["Area", "Production"]
area_vs_production = df[columns_of_interest]
correlation = area_vs_production.corr()

print("Correlation between Area and Production:\n", correlation)


Correlation between Area and Production:
                 Area  Production
Area        1.000000    0.040545
Production  0.040545    1.000000


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import mean_squared_error, r2_score


# Load Dataset
df = pd.read_csv("/content/cleaned_data (1).csv")

# Set to True to write the trained model and encoder to disk.
# Off by default: the dump calls in this cell were previously disabled, yet the
# final message still claimed the files had been saved.
SAVE_ARTIFACTS = False

# Encode Categorical Features
categorical_cols = ["State_Name", "District_Name", "Season", "Crop"]
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_categorical = onehot_encoder.fit_transform(df[categorical_cols])

# Define Features & Target.
# Area is deliberately left out; Crop_Year is the only numeric feature.
X = np.hstack((encoded_categorical, df[["Crop_Year"]].values))
y = df["Production"]

# Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train XGBoost Model
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=500, learning_rate=0.05, max_depth=6)
xgb_reg.fit(X_train, y_train)

# Model Evaluation
y_pred = xgb_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

# Save Model and Encoder for Deployment (only when explicitly enabled).
# No scaler is saved here: this cell does not create one.
if SAVE_ARTIFACTS:
    joblib.dump(xgb_reg, "xgboost_production_model.pkl")
    joblib.dump(onehot_encoder, "onehot_encoder.pkl")
    print("🎯 Model training complete! Files saved: xgboost_production_model.pkl, onehot_encoder.pkl")
else:
    print("🎯 Model training complete! Nothing was written to disk (SAVE_ARTIFACTS is False).")

Model Performance:
RMSE: 6400077.53
R² Score: 0.7433
🎯 Model training complete! Files saved: xgboost_production_model.pkl, onehot_encoder.pkl


In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
import joblib
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.metrics import mean_squared_error, r2_score

# Load Dataset
df = pd.read_csv("/content/cleaned_data (1).csv")

# Set to True to write the trained model and preprocessing objects to disk.
SAVE_ARTIFACTS = False

categorical_cols = ["State_Name", "District_Name", "Season", "Crop"]
numeric_cols = ["Year_Diff", "Crop_Avg_Production"]

# Split rows FIRST. Crop_Avg_Production is built from the target, so computing
# it on the full frame would leak test-set Production values into the features
# used to score those same rows.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42)
y_train = train_df["Production"]
y_test = test_df["Production"]

# Statistics learned from the training rows only
year_min = train_df["Crop_Year"].min()
crop_means = train_df.groupby("Crop")["Production"].mean()
global_mean = y_train.mean()  # fallback for crops that never appear in training

# Convert Year into a Relative Feature and add Crop Impact
# (average training Production per crop). assign() returns new frames, so the
# split results are not modified in place.
train_feat = train_df.assign(
    Year_Diff=train_df["Crop_Year"] - year_min,
    Crop_Avg_Production=train_df["Crop"].map(crop_means).fillna(global_mean),
)
test_feat = test_df.assign(
    Year_Diff=test_df["Crop_Year"] - year_min,
    Crop_Avg_Production=test_df["Crop"].map(crop_means).fillna(global_mean),
)

# OneHot Encode Categorical Features (State, District, Season, Crop)
onehot_encoder = OneHotEncoder(handle_unknown="ignore", sparse_output=False)
encoded_train = onehot_encoder.fit_transform(train_feat[categorical_cols])
encoded_test = onehot_encoder.transform(test_feat[categorical_cols])

# Scale Numerical Features with training statistics
scaler = StandardScaler()
scaled_train = scaler.fit_transform(train_feat[numeric_cols])
scaled_test = scaler.transform(test_feat[numeric_cols])

# Define Features.
# Column order: one-hot columns, scaled Year_Diff, scaled Crop_Avg_Production.
X_train = np.hstack((encoded_train, scaled_train))
X_test = np.hstack((encoded_test, scaled_test))

# Train XGBoost Model
xgb_reg = xgb.XGBRegressor(objective="reg:squarederror", n_estimators=500, learning_rate=0.05, max_depth=6)
xgb_reg.fit(X_train, y_train)

# Model Evaluation
y_pred = xgb_reg.predict(X_test)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)

print("Model Performance:")
print(f"RMSE: {rmse:.2f}")
print(f"R² Score: {r2:.4f}")

# Save Model and Encoders for Deployment (only when explicitly enabled).
# NOTE(review): inference also needs year_min, crop_means and global_mean to
# rebuild the two numeric features; persist them too if this model is deployed.
if SAVE_ARTIFACTS:
    joblib.dump(xgb_reg, "xgboost_production_model.pkl")
    joblib.dump(onehot_encoder, "onehot_encoder.pkl")
    joblib.dump(scaler, "year_scaler.pkl")  # Save scaler for numerical features

print("🎯 Model training complete! Crop Name now affects predictions!")


Model Performance:
RMSE: 6576824.57
R² Score: 0.7289
🎯 Model training complete! Crop Name now affects predictions!
