In [1]:
# Step 1: Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler

# Step 2: Load the dataset
# The untouched frame is kept as `raw_data`; the cleaned frame used for
# modelling is derived from it below as `data`, so the lineage is explicit.
raw_data = pd.read_csv("crop_yield.csv")
print("Dataset Loaded Successfully")
print(raw_data.head())

# Step 3: Inspect the columns
print("\nActual Columns in Dataset:")
print(raw_data.columns)

# Step 4: Data Preprocessing
print("\nDataset Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
raw_data.info()

# Check for missing values
print("\nMissing Values:")
print(raw_data.isnull().sum())

# Drop rows that have a missing value in any column. dropna() returns a new
# frame, so `raw_data` is left unmodified.
data = raw_data.dropna()

# Step 5: Feature and Target Selection
# Only these four numeric columns are used as predictors; no categorical
# column is selected here, so no encoding step is needed for this model.
X = data[['Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Area']]  # Features
y = data['Yield']  # Target variable

# Step 6: Split the dataset
# Split BEFORE scaling so that the scaler never sees the held-out rows.
# Same test_size / random_state as before, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Scale features: learn mean/std from the training rows only, then apply the
# same transformation to the test rows (avoids train/test leakage).
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix under the train-fitted scaler; kept so that any other
# code that refers to `X_scaled` continues to work.
X_scaled = scaler.transform(X)

# Step 7: Train the model
model = LinearRegression()
model.fit(X_train, y_train)

# Step 8: Evaluate the model on the held-out test rows
y_pred = model.predict(X_test)
print("\nModel Evaluation:")
print("Mean Squared Error:", mean_squared_error(y_test, y_pred))
print("R² Score:", r2_score(y_test, y_pred))

# Step 9: Save the model as a .pkl file
with open('crop_yield_model.pkl', 'wb') as file:
    pickle.dump(model, file)

print("\n✅ Model saved as 'crop_yield_model.pkl'")

# The model was fitted on standardized features, so the fitted scaler is
# saved as well; whoever loads the model needs it to transform raw inputs
# the same way before calling predict().
with open('crop_yield_scaler.pkl', 'wb') as file:
    pickle.dump(scaler, file)

print("✅ Scaler saved as 'crop_yield_scaler.pkl'")


Dataset Loaded Successfully
           Crop  Crop_Year       Season  State     Area  Production  \
0      Arecanut       1997  Whole Year   Assam  73814.0       56708   
1     Arhar/Tur       1997  Kharif       Assam   6637.0        4685   
2   Castor seed       1997  Kharif       Assam    796.0          22   
3      Coconut        1997  Whole Year   Assam  19656.0   126905000   
4  Cotton(lint)       1997  Kharif       Assam   1739.0         794   

   Annual_Rainfall  Fertilizer  Pesticide        Yield  
0           2051.4  7024878.38   22882.34     0.796087  
1           2051.4   631643.29    2057.47     0.710435  
2           2051.4    75755.32     246.76     0.238333  
3           2051.4  1870661.52    6093.36  5238.051739  
4           2051.4   165500.63     539.09     0.420909  

Actual Columns in Dataset:
Index(['Crop', 'Crop_Year', 'Season', 'State', 'Area', 'Production',
       'Annual_Rainfall', 'Fertilizer', 'Pesticide', 'Yield'],
      dtype='object')

Dataset Info:
<class