# Crop Price Prediction 
## Step 1: Load and Inspect Data

In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('Crop_Price.csv')

# Initial inspection
print(f'Dataset shape: {df.shape}')
df.head()

Dataset shape: (49, 9)


Unnamed: 0,State,Crop,CostCultivation,CostCultivation2,Production,Yield,Temperature,RainFall Annual,Price
0,Uttar Pradesh,ARHAR,9794.05,9800.25,1941.55,9.83,28.96,3373.2,19589.1
1,Karnataka,ARHAR,10593.15,10594.15,2172.46,7.47,29.22,3520.7,21187.3
2,Gujarat,ARHAR,13468.82,13469.82,1898.3,9.59,28.47,2957.4,26938.64
3,Andhra Pradesh,ARHAR,17051.66,17052.66,3670.54,6.42,28.49,3079.6,34104.32
4,Maharashtra,ARHAR,17130.55,17131.55,2775.8,8.72,28.3,2566.7,34262.1


## Step 2: Data Cleaning
### Handle missing values and inconsistencies

In [2]:
# Remove empty rows
df = df.dropna(how='all')

# Standardize crop names
df['Crop'] = df['Crop'].str.strip().str.upper()

# Check duplicate columns
if (df['CostCultivation'] == df['CostCultivation2']).all():
    df = df.drop(columns=['CostCultivation2'])

# Handle missing values
num_cols = ['Production', 'Yield', 'Temperature', 'RainFall Annual']
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

cat_cols = ['State', 'Crop']
df[cat_cols] = df[cat_cols].fillna('Unknown')

## Step 3: Feature Engineering
### Create new features and transform existing ones

In [3]:
# Create cost efficiency feature
df['CostPerYield'] = df['CostCultivation'] / df['Yield']

# Log transform skewed features
df['LogPrice'] = np.log1p(df['Price'])

# Temporal aggregation (example)
state_avg = df.groupby('State')['Price'].mean().to_dict()
df['StateAvgPrice'] = df['State'].map(state_avg)

## Step 4: Feature Encoding
### Convert categorical variables

In [4]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
encoded_cols = encoder.fit_transform(df[cat_cols])
encoded_df = pd.DataFrame(encoded_cols, columns=encoder.get_feature_names_out(cat_cols))

# Combine with numerical features
processed_df = pd.concat([df.drop(columns=cat_cols), encoded_df], axis=1)

## Step 5: Feature Scaling
### Normalize numerical features

In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

# Split data
X = processed_df.drop(columns=['Price', 'LogPrice'])
y = processed_df['LogPrice']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Scale features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

## Step 6: Train the model 

In [7]:
from xgboost import XGBRegressor

# Initialize and fit the XGBoost regressor
xgb_model = XGBRegressor(random_state=42)
xgb_model.fit(X_train_scaled, y_train)

print("XGBoost model has been fitted on the training data.")

XGBoost model has been fitted on the training data.


## Step 7: Example data to predict value

In [8]:
# Create an example row (using values from row 0 as an example)
example_dict = {
    'CostCultivation': 9794.05,
    'CostCultivation2': 9800.25,
    'Production': 1941.55,
    'Yield': 9.83,
    'Temperature': 28.96,
    'RainFall Annual': 3373.2,
    'CostPerYield': 996.342828,
    'StateAvgPrice': 31880.037143,
    'State_Andhra Pradesh': 0.0,
    'State_Bihar': 0.0,
    'State_Gujarat': 0.0,
    'State_Haryana': 0.0,
    'State_Karnataka': 0.0,
    'State_Madhya Pradesh': 0.0,
    'State_Maharashtra': 0.0,
    'State_Orissa': 0.0,
    'State_Punjab': 0.0,
    'State_Rajasthan': 0.0,
    'State_Tamil Nadu': 0.0,
    'State_Uttar Pradesh': 1.0,
    'State_West Bengal': 0.0,
    'Crop_ARHAR': 1.0,
    'Crop_COTTON': 0.0,
    'Crop_GRAM': 0.0,
    'Crop_GROUNDNUT': 0.0,
    'Crop_MAIZE': 0.0,
    'Crop_MOONG': 0.0,
    'Crop_MUSTARD': 0.0,
    'Crop_PADDY': 0.0,
    'Crop_SUGARCANE': 0.0,
    'Crop_WHEAT': 0.0
}

# Ensure the column order matches the training X columns
X_columns = ['CostCultivation', 'CostCultivation2', 'Production', 'Yield', 'Temperature', 
             'RainFall Annual', 'CostPerYield', 'StateAvgPrice', 'State_Andhra Pradesh', 
             'State_Bihar', 'State_Gujarat', 'State_Haryana', 'State_Karnataka', 
             'State_Madhya Pradesh', 'State_Maharashtra', 'State_Orissa', 'State_Punjab', 
             'State_Rajasthan', 'State_Tamil Nadu', 'State_Uttar Pradesh', 'State_West Bengal', 
             'Crop_ARHAR', 'Crop_COTTON', 'Crop_GRAM', 'Crop_GROUNDNUT', 'Crop_MAIZE', 
             'Crop_MOONG', 'Crop_MUSTARD', 'Crop_PADDY', 'Crop_SUGARCANE', 'Crop_WHEAT']

example_row = pd.DataFrame([example_dict], columns=X_columns)

# Scale the features using the previously fitted scaler
example_row_scaled = scaler.transform(example_row)

# Predict using the trained xgb_model
pred_logprice = xgb_model.predict(example_row_scaled)

# Optionally convert back from log price using np.expm1
pred_price = np.expm1(pred_logprice)

print("Predicted LogPrice:", pred_logprice[0])
print("Predicted Price:", pred_price[0])

Predicted LogPrice: 9.882299
Predicted Price: 19579.695


In [9]:
accuracy = xgb_model.score(X_test_scaled, y_test)
print("Model R^2 Accuracy:", accuracy)

Model R^2 Accuracy: 0.9845567375789991


## Saving the model

In [11]:
import pickle

with open('Crop_price_prediction_model.pkl', 'wb') as f:
    pickle.dump(xgb_model, f)