# Crop price prediction .ipynb

### Importing necessary libraries

In [None]:
import pandas as pd
import numpy as np
import xgboost as xgb
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestRegressor

### Reading Data Set

In [None]:
data = pd.read_csv("/content/Price_Agriculture_commodities_Week.csv")
data.head(10)

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min Price,Max Price,Modal Price
0,Gujarat,Amreli,Damnagar,Bhindi(Ladies Finger),Bhindi,FAQ,27-07-2023,4100.0,4500.0,4350.0
1,Gujarat,Amreli,Damnagar,Brinjal,Other,FAQ,27-07-2023,2200.0,3000.0,2450.0
2,Gujarat,Amreli,Damnagar,Cabbage,Cabbage,FAQ,27-07-2023,2350.0,3000.0,2700.0
3,Gujarat,Amreli,Damnagar,Cauliflower,Cauliflower,FAQ,27-07-2023,7000.0,7500.0,7250.0
4,Gujarat,Amreli,Damnagar,Coriander(Leaves),Coriander,FAQ,27-07-2023,8400.0,9000.0,8850.0
5,Gujarat,Amreli,Damnagar,Ginger(Green),Green Ginger,FAQ,27-07-2023,11000.0,14000.0,12500.0
6,Gujarat,Amreli,Damnagar,Green Chilli,Green Chilly,FAQ,27-07-2023,7200.0,8000.0,7550.0
7,Gujarat,Amreli,Damnagar,Guar,Gwar,FAQ,27-07-2023,6800.0,7500.0,7350.0
8,Gujarat,Amreli,Damnagar,Lemon,Lemon,FAQ,27-07-2023,1850.0,2550.0,2200.0
9,Gujarat,Amreli,Damnagar,Tomato,Local,FAQ,27-07-2023,9800.0,10000.0,9950.0


In [None]:
data = data.dropna()

### Data set information

In [None]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23093 entries, 0 to 23092
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   State         23093 non-null  object 
 1   District      23093 non-null  object 
 2   Market        23093 non-null  object 
 3   Commodity     23093 non-null  object 
 4   Variety       23093 non-null  object 
 5   Grade         23093 non-null  object 
 6   Arrival_Date  23093 non-null  object 
 7   Min Price     23093 non-null  float64
 8   Max Price     23093 non-null  float64
 9   Modal Price   23093 non-null  float64
dtypes: float64(3), object(7)
memory usage: 1.8+ MB


In [None]:
# Normalise the listed column headers to lower snake_case.
# "Grade" is not in the list, so it keeps its original capitalised name.
data.rename(
    columns={
        original: original.lower().replace(" ", "_")
        for original in (
            "State",
            "District",
            "Market",
            "Commodity",
            "Variety",
            "Arrival_Date",
            "Min Price",
            "Max Price",
            "Modal Price",
        )
    },
    inplace=True,
)

In [None]:
# Scale all three price columns down by a factor of 100 in one assignment.
# NOTE(review): presumably converts a per-quintal price to per-kg — confirm the unit.
# Re-running this cell divides again, so run it exactly once per load.
data[['min_price', 'max_price', 'modal_price']] = (
    data[['min_price', 'max_price', 'modal_price']].div(100)
)

In [None]:
data.head()

Unnamed: 0,state,district,market,commodity,variety,Grade,arrival_date,min_price,max_price,modal_price
0,Gujarat,Amreli,Damnagar,Bhindi(Ladies Finger),Bhindi,FAQ,27-07-2023,41.0,45.0,43.5
1,Gujarat,Amreli,Damnagar,Brinjal,Other,FAQ,27-07-2023,22.0,30.0,24.5
2,Gujarat,Amreli,Damnagar,Cabbage,Cabbage,FAQ,27-07-2023,23.5,30.0,27.0
3,Gujarat,Amreli,Damnagar,Cauliflower,Cauliflower,FAQ,27-07-2023,70.0,75.0,72.5
4,Gujarat,Amreli,Damnagar,Coriander(Leaves),Coriander,FAQ,27-07-2023,84.0,90.0,88.5


### Defining the feature matrices (X, X1) and target vectors (Y, Y1)

In [None]:
# First eight columns of `data` (positions 0-7, i.e. everything up to and
# including min_price). Take an explicit copy so the label-encoding cell
# further down writes to an independent frame instead of a slice of `data`.
X1 = data.iloc[:, :8].copy()

In [None]:
Y1 = data['modal_price']

In [None]:
# Features for the first model: commodity name plus the min/max price band.
# .copy() gives X its own data, so the in-place encoding below no longer
# triggers SettingWithCopyWarning or depends on pandas' view-vs-copy rules.
X = data[['commodity','min_price','max_price']].copy()
Y = data['modal_price']
# Pick out the string (object-dtype) columns that need encoding.
cat_mask = (X.dtypes==object)
cat_cols = X.columns[cat_mask].tolist()
# Replace each string column with integer codes. The single encoder is refit
# per column, so afterwards `le` only remembers the last column it saw.
le = LabelEncoder()
X[cat_cols] = X[cat_cols].apply(lambda x:le.fit_transform(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_cols] = X[cat_cols].apply(lambda x:le.fit_transform(x))


In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.2,random_state=42)

### XGBoost Model

In [None]:
def xgb_model(random_state=None):
    """Tune and return a StandardScaler + XGBRegressor pipeline.

    Runs a small randomized hyper-parameter search (3 candidates, 4-fold CV,
    scored by negative mean squared error) on the notebook-level ``X_train``
    and ``Y_train`` variables, which must exist before this is called.

    Parameters
    ----------
    random_state : int or None, default None
        Seed forwarded to both the regressor and the randomized search.
        ``None`` keeps the previous unseeded behaviour; pass an integer to
        make repeated calls return the same model.

    Returns
    -------
    sklearn.pipeline.Pipeline
        The best pipeline found. ``RandomizedSearchCV`` refits it on the whole
        training set by default, so it is ready to predict as returned.
    """
    st = StandardScaler()
    xgb_reg = xgb.XGBRegressor(random_state=random_state)
    steps = [('scaler',st),('model',xgb_reg)]
    xgb_pipeline = Pipeline(steps)

    param = {
        # 0.05, 0.10, ..., 1.00. The previous np.arange(0.05, 1.05) used the
        # default step of 1 and therefore only ever offered subsample=0.05.
        'model__subsample': np.linspace(0.05, 1.0, 20),
        'model__max_depth': np.arange(3,20,1),
        # 0.10, 0.15, ..., 1.00; linspace avoids float drift past the valid upper bound of 1.
        'model__colsample_bytree': np.linspace(0.1, 1.0, 19),
        # 0.1, 0.2, ..., 0.9. A learning rate of 0 is excluded because such a
        # model never moves away from its initial prediction.
        'model__learning_rate': np.linspace(0.1, 0.9, 9),
    }
    rand = RandomizedSearchCV(
        estimator=xgb_pipeline,
        param_distributions=param,
        n_iter=3,
        scoring='neg_mean_squared_error',
        cv=4,
        random_state=random_state,
    )
    rand.fit(X_train,Y_train)
    model = rand.best_estimator_
    return model

In [None]:
model = xgb_model()

In [None]:
model.fit(X_train,Y_train)

### Testing data R² score is **93.35%** (`.score()` on a regressor returns R², not classification accuracy)

In [None]:
model.score(X_test,Y_test)

0.9335376916523213

In [None]:
model1 = xgb_model()

In [None]:
model1.fit(X_train,Y_train)

#### Testing data R² score is **95.16%** (evaluated on `X_test`, not on the training data)

In [None]:
model1.score(X_test,Y_test)

0.9516380363710533

In [None]:
import pickle

In [None]:
import pickle
import pandas as pd
from sklearn.preprocessing import LabelEncoder
# Train a fresh XGBoost pipeline; xgb_model() runs the randomized search again,
# nothing is loaded from disk here.
model = xgb_model()
# We also need the label encoder for the commodity column.
# .copy() makes X independent of `data`, so the in-place encoding below does
# not raise SettingWithCopyWarning.
X = data[['commodity','min_price','max_price']].copy()
cat_mask = (X.dtypes==object)
cat_cols = X.columns[cat_mask].tolist()
le_com = LabelEncoder()
X[cat_cols] = X[cat_cols].apply(lambda x:le_com.fit_transform(x))

# Bundle the model and label encoder in a dictionary. This cell only keeps the
# bundle in memory; it is not written to disk here.
deploy = {}
deploy['model'] = model
deploy['le_c'] = le_com

# Read both objects back out of the dictionary:
regressor = deploy['model']
le_com = deploy['le_c']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X[cat_cols] = X[cat_cols].apply(lambda x:le_com.fit_transform(x))


In [None]:
y_pred = regressor.predict(X_test)

In [None]:
Y_test

Unnamed: 0,modal_price
14987,63.3
3813,74.0
9201,50.0
19057,22.5
3571,13.4
...,...
4577,21.0
621,9.5
22601,16.0
20848,16.0


#### Using Random Forest Regressor

In [None]:
rf = RandomForestRegressor()

In [None]:
# Label-encode every object-dtype column of X1 in place.
# One encoder is refit column by column, so after this cell `le` only holds
# the classes of the last encoded column.
le = LabelEncoder()
cat_mask = X1.dtypes.eq(object)
cat_cols = X1.columns[cat_mask].tolist()
X1[cat_cols] = X1[cat_cols].apply(le.fit_transform)

In [None]:
rf.fit(X1,Y1)

In [None]:
rf.feature_importances_

array([4.05655882e-03, 3.22742752e-03, 4.60922646e-03, 9.34367079e-03,
       2.52067605e-03, 3.79434683e-04, 1.06775009e-03, 9.74795256e-01])

In [None]:
from sklearn.feature_selection import RFE

In [None]:
rfe = RFE(estimator=rf,n_features_to_select = 4, step = 5, verbose=1 )

In [None]:
rfe.fit(X1,Y1)

Fitting estimator with 8 features.


In [None]:
rfe.support_

array([ True, False,  True,  True, False, False, False,  True])

In [None]:
# Reduced feature set: state and commodity only (no price columns).
# .copy() makes Xt independent of `data`, so the in-place encoding below does
# not raise SettingWithCopyWarning.
Xt = data[['state','commodity']].copy()
cat_mask = (Xt.dtypes==object)
cat_cols = Xt.columns[cat_mask].tolist()
# The encoder is refit per column; afterwards `le` only remembers the last one.
le = LabelEncoder()
Xt[cat_cols] = Xt[cat_cols].apply(lambda x:le.fit_transform(x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Xt[cat_cols] = Xt[cat_cols].apply(lambda x:le.fit_transform(x))


In [None]:
# Fixed seed (same value as the other splits in this notebook) so the score
# reported below is reproducible across runs.
X_t,X_te,Y_t,Y_te = train_test_split(Xt,Y1,test_size=0.2,random_state=42)

In [None]:
rf.fit(X_t,Y_t)

#### Testing data R² score is 74.02%

In [None]:
rf.score(X_te,Y_te)

0.7402001270023368

In [None]:
rf.predict(X_te)

array([107.85993673,  61.61807747,  52.6028729 , ...,  25.80291946,
        47.49402017,  75.47710449])

In [None]:
Y_te

Unnamed: 0,modal_price
17682,90.00
1239,57.50
9231,80.00
19263,7.00
4315,18.50
...,...
1534,48.00
17181,28.00
17973,43.00
17885,49.83


## Rebuilding the model

In [None]:
df = pd.read_csv("/content/Price_Agriculture_commodities_Week.csv")

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23093 entries, 0 to 23092
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   State         23093 non-null  object 
 1   District      23093 non-null  object 
 2   Market        23093 non-null  object 
 3   Commodity     23093 non-null  object 
 4   Variety       23093 non-null  object 
 5   Grade         23093 non-null  object 
 6   Arrival_Date  23093 non-null  object 
 7   Min Price     23093 non-null  float64
 8   Max Price     23093 non-null  float64
 9   Modal Price   23093 non-null  float64
dtypes: float64(3), object(7)
memory usage: 1.8+ MB


In [None]:
df.head()

Unnamed: 0,State,District,Market,Commodity,Variety,Grade,Arrival_Date,Min Price,Max Price,Modal Price
0,Gujarat,Amreli,Damnagar,Bhindi(Ladies Finger),Bhindi,FAQ,27-07-2023,4100.0,4500.0,4350.0
1,Gujarat,Amreli,Damnagar,Brinjal,Other,FAQ,27-07-2023,2200.0,3000.0,2450.0
2,Gujarat,Amreli,Damnagar,Cabbage,Cabbage,FAQ,27-07-2023,2350.0,3000.0,2700.0
3,Gujarat,Amreli,Damnagar,Cauliflower,Cauliflower,FAQ,27-07-2023,7000.0,7500.0,7250.0
4,Gujarat,Amreli,Damnagar,Coriander(Leaves),Coriander,FAQ,27-07-2023,8400.0,9000.0,8850.0


In [None]:
df = df.dropna()

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23093 entries, 0 to 23092
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   State         23093 non-null  object 
 1   District      23093 non-null  object 
 2   Market        23093 non-null  object 
 3   Commodity     23093 non-null  object 
 4   Variety       23093 non-null  object 
 5   Grade         23093 non-null  object 
 6   Arrival_Date  23093 non-null  object 
 7   Min Price     23093 non-null  float64
 8   Max Price     23093 non-null  float64
 9   Modal Price   23093 non-null  float64
dtypes: float64(3), object(7)
memory usage: 1.8+ MB


#### Dropping unnecessary Columns

In [None]:
# Remove both columns that the rebuilt model does not use, in a single call.
df.drop(columns=['Grade', 'Variety'], inplace=True)

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23093 entries, 0 to 23092
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   State         23093 non-null  object 
 1   District      23093 non-null  object 
 2   Market        23093 non-null  object 
 3   Commodity     23093 non-null  object 
 4   Arrival_Date  23093 non-null  object 
 5   Min Price     23093 non-null  float64
 6   Max Price     23093 non-null  float64
 7   Modal Price   23093 non-null  float64
dtypes: float64(3), object(5)
memory usage: 1.4+ MB


In [None]:
# Rename the columns to snake_case.
# The date key must match the CSV header exactly ("Arrival_Date"): rename()
# silently ignores keys that are not present, and the reorder cell below
# selects the column as 'arrival_date'.
df.rename(columns={
    "State": "state",
    "District": "district",
    "Market": "market",
    "Commodity": "commodity_name",
    "Arrival_Date": "arrival_date",
    "Min Price": "min_price",
    "Max Price": "max_price",
    "Modal Price": "modal_price"
}, inplace=True)

In [None]:
print(df.columns.tolist())

['state', 'district', 'market', 'commodity_name', 'arrival_date', 'min_price', 'max_price', 'modal_price']


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23093 entries, 0 to 23092
Data columns (total 8 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   state           23093 non-null  object 
 1   district        23093 non-null  object 
 2   market          23093 non-null  object 
 3   commodity_name  23093 non-null  object 
 4   arrival_date    23093 non-null  object 
 5   min_price       23093 non-null  float64
 6   max_price       23093 non-null  float64
 7   modal_price     23093 non-null  float64
dtypes: float64(3), object(5)
memory usage: 1.4+ MB


In [None]:
# Desired column order: the six feature columns first, then the target
# (modal_price) and the date, so the features can be taken as "all but the
# last two columns".
new_order = [
    'commodity_name',
    'state',
    'district',
    'market',
    'min_price',
    'max_price',
    'modal_price',
    'arrival_date',
]  # Change as needed

# Rebuild the DataFrame with its columns in that order
df = df.loc[:, new_order]

In [None]:
df.head(5)

Unnamed: 0,commodity_name,state,district,market,min_price,max_price,modal_price,arrival_date
0,Bhindi(Ladies Finger),Gujarat,Amreli,Damnagar,4100.0,4500.0,4350.0,27-07-2023
1,Brinjal,Gujarat,Amreli,Damnagar,2200.0,3000.0,2450.0,27-07-2023
2,Cabbage,Gujarat,Amreli,Damnagar,2350.0,3000.0,2700.0,27-07-2023
3,Cauliflower,Gujarat,Amreli,Damnagar,7000.0,7500.0,7250.0,27-07-2023
4,Coriander(Leaves),Gujarat,Amreli,Damnagar,8400.0,9000.0,8850.0,27-07-2023


In [None]:
# Every column except the last two. Copy explicitly so that encoding X in
# place later neither warns nor risks touching the string columns of `df`.
X = df.iloc[:, :-2].copy()

In [None]:
print(X.columns)

Index(['commodity_name', 'state', 'district', 'market', 'min_price',
       'max_price'],
      dtype='object')


In [None]:
# Integer-encode the four string columns in one pass, reusing the `le`
# encoder created in an earlier cell. It is refit for each column in turn, so
# it ends up fitted on 'market' only.
X[['commodity_name', 'state', 'district', 'market']] = (
    X[['commodity_name', 'state', 'district', 'market']].apply(le.fit_transform)
)

In [None]:
Y = df['modal_price']

#### Using Random Forest Regressor

In [None]:
X_train,X_test,Y_train,Y_test = train_test_split(X,Y,test_size=0.3,random_state=42)

In [None]:
rf.fit(X_train,Y_train)

#### Testing data R² score is **98.83%** (evaluated on `X_test`, not on the training data)

In [None]:
rf.score(X_test,Y_test)

0.9883438249738651

In [None]:
model = xgb_model()

In [None]:
model.fit(X_train,Y_train)

#### Testing data R² score is **93.89%**

In [None]:
model.score(X_test,Y_test)

0.9389158343038285

In [None]:
# One fitted encoder per categorical column, ready to be pickled for inference.
# Unfitted encoders cannot transform anything once loaded, so each is fitted
# here on the original string column of `df`. LabelEncoder assigns codes by
# sorted class order, so these mappings match the codes the model was trained on.
le_c = LabelEncoder().fit(df['commodity_name'])
le_s = LabelEncoder().fit(df['state'])
le_d = LabelEncoder().fit(df['district'])
le_m = LabelEncoder().fit(df['market'])

In [None]:
# Bundle the random-forest model with the four label encoders and write the
# bundle to saved_steps.pkl in the current working directory.
deploy = dict(model=rf, le_c=le_c, le_s=le_s, le_d=le_d, le_m=le_m)
with open('saved_steps.pkl', mode='wb') as file:
    pickle.dump(deploy, file)

In [None]:
# Read the bundle back to check that it round-trips.
# pickle.load can execute arbitrary code, so only open files you wrote yourself.
with open('saved_steps.pkl', mode='rb') as file:
    mod = pickle.load(file)

In [None]:
df['market'].unique()

array(['Damnagar', 'Gurgaon', 'Palampur', ..., 'Balarampur', 'Kasipur',
       'Jainagar'], dtype=object)