# Vegetable Price Prediction Using Machine Learning
Implemented for West Bengal, Kolkata district, Bara Bazar (Posta Bazar) market (Commodity = Potato, Variety = Jyoti)

In [2]:
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import time
%matplotlib inline

# Reading Dataset
Vegetable Data is taken from https://agmarknet.gov.in

In [3]:
# Load the raw market price table and preview its first rows.
raw_prices_path = 'West_Bengal.csv'
vegetables = pd.read_csv(raw_prices_path)
vegetables.head()

Unnamed: 0,Sl no.,District Name,Market Name,Commodity,Variety,Grade,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
0,1,Jalpaiguri,Alipurduar,Potato,Red,FAQ,2185,2210,2200,30-Dec-19
1,2,Jalpaiguri,Alipurduar,Potato,Jyoti,FAQ,1785,1810,1800,30-Dec-19
2,3,Jalpaiguri,Alipurduar,Potato,Red,FAQ,2185,2210,2200,26-Dec-19
3,4,Jalpaiguri,Alipurduar,Potato,Jyoti,FAQ,1785,1810,1800,26-Dec-19
4,5,Jalpaiguri,Alipurduar,Potato,Red,FAQ,2185,2210,2200,24-Dec-19


# Drop Unnecessary Columns
`Sl no.`, `Commodity`, and `Grade` are not useful, so drop them.

In [4]:
# Remove the columns that play no part in the rest of the notebook.
unused_columns = ['Sl no.', 'Commodity', 'Grade']
vegetables = vegetables.drop(columns=unused_columns)

# Data Selection

Select District Name == Kolkata <br>
Select Market Name == Bara Bazar (Posta Bazar) <br>
Then select Variety == Jyoti

In [5]:
# Build one boolean mask per criterion (district, market, variety) and keep
# only the rows that satisfy all three.
is_kolkata = vegetables['District Name'] == "Kolkata"
is_bara_bazar = vegetables['Market Name'] == "Bara Bazar (Posta Bazar)"
is_jyoti = vegetables['Variety'] == "Jyoti"
vegetables = vegetables[is_kolkata & is_bara_bazar & is_jyoti]
vegetables.head()

Unnamed: 0,District Name,Market Name,Variety,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
1550,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2200,2220,2220,31-Dec-19
1552,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2200,2220,2220,30-Dec-19
1554,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2280,2300,2300,27-Dec-19
1556,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2300,2320,2320,26-Dec-19
1558,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2360,2370,2360,24-Dec-19


In [6]:
# Replace the 'Price Date' strings with the day of the month (1-31).
# `vegetables` was produced by boolean filtering, so writing a column into it
# directly triggers pandas' SettingWithCopyWarning; `.assign` returns a new
# frame instead of mutating the filtered one.
price_dates = pd.to_datetime(vegetables['Price Date'])
vegetables = vegetables.assign(**{'Price Date': price_dates.dt.day})
# Reverse the row order. The preview above lists the newest date first, so
# after this the most recent rows are at the end.
# NOTE(review): re-running this cell re-parses the already converted column
# and flips the order again; run it exactly once per kernel session.
vegetables = vegetables.iloc[::-1]
vegetables.tail(10)

Unnamed: 0,District Name,Market Name,Variety,Min Price (Rs./Quintal),Max Price (Rs./Quintal),Modal Price (Rs./Quintal),Price Date
1567,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2000,2020,2000,17
1565,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2200,2210,2200,18
1564,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2160,2170,2160,19
1562,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2200,2240,2220,20
1560,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2350,2360,2360,23
1558,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2360,2370,2360,24
1556,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2300,2320,2320,26
1554,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2280,2300,2300,27
1552,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2200,2220,2220,30
1550,Kolkata,Bara Bazar (Posta Bazar),Jyoti,2200,2220,2220,31


# Reading Climate Data
Weather Data is taken from https://www.wunderground.com/ 

In [7]:
# Weather columns copied from the climate table into the price table.
# Defined once so the four places that need the list cannot drift apart.
CLIMATE_COLUMNS = [
    'Temp_Max', 'Temp_Avg', 'Temp_Min',
    'Dew_Max', 'Dew_Avg', 'Dew_Min',
    'Humid_Max', 'Humid_Avg', 'Humid_Min',
    'Wind_Max', 'Wind_Avg', 'Wind_Min',
    'Pressure_Avg', 'Precipitation_Total',
]
TARGET_COLUMN = 'Modal Price (Rs./Quintal)'

climate = pd.read_csv('climate_Kolkata_210.csv')

# Renumber the rows 0..n-1. Using drop=True with reassignment (instead of
# inplace=True) keeps the cell safe to re-run: the in-place form inserts an
# extra 'index' / 'level_0' column on each run and eventually raises.
vegetables = vegetables.reset_index(drop=True)

# Column assignment aligns on index labels, so row i of `climate` lands on
# row i of `vegetables`.
# NOTE(review): this assumes the climate file has one row per price row, in
# the same order - confirm; missing rows would silently become NaN.
vegetables[CLIMATE_COLUMNS] = climate[CLIMATE_COLUMNS]
print(vegetables.head())

# `data` feeds the pair plot; X / y are the model inputs and target.
data = vegetables[['Price Date', TARGET_COLUMN] + CLIMATE_COLUMNS]
X = vegetables[['Price Date'] + CLIMATE_COLUMNS]
y = vegetables[[TARGET_COLUMN]]
print(X.shape)

FileNotFoundError: [Errno 2] File b'climate_Kolkata_210.csv' does not exist: b'climate_Kolkata_210.csv'

In [None]:
# Pairwise plots of every column in `data` (day, modal price, weather columns).
sns.pairplot(data=data)

# Data is split into train and test sets.

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
import math

In [None]:
# Leave the final row out, then make a seeded, shuffled 70/30 split of the rest.
features_without_last = X[:-1]
target_without_last = y[:-1]
X_train, X_test, y_train, y_test = train_test_split(
    features_without_last,
    target_without_last,
    test_size=0.3,
    random_state=42,
    shuffle=True,
)

# Implementation Of Algorithm 
Implementing Random Forest algorithm

In [None]:
import sklearn

# scikit-learn 1.0 renamed the "mae" criterion to "absolute_error" and later
# releases reject the old spelling, so pick the name that matches the
# installed version.
sklearn_major = int(sklearn.__version__.split(".")[0])
mae_criterion = "absolute_error" if sklearn_major >= 1 else "mae"

# perf_counter is a monotonic clock intended for measuring elapsed time.
start = time.perf_counter()
regr = RandomForestRegressor(
    n_estimators=140,
    max_depth=300,
    random_state=42,
    criterion=mae_criterion,
    min_samples_split=2,
    oob_score=True,
)
# y_train is a single-column DataFrame; flatten it to the 1-D target that
# fit() is given.
regr.fit(X_train, np.ravel(y_train))
end = time.perf_counter()
print(f"Runtime of the program is {end - start}")

In [None]:
# Predict the target for the held-out test rows with the fitted forest.
predicted = regr.predict(X_test)

# R² Score for Training and Testing Sets

In [None]:
# Print the model's score on the training split, then on the test split.
for split_features, split_target in ((X_train, y_train), (X_test, y_test)):
    print(regr.score(split_features, split_target))

# Feature Importance

In [None]:
# Print each feature name next to its importance from the fitted forest.
# Iterating over the columns themselves (rather than a hard-coded count of 15)
# keeps the cell correct if features are added to or removed from X.
for feature_name, importance in zip(X.columns, regr.feature_importances_.tolist()):
    print(feature_name, "->", importance)

# Predicted vs Actual

In [None]:
# Flatten the single-column test target to 1-D, then print every
# "prediction->actual" pair while accumulating the absolute error.
# `sum_a` and `l` are read by the next cell.
b = y_test.values.ravel()
l = len(b)
sum_a = 0
for predicted_price, actual_price in zip(predicted, b):
    print(str(predicted_price) + "->" + str(actual_price))
    sum_a += abs(predicted_price - actual_price)

In [None]:
# Mean absolute error on the test split: the accumulated absolute error
# divided by the number of test rows.
print(sum_a/l)

# Create pickle file

In [None]:
import pickle

# Serialise the fitted forest to disk. The context manager closes the file
# even if pickle.dump raises, which the manual open()/close() pair did not.
filename = 'Kolkata_potato'
with open(filename, 'wb') as outfile:
    pickle.dump(regr, outfile)

In [None]:
filename = 'Kolkata_potato'
# NOTE: pickle.load can execute arbitrary code; only load files this notebook
# wrote itself.
with open(filename, "rb") as f:
    rf = pickle.load(f)
# The with-block has already closed the file, so no explicit close is needed.

# Predict for every row of X; this includes rows the model was trained on.
predicted = rf.predict(X)
actual = np.ravel(y)

# Summed absolute error over all rows, computed in one vectorised step.
# The name `sum` (which shadows the builtin) is kept because the following
# cell prints sum/len(actual).
sum = np.abs(actual - predicted).sum()

# Total Error (mean absolute error over all rows)

In [None]:
# Mean absolute error over all rows: the summed absolute error divided by the
# number of rows.
print(sum/len(actual)) 