In [6]:
#importing libraries

import pandas as pd
import sklearn as sk

import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split #preprocessing
from sklearn.linear_model import LinearRegression

import seaborn as sns
from IPython.display import display
import matplotlib.pyplot as plt  # Matlab-style plotting

import os
# Print the full path of every file found under the Kaggle input directory.
# os.walk yields nothing when the directory does not exist, so this is a no-op
# outside Kaggle.
for dirname, _, filenames in os.walk('/kaggle/input'):
    for full_path in (os.path.join(dirname, filename) for filename in filenames):
        print(full_path)

In [11]:
# Load the raw price records; the relative path is resolved against the
# current working directory.
data = pd.read_csv('Vegetable-and-Fruits-Prices-in-India.csv')

In [8]:
# Show the current working directory (the base for relative file paths).
os.getcwd()

'G:\\MCA\\6thSem\\myproject\\Vegetable-and-Fruite-Price-Prediction-main'

In [9]:
# Render the loaded frame as the cell output.
data

Unnamed: 0,datesk,Item Name,Date,price
0,20110101,Kashini greens,1/1/2011,
1,20110101,Pineapple juice,1/1/2011,
2,20110101,Thailand Juice,1/1/2011,
3,20110101,Thailand Jelly,1/1/2011,
4,20110101,Orange juice,1/1/2011,
...,...,...,...,...
1009819,20201231,Mint Leaves,31/12/2020,
1009820,20201231,Chillies small (C.B.P),31/12/2020,
1009821,20201231,Pumpkin Red,31/12/2020,
1009822,20201231,Potato(M),31/12/2020,


In [None]:
#Feature Engineering

In [10]:
# Sort in place by item name so that rows of the same item sit together.
data.sort_values("Item Name", inplace=True)

# Boolean mask: True for every row whose item name has already appeared
# earlier in the sorted frame (i.e. everything except the first row per item).
bool_series = data["Item Name"].duplicated()

# A bare expression in the middle of a cell is evaluated and discarded, so the
# preview has to be rendered explicitly with display().
display(data.head())

# Rows carrying a repeated item name (rendered as the cell's output).
data[bool_series]

Unnamed: 0,datesk,Item Name,Date,price
760110,20170406,Amla,6/4/2017,78.0
192237,20120731,Amla,31-07-2012,76.0
73267,20110809,Amla,9/8/2011,35.0
406350,20140507,Amla,7/5/2014,60.0
353473,20131127,Amla,27/11/2013,30.0
...,...,...,...,...
1009457,20201230,,30/12/2020,
1009458,20201230,,30/12/2020,
1009815,20201231,,31/12/2020,
1009816,20201231,,31/12/2020,


In [None]:
#load the data into a Pandas dataframe
#data = pd.read_csv('Vegetable-and-Fruits-Prices-in-India.csv')

print("Training Data")
display(data)

# Count fully duplicated rows. The count is printed explicitly: it is not the
# last expression of the cell, so Jupyter would otherwise discard it.
# Nothing is removed here; this is a diagnostic only.
print("Duplicate rows:", data.duplicated().sum())
                                    
def isDataMissing(DataToCheck):
    """Display a per-column summary of missing values.

    Shows (via ``display``) a table indexed by column name with two columns:
    ``Total`` - the number of null entries in that column, and ``Percent`` -
    that count divided by the number of rows (a fraction between 0 and 1, not
    multiplied by 100). Rows are ordered by descending null count and only the
    first 30 are shown.

    Parameters
    ----------
    DataToCheck : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    None
        The summary is displayed, not returned.
    """
    # Null count per column, largest first.
    null_counts = DataToCheck.isnull().sum().sort_values(ascending=False)
    # Same ordering as null_counts, expressed as a share of all rows.
    null_share = null_counts / len(DataToCheck)
    summary = pd.concat([null_counts, null_share], axis=1, keys=['Total', 'Percent'])
    display(summary.head(30))
    

In [None]:
# Order the rows alphabetically by item name (in place), then render the frame.
data.sort_values(by="Item Name", inplace=True)
display(data)

In [None]:
# Distinct item names, in order of first appearance.
uniqueValues = data['Item Name'].unique()
print('Unique elements in column "Item Name" ', uniqueValues, sep='\n')

In [None]:
# Number of distinct item names (nunique ignores missing values).
uniqueValues = data['Item Name'].nunique()
print('Number of unique values in column "Item Name" of the dataframe : ', uniqueValues, sep='\n')

In [None]:
# Show, per column, how many values are missing in the loaded data.
print("Check for Missing Data in Training Set")
isDataMissing(data)
# NOTE(review): the original note here read "no /total" - presumably
# "null count / total rows", i.e. how the Percent column is derived; confirm.

In [None]:
# datesk is treated as an id field, so it carries no modelling signal: drop it.
# Reassigning (instead of inplace=True) keeps the step free of hidden mutation.
data = data.drop(columns=['datesk'])
print("Removing rows where item name is blank")
# Keep only rows that have an item name. .copy() makes the result an
# independent frame, so the column assignments below do not raise
# SettingWithCopyWarning or write into a view of the original.
data = data[data['Item Name'].notnull()].copy()
display(data)
# Extract the year from the Date column. The dates are written day-first
# (e.g. 31/12/2020, 31-07-2012), so say so explicitly rather than relying on
# pandas' month-first default with a silent per-value fallback.
data['Date'] = pd.to_datetime(data['Date'], dayfirst=True)
data['year'] = data['Date'].dt.year
# Only the year is used from here on.
data = data.drop(columns='Date')

In [None]:
# Distinct item names remaining after the blank-name rows were removed.
uniqueValues = data['Item Name'].unique()
print('Unique elements in column "Item Name" ', uniqueValues, sep='\n')

In [None]:
# Count of distinct item names remaining after cleaning.
uniqueValues = data['Item Name'].nunique()
print('Number of unique values in column "Item Name" of the dataframe : ', uniqueValues, sep='\n')

In [None]:
# A price of 0 is treated as "missing" by the following steps (those rows are
# later filled with these means), so zeros must not take part in the average:
# they would pull every imputed value downwards. Mask them to NaN first;
# mean() skips NaN.
valid_prices = data.assign(price=data['price'].where(data['price'] != 0))
# Mean per (item, year); groups with no valid price at all get NaN.
MeanPrices = valid_prices.groupby(['Item Name', 'year']).mean()
print("Mean Prices for all the items on yearwise basis")
display(MeanPrices)

In [None]:
# Collect the rows whose price is missing (NaN) or recorded as 0 into their
# own frame.
print("Null Prices Dataset containing null and 0 values for price")
nullPrices = data[data['price'].isna() | data['price'].eq(0)]
display(nullPrices)

In [None]:
# Keep only rows with a real price (present and non-zero); the excluded rows
# get mean values assigned later.
data = data[data['price'].notna() & data['price'].ne(0)]
print("Removing null and 0 prices from the training dataset for feature engineering")
display(data)

In [None]:
print("Assigning Mean Price to training set data where price is 0 or Nan")
# Inner-join each null/zero-price row with the mean price of its
# (Item Name, year) group. After the merge, price_x is the original
# (null or 0) price and price_y is the group mean.
nullPrices = pd.merge(nullPrices, MeanPrices, left_on=['Item Name', 'year'], right_on=['Item Name', 'year'])
display(nullPrices.drop(columns=['price_x']))

# Rebuild those rows with the mean standing in as their price.
df = pd.DataFrame({"Item Name": nullPrices['Item Name'],
                   "year": nullPrices['year'], "price": nullPrices['price_y']})
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0;
# pd.concat is the supported way to stack the two frames.
train_data = pd.concat([data, df])
# print() rather than display(): display() on a str renders its quoted repr.
print("Combine dataset after feature engineering")
display(train_data)

In [None]:
# Keep only rows whose price is strictly greater than 1. This removes zero and
# near-zero prices; NaN prices fail the comparison and are dropped too.
train_data = train_data[train_data['price'] > 1]
# Take an independent snapshot: a plain assignment would only alias the same
# object, so any later in-place change to train_data would alter the "backup".
backup = train_data.copy()
print("Backup")
display(backup)

In [None]:

# One-hot encode the non-numeric columns, then renumber the rows from 0.
train_data = pd.get_dummies(train_data)
train_data = train_data.reset_index(drop=True)

# Distribution of the raw price; the trailing ';' suppresses the object repr.
sns.displot(train_data.loc[:, 'price']);

print("Skewness: " + str(train_data.loc[:, 'price'].skew()))

# Save the current figure (the distribution plot above) as a PNG.
plt.savefig("skewness.png")

In [None]:
# The price histogram is not normally distributed, so replace the price with
# its natural logarithm and plot the transformed distribution.
# Re-running this cell would take the log a second time.
train_data.loc[:, 'price'] = np.log(train_data['price'])
sns.displot(train_data.loc[:, 'price']);
# Save the current figure (the transformed distribution) as a PNG.
plt.savefig("histogram.png")

In [None]:
# Target vector: the price column of the training frame.
train_output = train_data['price']
# Remove target output column Price from training dataset
train_data.drop(columns='price', inplace=True)
print("Training Dataset")
display(train_data)

# Hold out 10% of the rows for evaluation; the fixed random_state makes the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(train_data, train_output, test_size=0.1, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)

y_pred = regressor.predict(X_test)
# scikit-learn metrics take (y_true, y_pred) in that order. MSE is symmetric,
# so the value is unchanged, but the conventional order keeps this correct if
# the metric is ever swapped for an asymmetric one.
mean_squared_error(y_test, y_pred)

In [None]:
# Preparing Test Data for 2021 Price predictions:
# one row per unique item name found in `backup`, all for year 2021.
# The frame is built in a single call instead of appending row by row:
# DataFrame.append was removed in pandas 2.0, and appending inside a loop
# copies the whole frame on every iteration. Built this way, `year` is an
# integer column rather than an object column.
test_data = pd.DataFrame({
    'year': 2021,
    'Item Name': [str(item) for item in backup['Item Name'].unique()],
})

display(test_data)

In [None]:
#One Hot Encoding

In [None]:
# One-hot encode the test frame and renumber its rows. If `year` was encoded
# into a `year_2021` indicator column, rename it back to `year`; the rename is
# a no-op when no such column exists.
test_data_one_hot = (
    pd.get_dummies(test_data)
    .reset_index(drop=True)
    .rename(columns={'year_2021': 'year'})
)
# Store the actual year value rather than an indicator flag.
test_data_one_hot['year'] = 2021

print("Testing Data")
display(test_data_one_hot)

In [None]:
# Fit the regressor on all of train_data / train_output, then predict for the
# prepared 2021 rows.
regressor.fit(train_data, train_output)

predictions = regressor.predict(test_data_one_hot)

# The predictions are logarithmic values; np.exp converts them back to normal
# form. Item name and year are attached by index alignment with test_data.
preds = pd.DataFrame({'price': np.exp(predictions)}).assign(
    **{'Item Name': test_data['Item Name'], 'year': test_data['year']}
)
print("Average price predictions for 2021 year ")
display(preds)

In [None]:
# Keep only the item name and its predicted price for the final result.
result_df = preds.loc[:, ['Item Name', 'price']]

In [None]:
# Preview the first rows of the result.
result_df.head()

In [None]:
# Preview the last rows of the result.
result_df.tail()

In [None]:
# Write the predictions to CSV. index=False leaves out the positional row
# index, which carries no information and would otherwise become an unnamed
# first column when the file is read back.
result_df.to_csv('Predict_vegetable_2021_price.csv', index=False)