In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
%matplotlib inline
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv('/content/Forecasting_train_1 (1).csv')
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Type,Item_MRP,Fat_Content,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,1,249.8092,1.0,1,3735.138
1,DRC01,5.92,2,48.2692,1.0,2,443.4228
2,FDN15,17.5,3,141.618,1.0,1,2097.27
3,FDX07,19.2,4,182.095,,4,732.38
4,NCD19,8.93,5,53.8614,2.0,1,994.7052


In [5]:
df.shape

(8523, 7)

In [6]:
# statistical info
df.describe()

Unnamed: 0,Item_Weight,Item_Type,Item_MRP,Fat_Content,Outlet_Type,Item_Outlet_Sales
count,7060.0,8523.0,8523.0,6113.0,8523.0,8523.0
mean,12.857645,6.700927,140.992782,1.933748,1.709492,2181.288914
std,4.643456,3.73099,62.275067,0.918306,1.089985,1706.499616
min,4.555,1.0,31.29,1.0,1.0,33.29
25%,8.77375,4.0,93.8265,1.0,1.0,834.2474
50%,12.6,6.0,143.0128,2.0,1.0,1794.331
75%,16.85,8.0,185.6437,3.0,2.0,3101.2964
max,21.35,16.0,266.8884,3.0,4.0,13086.9648


In [7]:
# datatype of attributes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item_Identifier    8523 non-null   object 
 1   Item_Weight        7060 non-null   float64
 2   Item_Type          8523 non-null   int64  
 3   Item_MRP           8523 non-null   float64
 4   Fat_Content        6113 non-null   float64
 5   Outlet_Type        8523 non-null   int64  
 6   Item_Outlet_Sales  8523 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 466.2+ KB


In [8]:
# check unique values in dataset
df.apply(lambda x: len(x.unique()))

Item_Identifier      1559
Item_Weight           416
Item_Type              16
Item_MRP             5938
Fat_Content             4
Outlet_Type             4
Item_Outlet_Sales    3493
dtype: int64

## Preprocessing Data

In [9]:
# check for null values
df.isnull().sum()

Item_Identifier         0
Item_Weight          1463
Item_Type               0
Item_MRP                0
Fat_Content          2410
Outlet_Type             0
Item_Outlet_Sales       0
dtype: int64

In [10]:
# check for categorical attributes
cat_col = []
for x in df.dtypes.index:
    if df.dtypes[x] == 'object':
        cat_col.append(x)
cat_col

['Item_Identifier']

In [11]:
# print the categorical columns
for col in cat_col:
    print(col)
    print(df[col].value_counts())
    print()

Item_Identifier
FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: Item_Identifier, Length: 1559, dtype: int64



In [12]:
# fill the missing values
item_weight_mean = df.pivot_table(values = "Item_Weight", index = 'Item_Identifier')
item_weight_mean

Unnamed: 0_level_0,Item_Weight
Item_Identifier,Unnamed: 1_level_1
DRA12,11.600
DRA24,19.350
DRA59,8.270
DRB01,7.390
DRB13,6.115
...,...
NCZ30,6.590
NCZ41,19.850
NCZ42,10.500
NCZ53,9.600


In [13]:
miss_bool = df['Item_Weight'].isnull()
miss_bool

0       False
1       False
2       False
3       False
4       False
        ...  
8518    False
8519    False
8520    False
8521    False
8522    False
Name: Item_Weight, Length: 8523, dtype: bool

In [14]:
for i, item in enumerate(df['Item_Identifier']):
    if miss_bool[i]:
        if item in item_weight_mean:
            df['Item_Weight'][i] = item_weight_mean.loc[item]['Item_Weight']
        else:
            df['Item_Weight'][i] = np.mean(df['Item_Weight'])

In [15]:
df['Item_Weight'].isnull().sum()

0

In [17]:
Fat_Content_mode = df.pivot_table(values='Fat_Content', columns='Outlet_Type', aggfunc=(lambda x: x.mode()[0]))
Fat_Content_mode

Outlet_Type,1,2,3,4
Fat_Content,3.0,1.0,1.0,3.0


In [19]:
miss_bool = df['Fat_Content'].isnull()
df.loc[miss_bool, 'Fat_Content'] = df.loc[miss_bool, 'Outlet_Type'].apply(lambda x: Fat_Content_mode[x])

In [20]:
df['Fat_Content'].isnull().sum()

0

## Input Split

In [21]:
df.isnull().sum()

Item_Identifier      0
Item_Weight          0
Item_Type            0
Item_MRP             0
Fat_Content          0
Outlet_Type          0
Item_Outlet_Sales    0
dtype: int64

In [22]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Item_Identifier    8523 non-null   object 
 1   Item_Weight        8523 non-null   float64
 2   Item_Type          8523 non-null   int64  
 3   Item_MRP           8523 non-null   float64
 4   Fat_Content        8523 non-null   float64
 5   Outlet_Type        8523 non-null   int64  
 6   Item_Outlet_Sales  8523 non-null   float64
dtypes: float64(4), int64(2), object(1)
memory usage: 466.2+ KB


In [23]:
from sklearn.model_selection import train_test_split
X = df.drop(columns=['Item_Identifier', 'Item_Outlet_Sales'])
y = df['Item_Outlet_Sales']

In [24]:
X

Unnamed: 0,Item_Weight,Item_Type,Item_MRP,Fat_Content,Outlet_Type
0,9.300,1,249.8092,1.0,1
1,5.920,2,48.2692,1.0,2
2,17.500,3,141.6180,1.0,1
3,19.200,4,182.0950,3.0,4
4,8.930,5,53.8614,2.0,1
...,...,...,...,...,...
8518,6.865,7,214.5218,2.0,1
8519,8.380,6,108.1570,3.0,1
8520,10.600,13,85.1224,3.0,1
8521,7.210,7,103.1332,1.0,2


In [25]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8523 entries, 0 to 8522
Data columns (total 5 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Item_Weight  8523 non-null   float64
 1   Item_Type    8523 non-null   int64  
 2   Item_MRP     8523 non-null   float64
 3   Fat_Content  8523 non-null   float64
 4   Outlet_Type  8523 non-null   int64  
dtypes: float64(3), int64(2)
memory usage: 333.1 KB


In [26]:
y

0       3735.1380
1        443.4228
2       2097.2700
3        732.3800
4        994.7052
          ...    
8518    2778.3834
8519     549.2850
8520    1193.1136
8521    1845.5976
8522     765.6700
Name: Item_Outlet_Sales, Length: 8523, dtype: float64

In [27]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size  = 0.2, random_state = 42 )

In [28]:
print(X.shape, X_train.shape, X_test.shape)

(8523, 5) (6818, 5) (1705, 5)


In [29]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score
LR = LinearRegression()
LR.fit(X_train,y_train)
y_pred = LR.predict(X_test)
#coef2 = pd.Series(LR.coef_,features).sort_values()

In [30]:
r2_Linear_Regression = r2_score(y_test, y_pred)
print('R2 score of Linear regression:',r2_Linear_Regression)

R2 score of Linear regression: 0.431451107830363


In [31]:
y_pred

array([1609.5713745 ,  612.74185129, 1515.41519594, ..., 1042.95690915,
       1269.1074562 , 1621.30971956])

In [32]:
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler 
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
from sklearn.model_selection import KFold, cross_val_score
import sklearn.metrics as metrics
from math import sqrt
from sklearn import linear_model
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error
# Ignore Warnings
import warnings 
warnings.filterwarnings('ignore')

In [33]:
RF = RandomForestRegressor(n_estimators=200,max_depth=5, min_samples_leaf=200,n_jobs=5,random_state=32)

In [34]:
# Fitting the model on our trained dataset.
RF.fit(X_train,y_train)

# Making Predictions
y_pred = RF.predict(X_test)

In [35]:
r2_random = r2_score(y_test, y_pred)
print('R2 score of Random regression:',r2_random)


R2 score of Random regression: 0.5953329393766973


In [36]:
y_pred

array([1333.4672117 ,  685.17868117,  704.16743226, ...,  686.5920425 ,
        752.19091586, 1679.12299301])

In [37]:
training_data_prediction = RF.predict(X_train)
r2_random = metrics.r2_score(y_train, training_data_prediction)
print("R Squared value", r2_random)

R Squared value 0.5775454570584593


In [38]:
import xgboost as xgb
from xgboost import XGBRegressor

regressor = XGBRegressor()
regressor.fit(X_train, y_train)

In [40]:
X_train

Unnamed: 0,Item_Weight,Item_Type,Item_MRP,Fat_Content,Outlet_Type
549,9.500,4,171.3448,1.0,1
7757,18.000,5,170.5422,3.0,1
764,17.600,3,111.7202,3.0,1
6867,8.325,4,41.6138,3.0,1
2716,12.850,7,155.5630,3.0,1
...,...,...,...,...,...
5734,9.395,4,139.1838,3.0,4
5191,15.600,8,75.6670,3.0,1
5390,17.600,13,237.3590,3.0,1
860,20.350,7,117.9466,3.0,1


In [39]:
X_test

Unnamed: 0,Item_Weight,Item_Type,Item_MRP,Fat_Content,Outlet_Type
7503,14.300000,8,79.4302,2.0,1
2957,7.930000,13,42.7086,3.0,1
7031,14.500000,11,42.0454,1.0,1
1084,12.857645,2,173.7054,1.0,3
856,10.195000,3,197.5110,3.0,1
...,...,...,...,...,...
7205,11.800000,7,127.1704,3.0,1
3257,7.020000,4,148.1734,1.0,2
6346,14.500000,11,42.0454,2.0,1
6318,9.800000,6,50.5008,1.0,2


In [41]:

training_data_prediction = regressor.predict(X_train)
r2_train = metrics.r2_score(y_train, training_data_prediction)
print("R Squared value", r2_train)

R Squared value 0.8084955252266532


In [42]:
testing_data_prediction = regressor.predict(X_test)
r2_test = metrics.r2_score(y_test, testing_data_prediction)
print("R Squared value", r2_test)

R Squared value 0.5318036873314818


In [48]:
import pickle
pickle_out = open("RF.pkl","wb")
pickle.dump(RF, pickle_out)
pickle_out.close()

In [49]:
import numpy as np

In [50]:
#pipe.predict(pd.DataFrame(columns=['name','company','year','kms_driven','fuel_type'],data=np.array(['Maruti Suzuki Swift','Maruti',2019,100,'Petrol']).reshape(1,5)))


In [53]:
RF.predict(pd.DataFrame(columns=['Item_Weight','Item_Type','Item_MRP','Fat_Content','Outlet_Type'],data=np.array([20,11,80,1,1]).reshape(1,5)))

array([1391.29998871])

In [52]:
#regressor.predict([[20,11,80,1,1]])