In [None]:
#imports

import pandas as pd
import numpy as np
import seaborn as sns
import pandas_profiling as pf
import pickle

In [None]:
# Reading the Training data

data = pd.read_csv("Train.csv")
data

In [None]:
data.info()

In [None]:
#### Quick stats info

#pf.ProfileReport(data).to_widgets()

### Observations

#### Item_Weight

1. Numerical data
2. Has 1463 Missing Values
3. Mean and Median are almost equal hence no outliers are present (have to recheck)
4. Data is normally distributed with shorter and bulged tails

#### Item_Fat_Content

1. Categorical data
2. Contains the level of fat content - Low Fat and Regular. But the same category appears under several spellings (e.g. `LF`, `low fat`, `reg`), which have to be merged.

#### Item_Visibility

1. Numerical data
2. Has 526 Zeros, the zeros indicate that the item is not available
3. Right Skewed and has a long tail.
4. Outliers are not present while considering the mean and median
5. Data is distributed between 0 and 0.33

#### Item_Type

1. Categorical data
2. Contains the item's Category

#### Item_MRP

1. Numerical data
2. No skewness; bulged kurtosis at the ends

#### Outlet_Identifier

1. Categorical data
2. Contains 10 outlets and the data is collected almost equally from all outlets (except 2 outlets)

#### Outlet_Establishment_Year

1. Categorical data
2. Contains the year of outlet's establishment
3. Can calculate the age of the outlet

#### Outlet_Size

1. Categorical Data
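2. 2046 values are missing (the "Handling missing values" summary below lists 2406; one of the two figures is a typo - verify with `data.Outlet_Size.isna().sum()`)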
3. 3 Categories - Small , Medium, High 
4. Half of the outlets are medium sized

#### Outlet_location

1. Categorical data
2. Contains the type of location the outlet is present
3. Tier1 , Tier2, Tier3

#### Outlet_Type

1. Categorical data
2. 2 major categories - Supermarket and Grocery. Supermarket is split into ( type 1, type 2, type 3)
3. Most data is from Supermarket Type 1

### Target : Item_Outlet_Sales

#### The feature denotes the sales of an item at the specific outlet

1. Numerical data
2. Difference between mean and median is 400, therefore outliers may be present
3. Right Skewed and has a long tail with visible outliers



### Handling missing values

 Item_weight - 1463
 
 Outlet_size - 2406


In [None]:
def mean_imputer(data):
    """Fill the missing entries of ``data`` with its mean, in place.

    Parameters
    ----------
    data : pandas Series or DataFrame
        Object whose missing values are replaced. It is modified in place.

    Returns
    -------
    The same object that was passed in, after filling.
    """
    fill_value = data.mean()
    data.fillna(value=fill_value, inplace=True)
    return data

def mode_imputer(data):
    """Fill the missing entries of ``data`` with its most frequent value, in place.

    Parameters
    ----------
    data : pandas Series
        Series whose missing values are replaced. It is modified in place.

    Returns
    -------
    The same Series that was passed in. When every value is missing there
    is no mode to impute with, and the Series is returned unchanged.
    """
    modes = data.mode()
    if modes.empty:
        # mode() ignores missing values, so an all-missing input yields an
        # empty result; indexing it would raise instead of imputing.
        return data
    # .iloc[0] picks the first mode by position rather than by index label.
    data.fillna(modes.iloc[0], inplace=True)
    return data

In [None]:
# Write the imputed column back explicitly: filling a Series pulled out of the
# frame in place is not guaranteed to update `data` (it does not under pandas
# Copy-on-Write).
data["Item_Weight"] = mean_imputer(data["Item_Weight"])
data["Item_Weight"]

In [None]:
# Write the imputed column back explicitly: filling a Series pulled out of the
# frame in place is not guaranteed to update `data` (it does not under pandas
# Copy-on-Write).
data["Outlet_Size"] = mode_imputer(data["Outlet_Size"])
data["Outlet_Size"]

In [None]:
data.Outlet_Size.describe()

In [None]:
# Outlet age in years, measured against the hard-coded reference year 2021.
data["Outlet_Age"] = 2021 - data["Outlet_Establishment_Year"] 

In [None]:
data["Outlet_Age"]

In [None]:
# The establishment year is replaced by Outlet_Age, so remove the raw column in place.
data.drop(columns="Outlet_Establishment_Year", inplace=True)

In [None]:
data

In [None]:
# Spearman correlation between the numeric columns only. `data` still holds
# string columns at this point, and recent pandas versions raise when asked to
# correlate them instead of silently skipping them.
corr_matrix = data.select_dtypes(include="number").corr(method="spearman")

In [None]:
corr_matrix["Item_Outlet_Sales"]
# Item_MRP is highly correlated with the target feature

In [None]:
#Splitting the data to categorical and numerical data

def cat_num_split(data):
    """Split a DataFrame into its numeric-like and categorical columns.

    A column counts as categorical when its dtype is ``object``, a pandas
    string dtype or a pandas categorical dtype; every other column goes to
    the numeric frame. Comparing against ``object`` alone misses the string
    and categorical dtypes, which would send text columns to the numeric
    frame.

    Parameters
    ----------
    data : pandas DataFrame

    Returns
    -------
    (data_num, data_cat) : tuple of DataFrames
        Column order within each frame follows ``data``.
    """
    def _is_categorical(dtype):
        return dtype == object or isinstance(
            dtype, (pd.CategoricalDtype, pd.StringDtype)
        )

    flags = [_is_categorical(dtype) for dtype in data.dtypes]
    cat_columns = [col for col, is_cat in zip(data.columns, flags) if is_cat]
    num_columns = [col for col, is_cat in zip(data.columns, flags) if not is_cat]

    data_num = data[num_columns]
    data_cat = data[cat_columns]

    return data_num , data_cat
    

In [None]:
data_num , data_cat = cat_num_split(data)


In [None]:
data_num

In [None]:
data_cat

In [None]:
#To drop unwanted columns from the data
def drop_unwanted(data,columns):
    """Drop the given columns from ``data`` in place and return it.

    Parameters
    ----------
    data : pandas DataFrame
        Frame to modify in place.
    columns : str or iterable of column labels
        Column(s) to remove. A single name may be passed as a plain string;
        without this, a string would be iterated character by character.

    Returns
    -------
    The same DataFrame that was passed in.

    Raises
    ------
    KeyError
        If any requested column is absent. All columns are dropped in one
        call, so a failure leaves ``data`` untouched rather than half-dropped.
    """
    if isinstance(columns, str):
        columns = [columns]
    data.drop(columns=list(columns), inplace=True)
    return data

In [None]:

#drop_unwanted(data_cat, ["Item_Identifier"])
drop_unwanted(data_num, ["Item_Weight"])


In [None]:
# replacing repeated categorical names with the appropriate one

def repeated_cat_replace(data):
    """Collapse alternative spellings of the fat-content labels, in place.

    "LF" and "low fat" become "Low Fat"; "reg" becomes "Regular".
    Returns the same object that was passed in.
    """
    spellings = (
        ("LF", "Low Fat"),
        ("low fat", "Low Fat"),
        ("reg", "Regular"),
    )
    for alias, canonical in spellings:
        data.replace(alias, canonical, inplace = True)

    return data

In [None]:
# Write the cleaned column back explicitly: replacing values in place on a
# Series pulled out of the frame is not guaranteed to update `data_cat` (it
# does not under pandas Copy-on-Write).
data_cat["Item_Fat_Content"] = repeated_cat_replace(data_cat["Item_Fat_Content"])
data_cat["Item_Fat_Content"]

In [None]:
#Encoding the catergorical data

def encode_cat_data(z):
    """One-hot encode ``z`` with a freshly fitted ``OneHotEncoder``.

    Returns a tuple ``(dense_array, categories)``: the encoded values
    converted to a dense array, and the encoder's ``categories_`` attribute.
    """
    from sklearn.preprocessing import OneHotEncoder

    one_hot = OneHotEncoder()
    dense = one_hot.fit_transform(z).toarray()
    return dense , one_hot.categories_
    

In [None]:
#To normalize the data
def normalized_data(data):
    """Scale the numeric columns and return the model feature and target.

    Each column is min-max scaled to [0, 1] independently. The previous
    implementation used sklearn's ``Normalizer``, which rescales every *row*
    to unit norm; since the rows contain the target, the returned feature
    was divided by a quantity that depends on the target (target leakage).

    Parameters
    ----------
    data : pandas DataFrame
        Numeric frame with exactly four columns. They are relabelled by
        position as Item_Visibility, Item_Mrp, Item_Outlet_sales, Outlet_Age.

    Returns
    -------
    (X, y) : tuple of single-column DataFrames
        ``X`` holds the scaled ``Item_Mrp`` column and ``y`` the scaled
        ``Item_Outlet_sales`` column. Both keep the index of ``data``.

    Raises
    ------
    ValueError
        If ``data`` does not have exactly four columns.
    """
    col_min = data.min()
    col_range = data.max() - col_min
    # A constant column has zero range; divide by 1 so it scales to 0, not NaN.
    col_range = col_range.where(col_range != 0, 1)

    data_num_norm = (data - col_min) / col_range
    # Positional relabelling, as before; raises ValueError on a length mismatch.
    data_num_norm.columns = ["Item_Visibility","Item_Mrp","Item_Outlet_sales","Outlet_Age"]

    X = data_num_norm[["Item_Mrp"]]
    y = data_num_norm[["Item_Outlet_sales"]]

    return X ,y
    

In [None]:
X, y = normalized_data(data_num)

In [None]:
#regression_model(X,y)

In [None]:
# Trying out regression with multiple features
#X2 = cat_encoded
#y2 = normalized_data(data_num)[1]

#from sklearn.linear_model import LinearRegression
#regression = LinearRegression()
#regression.fit(X=X2, y=y2) 
#regression.score(X2,y2)


In [None]:
#Splitting categorical data into nominal and ordinal data

ordinal_cols = ["Item_Fat_Content","Outlet_Size","Outlet_Location_Type"]
nominal_cols = ["Item_Identifier","Item_Type","Outlet_Identifier","Outlet_Type"]

# Take explicit copies: both frames get columns reassigned and dropped in
# later cells, and writing into a slice of data_cat triggers
# SettingWithCopyWarning.
nominal_cat_data = data_cat[nominal_cols].copy()
ordinal_cat_data = data_cat[ordinal_cols].copy()

In [None]:
nominal_cat_data

In [None]:
# Handling ordinal data with label encoding

# Map each ordinal category to an integer code
ordinal_cat_data["Item_Fat_Content"] = ordinal_cat_data["Item_Fat_Content"].map(
    {"Low Fat": 0, "Regular":1}
)
ordinal_cat_data["Outlet_Size"] = ordinal_cat_data["Outlet_Size"].map(
    {"Medium":1, "Small":0, "High":2}
)
ordinal_cat_data["Outlet_Location_Type"] = ordinal_cat_data["Outlet_Location_Type"].map(
    {"Tier 1":1, "Tier 2":2, "Tier 3":3}
)

# Only Outlet_Size is somewhat correlated to the target, so it is the one
# ordinal feature kept for testing; the other two are removed.
for unused_col in ("Item_Fat_Content", "Outlet_Location_Type"):
    ordinal_cat_data.drop(columns=unused_col, inplace=True)



In [None]:
# Handling Nominal Data

# Count encoding for Item_Identifier since it has high cardinality

def count_encoding(data):
    """Replace every value of ``data`` with the number of times it occurs.

    Returns a new object; the input is not modified.
    """
    frequencies = data.value_counts().to_dict()
    encoded = data.map(frequencies)
    return encoded

nominal_cat_data["Item_Identifier"] = count_encoding(nominal_cat_data["Item_Identifier"])

# Neither Item_Identifier nor Item_Type is correlated to the target
nominal_cat_data.drop(["Item_Identifier", "Item_Type"], axis=1, inplace=True)

# Applying one hot encoding on Outlet_Identifier.
# The encoder returns one column per category, so the result is kept as its own
# frame with one named column per category; assigning a multi-column frame to a
# single column raises.
vals , cols = encode_cat_data(nominal_cat_data[["Outlet_Identifier"]])
outlet_identifier_encoded = pd.DataFrame(
    vals,
    columns=[f"Outlet_Identifier_{category}" for category in cols[0]],
    index=nominal_cat_data.index,
)

# Applying one hot encoding on Outlet_Type
vals2 , cols2 = encode_cat_data(nominal_cat_data[["Outlet_Type"]])
outlet_type_encoded = pd.DataFrame(
    vals2,
    columns=[f"Outlet_Type_{category}" for category in cols2[0]],
    index=nominal_cat_data.index,
)

# Swap the raw string columns for their one-hot columns
nominal_cat_data = pd.concat(
    [
        nominal_cat_data.drop(columns=["Outlet_Identifier", "Outlet_Type"]),
        outlet_identifier_encoded,
        outlet_type_encoded,
    ],
    axis=1,
)




In [None]:
# Start from the encoded nominal features and add the one ordinal feature
# that was kept, selected by name.
final_cat_data = nominal_cat_data.copy()
final_cat_data["Outlet_Size"] = ordinal_cat_data["Outlet_Size"]

In [None]:
final_cat_data

In [None]:
#Combining categorical and Numerical features

# normalized_data returns (X, y); X is a one-column frame named "Item_Mrp"
final_features = final_cat_data.copy()
scaled_mrp, _ = normalized_data(data_num)
final_features["Item_MRP"] = scaled_mrp["Item_Mrp"]

In [None]:
final_features

In [None]:
# Linear Regression Model

from sklearn.linear_model import LinearRegression
regression = LinearRegression()
# Fits on whatever X / y are bound to when this cell runs; in top-to-bottom
# order that is the full output of normalized_data(data_num), before any split.
regression.fit(X,y)
# In-sample score: computed on the same X / y the model was just fitted on,
# so it is not a held-out estimate.
regression.score(X,y)

In [None]:
test = pd.read_csv("Test.csv")

In [None]:
from sklearn.preprocessing import Normalizer
nor = Normalizer()
# Despite the name, nor_df is whatever fit() returns (used below as a fitted
# transformer), not a DataFrame.
nor_df = nor.fit(X =test[["Item_MRP"]])
#test_num = pd.DataFrame(nor_df, columns = ["Item_MRP"])
#nor_df

# NOTE(review): Normalizer rescales each row to unit norm, so with a single
# input column every non-zero value presumably becomes 1.0 - confirm this is
# the intended test-set transform. The result is only displayed, not stored.
nor_df.transform(test[["Item_MRP"]])
    


In [None]:
from sklearn.model_selection import train_test_split
X_train,X_test, y_train,y_test = train_test_split(X,y,test_size=0.20, random_state=42)

In [None]:
X = X_train
y = y_train

In [None]:
# `regression` was fitted earlier on the full X / y, which includes the rows
# now in X_test. Refit on the training split so the score below is held out.
regression.fit(X_train, y_train)
regression.score(X_test,y_test)

In [None]:
from sklearn.tree import DecisionTreeRegressor
# Fixed seed (same value as the train/test split) so re-running the notebook
# reproduces the same tree and score.
tree_reg = DecisionTreeRegressor(random_state=42)
tree_reg.fit(X_train,y_train)

In [None]:
tree_reg.score(X_test,y_test)

In [None]:
from sklearn.ensemble import RandomForestRegressor

In [None]:
# Fixed seed (same value as the train/test split) so the forest, its score and
# the pickled model are reproducible.
reg_forest = RandomForestRegressor(random_state=42)
# y_train is a one-column frame; flatten it to the 1-D target the forest
# expects, which avoids sklearn's column-vector warning.
reg_forest.fit(X_train, y_train.values.ravel())

In [None]:
reg_forest.score(X_test,y_test)

In [None]:
file = "Final_Model.sav"
# Use a context manager so the file handle is flushed and closed once the
# model has been written.
with open(file, "wb") as model_file:
    pickle.dump(reg_forest, model_file)

In [None]:
# Use a context manager so the file handle is closed after loading.
# pickle.load can execute arbitrary code: only load files this notebook wrote.
with open(file, "rb") as model_file:
    saved_model = pickle.load(model_file)

In [None]:
saved_model