In [30]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder, \
OrdinalEncoder
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
# Show estimators and pipelines as diagrams when they are displayed in a cell
set_config(display='diagram')


## **Load Data**

In [31]:
# Read the sales-prediction CSV into a DataFrame and preview it
csv_path = '/content/sales_predictions - sales_predictions (5).csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.300,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.1380
1,DRC01,5.920,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.500,Low Fat,0.016760,Meat,141.6180,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.2700
3,FDX07,19.200,Regular,0.000000,Fruits and Vegetables,182.0950,OUT010,1998,,Tier 3,Grocery Store,732.3800
4,NCD19,8.930,Low Fat,0.000000,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052
...,...,...,...,...,...,...,...,...,...,...,...,...
8518,FDF22,6.865,Low Fat,0.056783,Snack Foods,214.5218,OUT013,1987,High,Tier 3,Supermarket Type1,2778.3834
8519,FDS36,8.380,Regular,0.046982,Baking Goods,108.1570,OUT045,2002,,Tier 2,Supermarket Type1,549.2850
8520,NCJ29,10.600,Low Fat,0.035186,Health and Hygiene,85.1224,OUT035,2004,Small,Tier 2,Supermarket Type1,1193.1136
8521,FDN46,7.210,Regular,0.145221,Snack Foods,103.1332,OUT018,2009,Medium,Tier 3,Supermarket Type2,1845.5976


In [32]:
# Work on an independent copy so the raw DataFrame `df` stays untouched
eda_ml = df.copy(deep=True)

## Duplication

In [33]:
# Remove fully duplicated rows (the first occurrence of each row is kept)
eda_ml = eda_ml.drop_duplicates()

In [34]:
# Count the duplicated rows that remain after the drop
eda_ml.duplicated().sum()

0

In [35]:
# Summarize column dtypes and non-null counts to spot missing values
eda_ml.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8523 entries, 0 to 8522
Data columns (total 12 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Item_Identifier            8523 non-null   object 
 1   Item_Weight                7060 non-null   float64
 2   Item_Fat_Content           8523 non-null   object 
 3   Item_Visibility            8523 non-null   float64
 4   Item_Type                  8523 non-null   object 
 5   Item_MRP                   8523 non-null   float64
 6   Outlet_Identifier          8523 non-null   object 
 7   Outlet_Establishment_Year  8523 non-null   int64  
 8   Outlet_Size                6113 non-null   object 
 9   Outlet_Location_Type       8523 non-null   object 
 10  Outlet_Type                8523 non-null   object 
 11  Item_Outlet_Sales          8523 non-null   float64
dtypes: float64(4), int64(1), object(7)
memory usage: 865.6+ KB


In [36]:
# Standardize the inconsistent spellings of the fat-content labels.
# The mapping is applied to Item_Fat_Content only, so an identical string in
# any other column can never be rewritten by accident, and the column is
# scanned once instead of running three frame-wide replacements.
fat_content_fixes = {"LF": "Low Fat", "low fat": "Low Fat", "reg": "Regular"}
eda_ml["Item_Fat_Content"] = eda_ml["Item_Fat_Content"].replace(fat_content_fixes)

# Check which labels remain after the cleanup
eda_ml["Item_Fat_Content"].value_counts()

Low Fat    5517
Regular    3006
Name: Item_Fat_Content, dtype: int64

In [37]:
# Descriptive statistics for all columns (numeric and categorical)
eda_ml.describe(include='all')

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
count,8523,7060.0,8523,8523.0,8523,8523.0,8523,8523.0,6113,8523,8523,8523.0
unique,1559,,2,,16,,10,,3,3,4,
top,FDW13,,Low Fat,,Fruits and Vegetables,,OUT027,,Medium,Tier 3,Supermarket Type1,
freq,10,,5517,,1232,,935,,2793,3350,5577,
mean,,12.857645,,0.066132,,140.992782,,1997.831867,,,,2181.288914
std,,4.643456,,0.051598,,62.275067,,8.37176,,,,1706.499616
min,,4.555,,0.0,,31.29,,1985.0,,,,33.29
25%,,8.77375,,0.026989,,93.8265,,1987.0,,,,834.2474
50%,,12.6,,0.053931,,143.0128,,1999.0,,,,1794.331
75%,,16.85,,0.094585,,185.6437,,2004.0,,,,3101.2964


## **Split Data**

In [38]:
# Separate the prediction target from the feature matrix.
# NOTE(review): Item_Identifier (1559 unique values per the describe() output)
# stays in X and is an object column, so it will be one-hot encoded downstream
# - confirm that this is intended.
target = 'Item_Outlet_Sales'
y = eda_ml[target]
X = eda_ml.drop(columns=[target])




In [39]:
# Split into train and test sets. No test_size is passed, so scikit-learn's
# default proportions apply; the fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)


In [40]:
## The target is Item_Outlet_Sales, the value we are trying to predict.

## Preprocessing 

In [41]:
# Column selectors: choose features by dtype so that numeric and categorical
# columns can each be routed to their own pipeline
num_selector = make_column_selector(dtype_include='number')
cat_selector = make_column_selector(dtype_include='object')



## Transformers

In [42]:
# Imputers: the most frequent value is used for the categorical pipeline,
# the column mean for the numeric pipeline
freq_imputer = SimpleImputer(strategy='most_frequent')
mean_imputer = SimpleImputer(strategy='mean')

# Scaler for the numeric features
scaler = StandardScaler()

# One-hot encoder that returns a dense array and ignores categories that were
# not seen during fit.
# scikit-learn 1.2 renamed `sparse` to `sparse_output` and 1.4 removed the old
# keyword, so try the new name first and fall back for older installations.
try:
    ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    ohe = OneHotEncoder(sparse=False, handle_unknown='ignore')

- Nominal features are one-hot encoded
- Numeric features are scaled

##  Numeric Pipeline

In [43]:
# Numeric pipeline: impute missing values with the mean, then standardize
numeric_steps = (mean_imputer, scaler)
numeric_pipe = make_pipeline(*numeric_steps)
numeric_pipe

In [44]:
# Categorical pipeline: impute missing values with the most frequent value,
# then one-hot encode
categorical_steps = (freq_imputer, ohe)
categorical_pipe = make_pipeline(*categorical_steps)
categorical_pipe

## **Instantiate Column Transformer**

In [45]:
# (pipeline, column selector) pairs for the column transformer
cat_tuple = (categorical_pipe, cat_selector)
num_tuple = (numeric_pipe, num_selector)

## ColumnTransformer

In [46]:
# Combine both pipelines; columns matched by neither selector are passed
# through unchanged
preprocessor = make_column_transformer(
    num_tuple,
    cat_tuple,
    remainder='passthrough',
)
preprocessor

## Fit and Transform Data

In [47]:
# Fit the preprocessor on the training split only, so the imputers, scaler and
# encoder learn nothing from X_test
preprocessor.fit(X_train)


In [48]:
# Apply the already-fitted preprocessor to the training and the test split
X_train_transformed, X_test_transformed = (
    preprocessor.transform(split) for split in (X_train, X_test)
)

In [49]:
## Data leakage should be avoided here: the preprocessor was not fit in the ColumnTransformer cell; it was fit only on X_train and then used to transform both X_train and X_test.