<a href="https://colab.research.google.com/github/OsamaAwniHamdan/Prediction-of-Product-Sales/blob/main/Project_part_5.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Project 1 - Part 5 (Core)
  - Osama Hamdan

## Mount Google Drive

In [1]:
# Mount Google Drive into the Colab filesystem so files stored there can be read below.

from google.colab import drive
drive.mount('/content/drive/')


Mounted at /content/drive/


In [2]:
# Path to the sales CSV inside the mounted Drive; the mount cell above must have run first.
fpath = '/content/drive/MyDrive/CodingDojo/01-Fundamentals/Week02/Data/sales_predictions_2023.csv'

## Load dataset

In [3]:
import pandas as pd
# Read the sales CSV into a DataFrame.
df = pd.read_csv(fpath)

# Preview the first rows as a sanity check on the load.
df.head()

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27
3,FDX07,19.2,Regular,0.0,Fruits and Vegetables,182.095,OUT010,1998,,Tier 3,Grocery Store,732.38
4,NCD19,8.93,Low Fat,0.0,Household,53.8614,OUT013,1987,High,Tier 3,Supermarket Type1,994.7052


## Check Duplicates

In [4]:
# keep=False marks every member of a duplicated group, so this counts all rows that have an exact copy.
df.duplicated(keep=False).sum()

0

## Column types

In [5]:
# Show the dtype pandas inferred for each column.
df.dtypes

Item_Identifier               object
Item_Weight                  float64
Item_Fat_Content              object
Item_Visibility              float64
Item_Type                     object
Item_MRP                     float64
Outlet_Identifier             object
Outlet_Establishment_Year      int64
Outlet_Size                   object
Outlet_Location_Type          object
Outlet_Type                   object
Item_Outlet_Sales            float64
dtype: object

In [6]:
# Print a frequency table for every object-typed column to spot inconsistent labels.
cat_cols = df.select_dtypes(include='object').columns
for col, values in df[cat_cols].items():
  print(values.value_counts())

Item_Identifier
FDW13    10
FDG33    10
NCY18     9
FDD38     9
DRE49     9
         ..
FDY43     1
FDQ60     1
FDO33     1
DRF48     1
FDC23     1
Name: count, Length: 1559, dtype: int64
Item_Fat_Content
Low Fat    5089
Regular    2889
LF          316
reg         117
low fat     112
Name: count, dtype: int64
Item_Type
Fruits and Vegetables    1232
Snack Foods              1200
Household                 910
Frozen Foods              856
Dairy                     682
Canned                    649
Baking Goods              648
Health and Hygiene        520
Soft Drinks               445
Meat                      425
Breads                    251
Hard Drinks               214
Others                    169
Starchy Foods             148
Breakfast                 110
Seafood                    64
Name: count, dtype: int64
Outlet_Identifier
OUT027    935
OUT013    932
OUT049    930
OUT046    930
OUT035    930
OUT045    929
OUT018    928
OUT017    926
OUT010    555
OUT019    528
Name: count, dt

### Standardize the Item_Fat_Content categories

In [7]:
# Map the spelling variants found in the value counts above onto the two canonical labels.
fat_content_fixes = {
    'LF': 'Low Fat',
    'low fat': 'Low Fat',
    'reg': 'Regular',
}
df['Item_Fat_Content'] = df['Item_Fat_Content'].replace(fat_content_fixes)


In [8]:
# Print a frequency table for every numeric column as a quick check for odd values.
num_cols = df.select_dtypes(include='number').columns
for col, values in df[num_cols].items():
  print(values.value_counts())

Item_Weight
12.150    86
17.600    82
13.650    77
11.800    76
15.100    68
          ..
7.275      2
7.685      1
9.420      1
6.520      1
5.400      1
Name: count, Length: 415, dtype: int64
Item_Visibility
0.000000    526
0.076975      3
0.162462      2
0.076841      2
0.073562      2
           ... 
0.013957      1
0.110460      1
0.124646      1
0.054142      1
0.044878      1
Name: count, Length: 7880, dtype: int64
Item_MRP
172.0422    7
170.5422    6
196.5084    6
188.1872    6
142.0154    6
           ..
97.3384     1
83.1934     1
96.6752     1
152.6682    1
75.4670     1
Name: count, Length: 5938, dtype: int64
Outlet_Establishment_Year
1985    1463
1987     932
1999     930
1997     930
2004     930
2002     929
2009     928
2007     926
1998     555
Name: count, dtype: int64
Item_Outlet_Sales
958.7520     17
1342.2528    16
703.0848     15
1845.5976    15
1278.3360    14
             ..
4124.6310     1
6622.7126     1
1614.5650     1
5602.7070     1
2778.3834     1
Name: co

- Numeric columns seem fine

## Imports

In [13]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
from sklearn import set_config
# Ask scikit-learn transformers to return pandas DataFrames instead of NumPy arrays.
set_config(transform_output='pandas')

## Split train and test datasets

In [35]:
# Features are every column except the target and the Item_Identifier column.
target = 'Item_Outlet_Sales'
X = df.drop(columns=['Item_Identifier', target])
y = df[target]

# A fixed random_state keeps the train/test partition reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
)

## Numeric Tuple

In [36]:
# Numeric branch of the preprocessor: fill gaps with the column median, then standardize.
num_cols = X_train.select_dtypes(include='number').columns
imputer, scaler = SimpleImputer(strategy='median'), StandardScaler()
pipeline = make_pipeline(imputer, scaler)

# (name, transformer, columns) triple consumed later by the ColumnTransformer.
num_tuple = ('numeric', pipeline, num_cols)

## Ordinal Tuple

In [37]:
# NOTE(review): on a fresh top-to-bottom run this displays the Index of all object columns
# assigned in the categorical value-counts cell; the four-item list is only assigned later,
# in the "Categorical Tuple" cell. Confirm which of the two this cell is meant to show.
cat_cols

['Item_Fat_Content', 'Item_Type', 'Outlet_Identifier', 'Outlet_Type']

In [38]:
# Ordinal branch of the preprocessor: columns whose labels have a low-to-high order.
ordinal_cols = ['Outlet_Location_Type', 'Outlet_Size']

# One ordered label list per column, in the same order as ordinal_cols.
outlet_types = ['Tier 1', 'Tier 2', 'Tier 3']  # order for Outlet_Location_Type
outlet_sizes = ['Small', 'Medium', 'High']     # order for Outlet_Size
ord_orders = [outlet_types, outlet_sizes]

# Missing entries are filled with the placeholder 'NA'. That label is not in the
# ordered lists, so the encoder treats it as unknown and emits unknown_value (-1).
imputer = SimpleImputer(fill_value='NA', strategy='constant')
ord_encoder = OrdinalEncoder(
    categories=ord_orders,
    handle_unknown='use_encoded_value',
    unknown_value=-1,
)
scaler = StandardScaler()
pipeline = make_pipeline(imputer, ord_encoder, scaler)

# (name, transformer, columns) triple consumed later by the ColumnTransformer.
ordinal_tuple = ('ordinal', pipeline, ordinal_cols)

## Categorical Tuple

In [39]:
# Nominal branch of the preprocessor: unordered categories are one-hot encoded.
cat_cols = [
    'Item_Fat_Content',
    'Item_Type',
    'Outlet_Identifier',
    'Outlet_Type',
]

# Fill missing entries with the placeholder 'NA' before encoding; dense output,
# and unknown categories are ignored by the encoder (handle_unknown='ignore').
imputer = SimpleImputer(fill_value='NA', strategy='constant')
cat_encoder = OneHotEncoder(handle_unknown='ignore', sparse_output=False)
pipeline = make_pipeline(imputer, cat_encoder)

# (name, transformer, columns) triple consumed later by the ColumnTransformer.
cat_tuple = ('categorical', pipeline, cat_cols)

## Column Transformer

In [40]:
# Combine the three branches; output columns keep their plain names (no branch prefix).
col_trans = ColumnTransformer(
    transformers=[num_tuple, ordinal_tuple, cat_tuple],
    verbose_feature_names_out=False,
)

In [41]:
# Fit the preprocessor on the training split only, then apply the fitted
# transformer to the test split.
X_train_trns = col_trans.fit_transform(X_train)
X_test_trns = col_trans.transform(X_test)

In [42]:
# Preview the transformed training features.
X_train_trns.head()

Unnamed: 0,Item_Weight,Item_Visibility,Item_MRP,Outlet_Establishment_Year,Outlet_Location_Type,Outlet_Size,Item_Fat_Content_Low Fat,Item_Fat_Content_Regular,Item_Type_Baking Goods,Item_Type_Breads,...,Outlet_Identifier_OUT019,Outlet_Identifier_OUT027,Outlet_Identifier_OUT035,Outlet_Identifier_OUT045,Outlet_Identifier_OUT046,Outlet_Identifier_OUT049,Outlet_Type_Grocery Store,Outlet_Type_Supermarket Type1,Outlet_Type_Supermarket Type2,Outlet_Type_Supermarket Type3
4776,0.827485,-0.712775,1.828109,1.327849,1.084948,0.748125,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
7510,0.566644,-1.291052,0.603369,1.327849,1.084948,0.748125,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
5828,-0.121028,1.813319,0.244541,0.136187,-1.384777,0.748125,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
5327,-1.158464,-1.004931,-0.952591,0.732018,-0.149914,-0.26437,1.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4810,1.53887,-0.965484,-0.33646,0.493686,-0.149914,-1.276865,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0
