# Importing Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import make_column_selector, make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer

# Loading the data

In [2]:
# Relative location of the cereal CSV (the one containing missing values).
path = 'DataFiles/Cereal with missing values.xlsx - Sheet 1 - cereal.csv'

# Parse the file into a DataFrame, then preview its first five rows.
df = pd.read_csv(filepath_or_buffer=path)
df.head(5)

Unnamed: 0,name,Manufacturer,type,calories per serving,grams of protein,grams of fat,milligrams of sodium,grams of dietary fiber,grams of complex carbohydrates,grams of sugars,milligrams of potassium,vitamins and minerals (% of FDA recommendation),Display shelf,Weight in ounces per one serving,Number of cups in one serving,Rating of cereal
0,Apple Cinnamon Cheerios,General Mills,Cold,110.0,2,2.0,180.0,1.5,10.5,10.0,70,25.0,1,1.0,0.75,29.509541
1,Basic 4,General Mills,Cold,130.0,3,2.0,,2.0,18.0,,100,25.0,3,1.33,0.75,37.038562
2,Cheerios,General Mills,Cold,,6,2.0,290.0,2.0,17.0,1.0,105,25.0,1,1.0,1.25,50.764999
3,Cinnamon Toast Crunch,General Mills,Cold,120.0,1,3.0,210.0,0.0,13.0,9.0,45,25.0,2,1.0,0.75,19.823573
4,Clusters,General Mills,Cold,110.0,3,2.0,140.0,2.0,13.0,7.0,105,25.0,3,1.0,0.5,40.400208


In [3]:
# Summarise each column's dtype and non-null count to spot columns with missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 77 entries, 0 to 76
Data columns (total 16 columns):
 #   Column                                           Non-Null Count  Dtype  
---  ------                                           --------------  -----  
 0   name                                             77 non-null     object 
 1   Manufacturer                                     77 non-null     object 
 2   type                                             68 non-null     object 
 3   calories per serving                             70 non-null     float64
 4   grams of protein                                 77 non-null     int64  
 5   grams of fat                                     69 non-null     float64
 6   milligrams of sodium                             76 non-null     float64
 7   grams of dietary fiber                           77 non-null     float64
 8   grams of complex carbohydrates                   77 non-null     float64
 9   grams of sugars                   

In [4]:
# Count rows that are exact duplicates of an earlier row (all columns equal).
df.duplicated().sum()

0

# What's the question?
We want to predict calories per serving based on Manufacturer, cereal type, grams of fat, grams of sugars, and weight in ounces per serving.
Let's set up X and y. Before we do, we need to drop the rows where calories per serving is null: that column is our target, and we will not impute missing target values.

In [5]:
# Drop rows whose target is missing; the target is never imputed.
# Reassigning instead of using inplace=True avoids mutating the frame object
# that earlier cells displayed, and the cell stays safe to re-run.
df = df.dropna(subset=['calories per serving'])

# Sanity check: number of missing target values that remain (expected: 0).
df['calories per serving'].isna().sum()

0

All set. Now we can create our X and y.

In [6]:
# Predictor columns listed in the question above.
feature_columns = [
    'Manufacturer',
    'type',
    'grams of fat',
    'grams of sugars',
    'Weight in ounces per one serving',
]

# Feature matrix and target vector.
X = df[feature_columns]
y = df['calories per serving']

# Preview the first rows of the feature matrix.
X.head()

Unnamed: 0,Manufacturer,type,grams of fat,grams of sugars,Weight in ounces per one serving
0,General Mills,Cold,2.0,10.0,1.0
1,General Mills,Cold,2.0,,1.33
3,General Mills,Cold,3.0,9.0,1.0
4,General Mills,Cold,2.0,7.0,1.0
5,General Mills,Cold,1.0,13.0,1.0


# Pre-Processing

In [7]:
# Inspect dtypes and non-null counts of the selected features to decide what needs imputing/encoding.
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 70 entries, 0 to 76
Data columns (total 5 columns):
 #   Column                            Non-Null Count  Dtype  
---  ------                            --------------  -----  
 0   Manufacturer                      70 non-null     object 
 1   type                              62 non-null     object 
 2   grams of fat                      62 non-null     float64
 3   grams of sugars                   62 non-null     float64
 4   Weight in ounces per one serving  70 non-null     float64
dtypes: float64(3), object(2)
memory usage: 3.3+ KB


## Feature Identification
We have 2 object (categorical) columns and the rest are floats. We also have some missing values to impute.

In [8]:
# Split before fitting any preprocessing so the transformer is later fitted on
# the training rows only. random_state pins the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Preview a few rows rather than dumping the entire training frame.
X_train.head(10)

Unnamed: 0,Manufacturer,type,grams of fat,grams of sugars,Weight in ounces per one serving
63,Quaker Oats,Cold,0.0,0.0,0.5
52,Post,,1.0,12.0,1.0
18,General Mills,Cold,1.0,3.0,1.0
37,Kelloggs,Cold,2.0,7.0,1.33
46,Nabisco,,0.0,0.0,0.83
31,Kelloggs,Cold,0.0,7.0,1.0
8,General Mills,Cold,1.0,9.0,1.0
59,Quaker Oats,Cold,5.0,8.0,1.0
44,Kelloggs,Cold,0.0,3.0,1.0
48,Nabisco,Cold,0.0,0.0,1.0


## Creating my Instances
To be used later in the transformers

In [9]:
# Creating the Column Selectors
# Each selector picks columns by dtype when the column transformer is fitted.
num_selector = make_column_selector(dtype_include = 'number')
cat_selector = make_column_selector(dtype_include = 'object')

# Creating the Imputers:
# Mean for numeric columns, most frequent value for categorical columns.
mean_imputer = SimpleImputer(strategy = 'mean')
freq_imputer = SimpleImputer(strategy = 'most_frequent')

# Creating the Scaler and OHE:
# scikit-learn 1.2 renamed OneHotEncoder's `sparse` argument to `sparse_output`
# and 1.4 removed the old name. Try the current keyword first and fall back to
# the legacy one so the cell runs on both old and new installations.
# handle_unknown='ignore' encodes categories unseen during fit as all zeros.
try:
    ohe = OneHotEncoder(sparse_output = False, handle_unknown = 'ignore')
except TypeError:
    # scikit-learn < 1.2 does not know `sparse_output`.
    ohe = OneHotEncoder(sparse = False, handle_unknown = 'ignore')
scaler = StandardScaler()

## Combining the Instances

In [10]:
# Per-dtype pipelines: impute first, then scale (numeric) or one-hot encode (categorical).
num_pipeline = make_pipeline(mean_imputer, scaler)
cat_pipeline = make_pipeline(freq_imputer, ohe)

# Pair each pipeline with the selector that chooses its columns.
num_tuple = (num_pipeline, num_selector)
cat_tuple = (cat_pipeline, cat_selector)

# Single transformer applying both pipelines; any column matched by neither
# selector is passed through untouched.
col_transformer = make_column_transformer(
    num_tuple,
    cat_tuple,
    remainder='passthrough',
)

## Transforming the data

In [11]:
# Fit on the training split only, so the test split does not influence the learned preprocessing.
col_transformer.fit(X_train)

In [12]:
# Apply the already-fitted transformer to the training split, then the test split.
X_train_processed, X_test_processed = (
    col_transformer.transform(split) for split in (X_train, X_test)
)

# Peek at the first five transformed training rows.
X_train_processed[:5]

array([[-1.00539366, -1.53835815, -3.52083059,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ],
       [-0.04103648,  1.19210391, -0.23976983,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-0.04103648, -0.85574264, -0.23976983,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.92332071,  0.05441138,  1.92573028,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-1.00539366, -1.53835815, -1.35533049,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ]])

## Displaying the data

In [13]:
# Remaining NaNs after preprocessing. ndarray.sum() already reduces over every
# axis, so one call is enough; the chained .sum().sum() is a DataFrame idiom
# that is redundant on a NumPy array.
print(f'{np.isnan(X_train_processed).sum()} missing values in training data')
print(f'{np.isnan(X_test_processed).sum()} missing values in test data')
print('\n')

# dtype of each processed array.
print(f'all data in X_train_processed are {X_train_processed.dtype}')
print(f'all data in X_test_processed are {X_test_processed.dtype}')
print('\n')

# Shapes: rows per split and number of columns after one-hot encoding.
print(f'shape of training data is {X_train_processed.shape}')
print(f'shape of testing data is {X_test_processed.shape}')
print('\n')

# Show the first ten processed training rows.
X_train_processed[0:10]

0 missing values in training data
0 missing values in test data


all data in X_train_processed are float64
all data in X_test_processed are float64


shape of training data is (52, 11)
shape of testing data is (18, 11)




array([[-1.00539366, -1.53835815, -3.52083059,  0.        ,  0.        ,
         0.        ,  0.        ,  1.        ,  0.        ,  1.        ,
         0.        ],
       [-0.04103648,  1.19210391, -0.23976983,  0.        ,  0.        ,
         0.        ,  1.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-0.04103648, -0.85574264, -0.23976983,  1.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [ 0.92332071,  0.05441138,  1.92573028,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-1.00539366, -1.53835815, -1.35533049,  0.        ,  0.        ,
         1.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.        ],
       [-1.00539366,  0.05441138, -0.23976983,  0.        ,  1.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  1.        ,
         0.   