In [13]:
#importing the common libraries

import pandas as pd
import numpy as np
from sklearn.preprocessing import OrdinalEncoder
import warnings
warnings.filterwarnings('ignore')

In [14]:
#loading the dataframe and renaming it

impute_data = pd.read_csv('Retail-data.csv')

#checking the features of the dataframe

pd.set_option("display.max_columns",500)
impute_data.sample(5)

Unnamed: 0,hour,order_id,customer_id,product_id,product_price,variant_id,orders,gross_sales,discounts,returns,net_sales,shipping,taxes,total_sales,ordered_item_quantity,net_quantity,returned_item_quantity,order_name,product_type,product_title,product_vendor,variant_title,api_client_title
90095,19-04-2022 12:00,4727090000000.0,5409180000000.0,0.0,1.73,0.0,1,2.86,0.0,0.0,2.86,0.0,0.6,3.46,1,1,0,#18279,,Fresh Green chilli (175 g),,,Point of Sale
92832,28-04-2022 19:00,4738730000000.0,5268890000000.0,6739230000000.0,9.99,40007500000000.0,1,18.34,0.0,0.0,18.34,0.0,1.64,19.98,1,1,0,#18787,Rice & Rice products,Annam Sona Masoori Rice,Annam,5 kg,Point of Sale
18899,10-08-2021 18:00,3983830000000.0,5283840000000.0,5931100000000.0,1.99,40016800000000.0,1,3.64,-0.18,0.0,3.46,0.0,0.32,3.78,1,1,0,#4613,Sweets & Snacks,Parle G Biscuits - Gluco,Parle,200 g,Point of Sale
49060,04-12-2021 15:00,4584020000000.0,5313990000000.0,6798360000000.0,3.99,40279900000000.0,1,7.32,0.0,0.0,7.32,0.0,0.66,7.98,1,1,0,#10669,"Pulses, Grains & Millets",TRS Moong Dal - Yellow,TRS,1 kg,Point of Sale
6840,19-06-2021 18:00,3876100000000.0,5280100000000.0,5932340000000.0,1.69,37053800000000.0,1,3.1,0.0,0.0,3.1,0.0,0.28,3.38,1,1,0,#2383,Masalas & Spices,Everest Chhole Masala,Everest,100 g,Point of Sale


In [15]:
#checking the datatypes of the features

impute_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 101929 entries, 0 to 101928
Data columns (total 23 columns):
 #   Column                  Non-Null Count   Dtype  
---  ------                  --------------   -----  
 0   hour                    101929 non-null  object 
 1   order_id                101929 non-null  float64
 2   customer_id             101929 non-null  float64
 3   product_id              101929 non-null  float64
 4   product_price           101929 non-null  float64
 5   variant_id              101929 non-null  float64
 6   orders                  101929 non-null  int64  
 7   gross_sales             101929 non-null  float64
 8   discounts               101929 non-null  float64
 9   returns                 101929 non-null  float64
 10  net_sales               101929 non-null  float64
 11  shipping                101929 non-null  float64
 12  taxes                   101929 non-null  float64
 13  total_sales             101929 non-null  float64
 14  ordered_item_quantit

In [16]:
#checking the null values under each feature

impute_data.isnull().sum()

hour                          0
order_id                      0
customer_id                   0
product_id                    0
product_price                 0
variant_id                    0
orders                        0
gross_sales                   0
discounts                     0
returns                       0
net_sales                     0
shipping                      0
taxes                         0
total_sales                   0
ordered_item_quantity         0
net_quantity                  0
returned_item_quantity        0
order_name                 5961
product_type              22861
product_title              6483
product_vendor            22366
variant_title             31316
api_client_title           5961
dtype: int64

In [17]:
# installation of facnyimpute package

!pip list

Package                       Version
----------------------------- --------------------
alabaster                     0.7.12
anaconda-client               1.11.0
anaconda-navigator            2.3.1
anaconda-project              0.11.1
anyio                         3.5.0
appdirs                       1.4.4
argon2-cffi                   21.3.0
argon2-cffi-bindings          21.2.0
arrow                         1.2.2
astroid                       2.11.7
astropy                       5.1
atomicwrites                  1.4.0
attrs                         21.4.0
Automat                       20.2.0
autopep8                      1.6.0
Babel                         2.9.1
backcall                      0.2.0
backports.functools-lru-cache 1.6.4
backports.tempfile            1.0
backports.weakref             1.0.post1
bcrypt                        3.2.0
beautifulsoup4                4.11.1
binaryornot                   0.4.4
bitarray                      2.5.1
bkcharts                      0.2
blac

In [20]:
# calling the IterativeImputer

from fancyimpute import IterativeImputer

In [21]:
#instantiate both packages to use

encoder = OrdinalEncoder()
imputer = IterativeImputer()

# create a list of categorical columns to iterate over

cat_cols = ['hour','order_name','product_type','product_title','product_vendor','variant_title','api_client_title']

def encode(data):
    '''function to encode non-null data and replace it in the original data'''
    
    #retains only non-null values
    nonulls = np.array(data.dropna())
    
    #reshapes the data for encoding
    impute_reshape = nonulls.reshape(-1,1)
    
    #encode data
    impute_ordinal = encoder.fit_transform(impute_reshape)
    
    #Assign back encoded values to non-null values
    data.loc[data.notnull()] = np.squeeze(impute_ordinal)
    return data

#create a for loop to iterate through each column in the data
for columns in cat_cols:
    encode(impute_data[columns])

In [25]:
# impute data and convert 
encode_data = pd.DataFrame(np.round(imputer.fit_transform(impute_data)),columns = impute_data.columns)

In [26]:
encode_data.sample(5)

Unnamed: 0,hour,order_id,customer_id,product_id,product_price,variant_id,orders,gross_sales,discounts,returns,net_sales,shipping,taxes,total_sales,ordered_item_quantity,net_quantity,returned_item_quantity,order_name,product_type,product_title,product_vendor,variant_title,api_client_title
16893,180.0,3962590000000.0,5283690000000.0,6860970000000.0,2.0,40437700000000.0,1.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,1.0,1.0,0.0,13651.0,10.0,4063.0,97.0,55.0,2.0
19129,3066.0,3986840000000.0,5278100000000.0,6734100000000.0,4.0,39984500000000.0,1.0,25.0,0.0,0.0,25.0,0.0,2.0,27.0,3.0,3.0,0.0,14049.0,3.0,3319.0,62.0,82.0,2.0
81047,5536.0,4696510000000.0,5273590000000.0,6739200000000.0,3.0,40007300000000.0,1.0,6.0,0.0,0.0,6.0,0.0,1.0,7.0,1.0,1.0,0.0,7198.0,1.0,323.0,13.0,8.0,2.0
58006,1742.0,4622520000000.0,6024200000000.0,5913170000000.0,3.0,36985300000000.0,1.0,5.0,0.0,0.0,5.0,0.0,0.0,6.0,1.0,1.0,0.0,2546.0,10.0,118.0,9.0,55.0,1.0
1748,7986.0,3834480000000.0,5273800000000.0,6723260000000.0,1.0,39943500000000.0,1.0,2.0,0.0,0.0,2.0,0.0,0.0,2.0,1.0,1.0,0.0,4696.0,2.0,1978.0,173.0,108.0,2.0


In [28]:
encode_data.isnull().sum()

hour                      0
order_id                  0
customer_id               0
product_id                0
product_price             0
variant_id                0
orders                    0
gross_sales               0
discounts                 0
returns                   0
net_sales                 0
shipping                  0
taxes                     0
total_sales               0
ordered_item_quantity     0
net_quantity              0
returned_item_quantity    0
order_name                0
product_type              0
product_title             0
product_vendor            0
variant_title             0
api_client_title          0
dtype: int64