# Importing libraries

In [1]:
# import libraries
import pandas as pd
import numpy as np
import os

# Reading CSV files

In [2]:
# Set the project root folder.
# The original local folder stays the default; setting the INSTACART_PATH
# environment variable overrides it, so the notebook can also run on machines
# where the project lives somewhere else.
path = os.environ.get('INSTACART_PATH') or r'C:\Users\Sreelakshmi\Desktop\Instacart Basket Analysis'

In [3]:
# Load the original products table (products.csv) into df_prods
prods_csv = os.path.join(path, 'Data', 'Original Data', 'products.csv')
df_prods = pd.read_csv(prods_csv, index_col=False)

In [4]:
# Load the previously wrangled orders table (orders_wrangled.csv) into df_ords
ords_csv = os.path.join(path, 'Data', 'Prepared Data', 'orders_wrangled.csv')
df_ords = pd.read_csv(ords_csv, index_col=False)

# Finding and addressing mixed datatype

### Creating a Test Dataframe

In [5]:
# create an empty test dataframe; a mixed-type column is added to it in the next cell
df_test = pd.DataFrame()

In [7]:
# create a mixed type column: two strings, an integer and a boolean in the same column
df_test['mix'] = ['a', 'b', 1, True]

In [8]:
# preview the first rows of the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


### Function for checking mixed datatypes in all columns in a dataframe

In [11]:
# check for mixed types
def find_mixed_type_columns(df):
    """Return the names of the columns of `df` that hold more than one Python type.

    Each column is mapped to the Python type of every value; a column is
    reported when it contains at least two distinct types.

    Parameters
    ----------
    df : pandas.DataFrame
        The dataframe to inspect. An empty dataframe yields an empty list.

    Returns
    -------
    list
        Column labels, in column order, whose values are of mixed types.
    """
    mixed = []
    # Columns are addressed by position so that repeated column labels still
    # yield a single Series per column.
    for position, label in enumerate(df.columns.tolist()):
        if df.iloc[:, position].map(type).nunique() > 1:
            mixed.append(label)
    return mixed


# print the name of every mixed-type column in the test dataframe
for col in find_mixed_type_columns(df_test):
    print(col)

mix


### Addressing mixed datatype columns by changing its datatype

In [12]:
# change the datatype of column 'mix' to str so that all of its values share one type
df_test['mix'] = df_test['mix'].astype('str')

# Finding and addressing missing values

### Finding missing values in a DataFrame 

In [14]:
# count the missing values in each column of df_prods
df_prods.isnull().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [15]:
# Extract the records whose product_name is null
missing_name_mask = df_prods['product_name'].isnull()
df_nan = df_prods[missing_name_mask]

In [16]:
# display the records that have no product name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


### Addressing missing values

In [17]:
# (rows, columns) of df_prods before the records with a null product name are removed
df_prods.shape

(49693, 5)

In [18]:
# Keep only the records that have a product name (drops the null-name rows)
has_name_mask = df_prods['product_name'].notnull()
df_prods_clean = df_prods[has_name_mask]

In [19]:
# (rows, columns) of df_prods_clean, i.e. after the null product names were removed
df_prods_clean.shape

(49677, 5)

# Finding and addressing duplicates

### Finding duplicates

In [20]:
# Collect the rows of df_prods_clean that repeat an earlier, identical row
dup_row_mask = df_prods_clean.duplicated()
df_dups = df_prods_clean[dup_row_mask]

In [21]:
# display the duplicated product records
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


### Addressing duplicate values

In [22]:
# (rows, columns) of df_prods_clean before the duplicates are dropped
df_prods_clean.shape

(49677, 5)

In [25]:
# use the drop_duplicates() function to remove the duplicated rows;
# the result is stored in a new dataframe, df_prods_clean is left unchanged
df_prods_clean_no_dups = df_prods_clean.drop_duplicates()

In [24]:
# (rows, columns) of the products dataframe after the duplicates were dropped
df_prods_clean_no_dups.shape

(49672, 5)

# Exporting the cleaned dataframe

In [26]:
# Export the cleaned products dataframe to products_checked.csv.
# index=False keeps the row index out of the file, so it does not come back as
# an extra 'Unnamed: 0' column the next time the file is read.
df_prods_clean_no_dups.to_csv(os.path.join(path, 'Data','Prepared Data', 'products_checked.csv'), index=False)

# Task 4.5

## Step 2

In [27]:
# Running descriptive statistics on df_ords
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_days_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


#####  1. order_days_of_week:
Here min value is 0 and max value is 6, indicating the 7 days of the week from Monday to Sunday.
#####  2. order_hour_of_day:
Here the min value is 0 (12:00 am) and the max value is 23 (11:00 pm), covering the 24 hours of the day.
#####  3. days_since_prior_order:
Here the count is lower than in the other columns, indicating missing values. The min value is 0, meaning that more than one order was placed on the same day, and the max value is 30, meaning that there is a gap of 30 days (about one month) between two orders.
#####  4. order_id, user_id and order_number:
These columns need not be considered, as they hold numeric data used only for identification purposes.

# Step 3

In [28]:
#checking for mixed datatypes in df_ords
for col in df_ords.columns.tolist():
    wierd = (df_ords[[col]].map(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
    if len(df_ords[wierd]) > 0:
        print(col)

In [29]:
# list the datatype of each column in df_ords
df_ords.dtypes

Unnamed: 0                  int64
order_id                    int64
user_id                     int64
order_number                int64
order_days_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

### Answer: There are no mixed datatypes in df_ords dataframe.

# Step 5 and 6

In [30]:
# count the missing values in each column of df_ords
df_ords.isnull().sum()

Unnamed: 0                     0
order_id                       0
user_id                        0
order_number                   0
order_days_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

### Answer: There are 206209 missing values in the days_since_prior_order column

In [31]:
# Extract the records whose days_since_prior_order is null
missing_days_mask = df_ords['days_since_prior_order'].isnull()
df_ords_nan = df_ords[missing_days_mask]

In [32]:
# display the records that have no days_since_prior_order value
df_ords_nan

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,order_days_of_week,order_hour_of_day,days_since_prior_order
0,0,2539329,1,1,2,8,
11,11,2168274,2,1,2,11,
26,26,1374495,3,1,1,14,
39,39,3343014,4,1,6,11,
45,45,2717275,5,1,3,12,
...,...,...,...,...,...,...,...
3420930,3420930,969311,206205,1,4,12,
3420934,3420934,3189322,206206,1,3,18,
3421002,3421002,2166133,206207,1,6,19,
3421019,3421019,2227043,206208,1,1,15,


In [39]:
# compute the median of every column in df_ords;
# the value of interest here is the median of the days_since_prior_order column
df_ords.median()

Unnamed: 0                1710541.0
order_id                  1710542.0
user_id                    102689.0
order_number                   11.0
order_days_of_week              3.0
order_hour_of_day              13.0
days_since_prior_order          7.0
dtype: float64

In [42]:
# Impute the missing days_since_prior_order values with the column median.
# The median is computed from the data instead of being hard-coded; it evaluates
# to 7 for this dataset. Rationale from the analysis: most people tend to order
# a week later (normally after 7 days).
# NOTE(review): the previewed rows with a missing value all have order_number == 1,
# so they look like first orders with no prior order - confirm that imputing a
# value is appropriate for them.
days_since_prior_median = df_ords['days_since_prior_order'].median()
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(days_since_prior_median)

In [43]:
# recheck df_ords for missing values after the imputation
df_ords.isnull().sum()

Unnamed: 0                0
order_id                  0
user_id                   0
order_number              0
order_days_of_week        0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

### All missing values have been imputed with the median value 7, so no missing values remain in df_ords.

# Step 7 and 8

In [44]:
# Run a check on df_ords to find duplicate rows
df_ords_dups = df_ords[df_ords.duplicated()]
# show (rows, columns) of the result; zero rows means df_ords has no duplicates
df_ords_dups.shape

### Answer: There are no duplicates in the df_ords dataframe. If duplicates were present, I would use the df.drop_duplicates() function to remove them.

# Step 9

In [45]:
# Export df_ords to orders_checked.csv.
# index=False keeps the row index out of the file. df_ords already carries an
# 'Unnamed: 0' column from an earlier export that included the index; writing
# the index again would add one more such column on the next read.
df_ords.to_csv(os.path.join(path, 'Data','Prepared Data', 'orders_checked.csv'), index=False)