# Step 1: Perform Consistency Checks on df_prods

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np

# Define the file path
file_path = r'C:\Users\Asus\Music\CareerFoundry_Python_Session\products.csv'

# Load the dataset
df_prods = pd.read_csv(file_path)

# Display the first few rows to confirm it's loaded correctly
print(df_prods.head())


   product_id                                       product_name  aisle_id  \
0           1                         Chocolate Sandwich Cookies        61   
1           2                                   All-Seasons Salt       104   
2           3               Robust Golden Unsweetened Oolong Tea        94   
3           4  Smart Ones Classic Favorites Mini Rigatoni Wit...        38   
4           5                          Green Chile Anytime Sauce         5   

   department_id  
0             19  
1             13  
2              7  
3              1  
4             13  


# 1.2 Check for Missing Values

In [2]:
# Check for missing values
missing_values_prods = df_prods.isnull().sum()
print(missing_values_prods)


product_id       0
product_name     0
aisle_id         0
department_id    0
dtype: int64


# 1.3 Check for Duplicate Values

In [3]:
# Check for duplicates
duplicate_prods = df_prods.duplicated().sum()
print(duplicate_prods)


0


# 1.4 Check Data Types

In [4]:
# Check data types
data_types_prods = df_prods.dtypes
print(data_types_prods)


product_id        int64
product_name     object
aisle_id          int64
department_id     int64
dtype: object


# Step 2: Run df.describe() on df_ords

In [7]:
# Import necessary libraries
import pandas as pd
import os

# Define the file path
file_path = r'C:\Users\Asus\Music\CareerFoundry_Python_Session\cleaned_orders_data.csv'

# Load the dataset
df_ords = pd.read_csv(file_path, index_col=False)

# Display the first 5 rows to confirm successful loading
print(df_ords.head())


   order_id  user_id eval_set  order_number  order_dow  order_hour_of_day  \
0   2539329        1    prior             1          2                  8   
1   2398795        1    prior             2          3                  7   
2    473747        1    prior             3          3                 12   
3   2254736        1    prior             4          4                  7   
4    431534        1    prior             5          4                 15   

   days_since_prior_order  product_id product_name  aisle_id  department_id  \
0                     NaN         NaN          NaN       NaN            NaN   
1                    15.0         NaN          NaN       NaN            NaN   
2                    21.0         NaN          NaN       NaN            NaN   
3                    29.0         NaN          NaN       NaN            NaN   
4                    28.0         NaN          NaN       NaN            NaN   

  department  
0        NaN  
1        NaN  
2        NaN  
3 

# 2.2 Run df.describe()

In [8]:
# Run df.describe()
describe_ords = df_ords.describe()
print(describe_ords)


           order_id       user_id  order_number     order_dow  \
count  3.421083e+06  3.421083e+06  3.421083e+06  3.421083e+06   
mean   1.710542e+06  1.029782e+05  1.715486e+01  2.776219e+00   
std    9.875817e+05  5.953372e+04  1.773316e+01  2.046829e+00   
min    1.000000e+00  1.000000e+00  1.000000e+00  0.000000e+00   
25%    8.552715e+05  5.139400e+04  5.000000e+00  1.000000e+00   
50%    1.710542e+06  1.026890e+05  1.100000e+01  3.000000e+00   
75%    2.565812e+06  1.543850e+05  2.300000e+01  5.000000e+00   
max    3.421083e+06  2.062090e+05  1.000000e+02  6.000000e+00   

       order_hour_of_day  days_since_prior_order    product_id      aisle_id  \
count       3.421083e+06            3.214874e+06  49688.000000  49688.000000   
mean        1.345202e+01            1.111484e+01  24844.500000     67.769582   
std         4.226088e+00            9.206737e+00  14343.834425     38.316162   
min         0.000000e+00            0.000000e+00      1.000000      1.000000   
25%         1.

# Step 3: Check for Mixed-Type Data

In [9]:
# Check for mixed-type data
mixed_type_columns = df_ords.columns[df_ords.apply(lambda col: col.apply(type).nunique() > 1)]
print(mixed_type_columns)


Index(['product_name', 'department'], dtype='object')


# Step 4: Fix Mixed-Type Data

In [10]:
# Convert mixed-type column to a consistent type
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].astype(str)


# Step 5: Check for Missing Values in df_ords

In [11]:
# Check for missing values
missing_values_ords = df_ords.isnull().sum()
print(missing_values_ords)

order_id                        0
user_id                         0
eval_set                        0
order_number                    0
order_dow                       0
order_hour_of_day               0
days_since_prior_order          0
product_id                3371395
product_name              3371395
aisle_id                  3371395
department_id             3371395
department                3371395
dtype: int64


# Step 6: Address Missing Values

In [13]:
# Fill missing values with 0 (assuming first orders have no prior orders)
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0)



# Step 7: Check for Duplicate Values

In [14]:
# Check for duplicate values
duplicate_ords = df_ords.duplicated().sum()
print(duplicate_ords)

0


# Step 8: Address Duplicate Values

In [15]:
# Remove duplicate values
df_ords.drop_duplicates(inplace=True)


# Step 9: Export the Final Cleaned Data

In [17]:
# Define export file path
export_path = r'C:\Users\Asus\Music\CareerFoundry_Python_Session\products_cleaned.csv'

# Save the cleaned data to a new CSV file
df_prods.to_csv(export_path, index=False)

print("Cleaned dataset saved successfully!")


Cleaned dataset saved successfully!
