In [7]:
import pandas as pd
from random import randint, random
from datetime import date
import numpy as np

def create_sales_data_with_nans(file_name, missing_prob=0.1, seed=None):
    """Write a synthetic 26-row sales CSV in which fields are randomly missing.

    One row is produced per letter A-Z ('Product A' ... 'Product Z'). Every
    field of every row is independently replaced by a missing value with
    probability ``missing_prob``.

    Args:
        file_name: Destination path handed to ``DataFrame.to_csv``.
        missing_prob: Chance, between 0 and 1, that any single field is left
            empty. Defaults to 0.1, the rate that used to be hard-coded.
        seed: Optional seed for a private random generator so the same file
            can be regenerated. When None (the default) the module-level
            ``random``/``randint`` functions are used, as before.

    Raises:
        ValueError: If ``missing_prob`` is outside the range [0, 1].
    """
    # Function-scope import: the notebook only imports randint and random.
    from random import Random

    if not 0 <= missing_prob <= 1:
        raise ValueError(f"missing_prob must be between 0 and 1, got {missing_prob}")

    if seed is None:
        draw, draw_int = random, randint
    else:
        rng = Random(seed)
        draw, draw_int = rng.random, rng.randint

    def keep():
        # A field is kept when the draw exceeds the missing probability.
        # An earlier draft used choice([True, False]), which blanks roughly
        # half of all fields; a threshold gives control over the rate.
        return draw() > missing_prob

    header_row = ['product_id', 'customer_id', 'customer_age', 'sales_date', 'quantity', 'price']
    data_rows = []

    # ASCII codes 65..90 are the 26 capital letters, i.e. 26 products.
    for code in range(ord('A'), ord('Z') + 1):
        product_id = f'Product {chr(code)}' if keep() else None
        customer_id = f'CUST {(code + 2) % 70 + 1}' if keep() else None
        # code - 60 maps 65..90 onto days 5..30 of January 2026.
        sales_date = f'{date(2026, 1, code - 60)}' if keep() else None
        # Numbers are stored as text so a column that also holds NaN is not
        # turned into floats before it is written out.
        quantity = f'{draw_int(10, 200)}' if keep() else np.nan
        price = f'{draw_int(500, 10000)}' if keep() else np.nan
        customer_age = f'{draw_int(18, 90)}' if keep() else np.nan

        data_rows.append([product_id, customer_id, customer_age, sales_date, quantity, price])

    df = pd.DataFrame(data_rows, columns=header_row)
    df.to_csv(file_name, index=False)
    print(f"\n{file_name} creation done.")

# Build the demo CSV that the following cells load and clean.
create_sales_data_with_nans(file_name='sales_data_3.csv')


sales_data_3.csv creation done.


In [9]:
# 1 - Import Libraries and Load Data

import pandas as pd

# Read the generated CSV back into a DataFrame
sales_data = pd.read_csv(filepath_or_buffer='sales_data_3.csv')

# Preview the top five records
print(sales_data.head(n=5))

  product_id customer_id  customer_age  sales_date  quantity   price
0  Product A     CUST 68          72.0  2026-01-05      74.0  8133.0
1  Product B     CUST 69          56.0  2026-01-06      80.0  6413.0
2  Product C     CUST 70           NaN  2026-01-07      63.0  9971.0
3  Product D      CUST 1          54.0  2026-01-08      48.0  4194.0
4  Product E      CUST 2          88.0  2026-01-09      58.0     NaN


In [10]:
# 2 - Identify Missing Values

# Per-column count of missing entries
missing_values = sales_data.isnull().sum()

# The same counts expressed as a share of all rows, in percent
missing_percentage = missing_values.div(len(sales_data)).mul(100)

# Show the counts, a blank line, then the percentages
print(missing_values, missing_percentage, sep='\n\n')

product_id      0
customer_id     0
customer_age    2
sales_date      1
quantity        1
price           2
dtype: int64

product_id      0.000000
customer_id     0.000000
customer_age    7.692308
sales_date      3.846154
quantity        3.846154
price           7.692308
dtype: float64


In [11]:
# 3 - Remove Rows with Missing Values

# Report the size first, then drop every row holding at least one missing value.
# The result goes into a separate variable, so sales_data itself is unchanged.
print(f"Shape before dropping: {sales_data.shape}")
sales_data_dropped = sales_data.dropna(axis=0, how='any')
print(f"Shape after dropping: {sales_data_dropped.shape}")

# Bare shape of the reduced DataFrame
print(sales_data_dropped.shape)

Shape before dropping: (26, 6)
Shape after dropping: (21, 6)
(21, 6)


In [None]:
# 4 - Impute Missing Values with Mean/Median

# Median of the known ages; it is the fill value for the customer_age column.
median_age = sales_data['customer_age'].median()

# The fill must target customer_age, the column the median was computed from.
# Writing it into 'price' would leave the ages missing and put age-sized
# numbers into the price column.
# Direct assignment is used instead of the older
# sales_data['customer_age'].fillna(median_age, inplace=True) form.
sales_data['customer_age'] = sales_data['customer_age'].fillna(median_age)

# Equivalent DataFrame-level alternative:
# sales_data.fillna({'customer_age': median_age}, inplace=True)

# Verify that the missing values have been filled
print(sales_data['customer_age'].isna().sum())

0


In [27]:
# Sanity check: the fill above changed the in-memory sales_data (not the CSV file),
# while the row dropping produced a separate DataFrame, sales_data_dropped.

print(
    sales_data.shape,
    sales_data_dropped.shape,
    sales_data.head(),
    sales_data.isna().sum(),
    sep='\n\n',
)

(26, 6)

(21, 6)

  product_id customer_id  customer_age  sales_date  quantity   price
0  Product A     CUST 68          72.0  2026-01-05      74.0  8133.0
1  Product B     CUST 69          56.0  2026-01-06      80.0  6413.0
2  Product C     CUST 70          55.0  2026-01-07      63.0  9971.0
3  Product D      CUST 1          54.0  2026-01-08      48.0  4194.0
4  Product E      CUST 2          88.0  2026-01-09      58.0     NaN

product_id      0
customer_id     0
customer_age    0
sales_date      1
quantity        1
price           2
dtype: int64


In [None]:
# 5 - Impute Missing Values with Mode

# First entry of the mode result for quantity
mode_quantity = sales_data['quantity'].mode()[0]

# Write the mode only into the rows where quantity is missing
sales_data.loc[sales_data['quantity'].isna(), 'quantity'] = mode_quantity

# Confirm nothing is missing in quantity any more
print(sales_data['quantity'].isna().sum())

0


In [41]:
# Confirm the quantity fill: preview the rows, then the per-column missing counts
print(sales_data.head(), sales_data.isna().sum(), sep='\n\n')

  product_id customer_id  customer_age  sales_date  quantity   price
0  Product A     CUST 68          72.0  2026-01-05      74.0  8133.0
1  Product B     CUST 69          56.0  2026-01-06      80.0  6413.0
2  Product C     CUST 70          55.0  2026-01-07      63.0  9971.0
3  Product D      CUST 1          54.0  2026-01-08      48.0  4194.0
4  Product E      CUST 2          88.0  2026-01-09      58.0     NaN

product_id      0
customer_id     0
customer_age    0
sales_date      1
quantity        0
price           2
dtype: int64


In [43]:
# 6 - Impute Missing Values with Mean

# Mean of the known prices, rounded to zero decimals
mean_price = round(sales_data['price'].mean(), 0)

# Fill the gaps in price by direct column assignment
# (same result as sales_data.fillna({'price': mean_price}, inplace=True))
sales_data['price'] = sales_data['price'].fillna(mean_price)

# Confirm nothing is missing in price any more
print(sales_data['price'].isna().sum())

0


In [44]:
# Confirm the price fill: preview the rows, then the per-column missing counts
print(sales_data.head(), sales_data.isna().sum(), sep='\n\n')

  product_id customer_id  customer_age  sales_date  quantity   price
0  Product A     CUST 68          72.0  2026-01-05      74.0  8133.0
1  Product B     CUST 69          56.0  2026-01-06      80.0  6413.0
2  Product C     CUST 70          55.0  2026-01-07      63.0  9971.0
3  Product D      CUST 1          54.0  2026-01-08      48.0  4194.0
4  Product E      CUST 2          88.0  2026-01-09      58.0  5732.0

product_id      0
customer_id     0
customer_age    0
sales_date      1
quantity        0
price           0
dtype: int64


In [None]:
# 7 - Share of rows with a missing sales_date, to judge how much is missing

# Number of rows without a sales_date
missing_dates = sales_data['sales_date'].isna().sum()

# Same count as a percentage of all rows
percent = missing_dates / sales_data.shape[0] * 100

print(f'The percent of missing dates is: {percent}')

The percent of missing dates is: 3.8461538461538463


In [None]:
# 7b - Impute Missing Values with Mode

# First entry of the mode result for sales_date
mode_date = sales_data['sales_date'].mode()[0]

# Fill the gaps in sales_date by direct column assignment
# (same result as sales_data.fillna({'sales_date': mode_date}, inplace=True))
sales_data['sales_date'] = sales_data['sales_date'].fillna(mode_date)

# Confirm nothing is missing in sales_date any more
print(sales_data['sales_date'].isna().sum())

0


In [48]:
# 8 - Verify and Save Cleaned Data

# Per-column missing counts; every entry should now be zero
print(sales_data.isna().sum())

# Persist the cleaned DataFrame without the index column
sales_data.to_csv(path_or_buf='cleaned_sales_data.csv', index=False)

# Tell the reader where the result went
print('Cleaned data saved to cleaned_sales_data.csv')

product_id      0
customer_id     0
customer_age    0
sales_date      0
quantity        0
price           0
dtype: int64
Cleaned data saved to cleaned_sales_data.csv
