In [1]:
# 1 - Confirming I have pandas

import pandas as pd
print(pd.__version__)

2.2.2


In [None]:
from random import randint
from datetime import date

def create_sales_data_with_duplicates(file_name):
    header_row = ['product_id', 'customer_id', 'customer_age', 'sales_date', 'quantity', 'price']
    data_rows = []
    
    for i in range(65, 91):  # for 26 letters to make it 26 products
        
        # Randomly introduce missing values (with about 10% chance for each field)
        product_id = f'Product {chr(i)}' 
        customer_id = f'CUST {(i+2)%70 + 1}' 
        sales_date = f'{date(2026,1,(i-60))}' 
        quantity = f'{randint(10, 200)}' 
        price = f'{randint(500,10000)}' 
        customer_age = f'{randint(18,90)}' 

        details = [product_id, customer_id, customer_age, sales_date, quantity, price]
        data_rows.append(details)

    # Adding a few some duplicate rows
    for _ in range(7):  # Add 7 duplicate rows
        duplicate_row = data_rows[randint(0, len(data_rows) - 1)]
        data_rows.append(duplicate_row.copy())
    
    df = pd.DataFrame(data_rows, columns=header_row)
    df.to_csv(file_name, index=False)
    print(f"\n{file_name} creation done.")

create_sales_data_with_duplicates('sales_data_4.csv')


sales_data_4.csv creation done.


In [3]:
# 2 - Load the Data

def load_data(file_path):
    try:
        df = pd.read_csv(file_path)
        print("Data loaded successfully.")
        return df
    except FileNotFoundError:
        print(f"Error: File not found at {file_path}")
        return None


In [4]:
# Checking if the duplicates were created
df = load_data('sales_data_4.csv')

# Check for duplicates
print("\n--- Checking for Duplicates ---")
print(f"Number of duplicate rows: {df.duplicated().sum()}")
print(f"Are there any duplicates? {df.duplicated().any()}")

# Show the duplicate rows
if df.duplicated().any():
    print("\nDuplicate rows:")
    print(df[df.duplicated()])

Data loaded successfully.

--- Checking for Duplicates ---
Number of duplicate rows: 7
Are there any duplicates? True

Duplicate rows:
   product_id customer_id  customer_age  sales_date  quantity  price
26  Product T     CUST 17            19  2026-01-24        94   7939
27  Product K      CUST 8            26  2026-01-15       118   9043
28  Product U     CUST 18            55  2026-01-25        26   8364
29  Product K      CUST 8            26  2026-01-15       118   9043
30  Product T     CUST 17            19  2026-01-24        94   7939
31  Product P     CUST 13            29  2026-01-20       182   6408
32  Product F      CUST 3            35  2026-01-10       143   5922


In [5]:
# 3 - Implement Duplicate Removal Function

def remove_duplicates(dataframe):
    initial_shape = dataframe.shape
    df = dataframe.drop_duplicates()
    final_shape = dataframe.shape
    print(f"Removed {initial_shape[0] - final_shape[0]} duplicate rows.")
    return df


In [6]:
# 4 - Integrating dataframe into the Data Cleaning Pipeline and dropping duplicate rows

def load_and_clean_data(file_path):
    df = load_data(file_path)
    if df is not None:
        df = remove_duplicates(df)
        return df
    else:
        return None
        

In [7]:
# 5 - Testing the Implementation

file_path = 'sales_data_4.csv'
df = load_and_clean_data(file_path)

if df is not None:
    print("Cleaned DataFrame shape:", df.shape) # Checking if the duplicates were removed from df
    print(df.head())
else:
    print("Failed to load and clean data.")

Data loaded successfully.
Removed 0 duplicate rows.
Cleaned DataFrame shape: (26, 6)
  product_id customer_id  customer_age  sales_date  quantity  price
0  Product A     CUST 68            53  2026-01-05       152    608
1  Product B     CUST 69            68  2026-01-06        75   7974
2  Product C     CUST 70            48  2026-01-07        72   5918
3  Product D      CUST 1            69  2026-01-08        79   7449
4  Product E      CUST 2            43  2026-01-09        66   6590


In [9]:
# Saving it to a file

df.to_csv('sales_data_4.csv')

In [10]:
# Checking if the duplicates were removed
df = load_data('sales_data_4.csv')

# Check for duplicates
print("\n--- Checking for Duplicates ---")
print(f"Number of duplicate rows: {df.duplicated().sum()}")
print(f"Are there any duplicates? {df.duplicated().any()}")

# Show the duplicate rows
if df.duplicated().any():
    print("\nDuplicate rows:")
    print(df[df.duplicated()])

Data loaded successfully.

--- Checking for Duplicates ---
Number of duplicate rows: 0
Are there any duplicates? False
