1. Load the dataset

In [1]:
import pandas as pd

df =  pd.read_csv('data.csv')

2. Handle Missing Values

In [3]:
missing_summary = df.isnull().sum()
# Step 1: Handle missing values
df['Value'] = df['Value'].fillna(df['Value'].mean())
df['Sales'] = df['Sales'].fillna(df['Sales'].mean())

# Step 2: Remove duplicate rows
df = df.drop_duplicates()

# Step 3: Standardize text values (lowercase and strip whitespace)
text_columns = df.select_dtypes(include = 'object').columns

# Step 4: Convert 'Date' to datetime format
df['Date'] = pd.to_datetime(df['Date'], errors = 'coerce', dayfirst = False)

# Step 5: Rename column headers to be lowercase with underscores
df.columns = df.columns.str.strip().str.lower().str.replace(' ','_')

# Step 6: Ensure correct data types
df['values'] = pd.to_numeric(df['value'], errors = 'coerce')
df['sales'] = pd.to_numeric(df['sales'], errors = 'coerce')

# Export the cleaned dataset
df.to_csv('cleaned_data.csv',index = False)

# Optional: Print a short summary
print("Date cleaned successfully. Here's a summary:")
print(f"Columns: {df.columns.tolist()}")
print(df.dtypes)
print(f"Rows: {len(df)}")



Date cleaned successfully. Here's a summary:
Columns: ['date', 'category', 'value', 'product', 'sales', 'region', 'values']
date        datetime64[ns]
category            object
value              float64
product             object
sales              float64
region              object
values             float64
dtype: object
Rows: 50
