In [2]:

# Data Cleaning and Wrangling

import pandas as pd
import sqlite3

# Load CSV
# Read the raw sales export into a DataFrame; every later step works on `df`.
df = pd.read_csv('Sales_Data.csv')

# 1️⃣ Inspect data
print("First 5 rows:")
print(df.head())
print("\nInfo:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() would append a stray "None" line to the output.
df.info()
print("\nMissing values:")
print(df.isnull().sum())
print("\nBasic stats:")
print(df.describe())

# 2️⃣ Standardize column names
# Trim surrounding whitespace and lowercase every header, then turn each space
# and each hyphen into an underscore.
normalized_columns = df.columns.str.strip().str.lower()
for separator in (' ', '-'):
    normalized_columns = normalized_columns.str.replace(separator, '_')
df.columns = normalized_columns

print("\nCleaned column names:")
print(list(df.columns))

# 3️⃣ Convert date columns to datetime
# Each date column is optional; when present it is parsed with the day
# interpreted before the month.
for date_column in ('order_date', 'ship_date'):
    if date_column in df.columns:
        df[date_column] = pd.to_datetime(df[date_column], dayfirst=True)

# 4️⃣ Extract month and year from order_date for trend analysis
if 'order_date' in df.columns:
    order_date_parts = df['order_date'].dt
    df['month'] = order_date_parts.month
    df['year'] = order_date_parts.year

# 5️⃣ Handle missing values
# Drop rows where 'sales' is missing. The column check mirrors the guards used
# for the date columns, so an absent column is skipped instead of raising
# KeyError.
if 'sales' in df.columns:
    df.dropna(subset=['sales'], inplace=True)

# Optional: fill other missing values (example)
# df['postal_code'].fillna(0, inplace=True)
# df['customer_name'].fillna('Unknown', inplace=True)

# 6️⃣ Remove duplicates if any
# NOTE(review): only rows identical in every column are removed; if row_id is
# unique per row this never drops anything — confirm whether a column subset
# was intended.
df.drop_duplicates(inplace=True)

# 7️⃣ Type conversions if needed
# Store postal codes as text while keeping missing entries missing. A plain
# astype(str) would turn NaN into the literal string 'nan', and a column that
# pandas read as float (which happens when it contains NaN) would be rendered
# as e.g. '42420.0'.
if 'postal_code' in df.columns:
    postal_codes = df['postal_code']
    has_postal_code = postal_codes.notna()
    if postal_codes.dtype.kind == 'f':
        # Only route through the nullable integer dtype when every present
        # value is a whole number; otherwise the cast would raise.
        whole_numbers = (postal_codes[has_postal_code] % 1 == 0).all()
        if whole_numbers:
            postal_codes = postal_codes.astype('Int64')
    # where() blanks out the text produced for missing entries, so they stay
    # NaN in the DataFrame instead of becoming placeholder strings.
    df['postal_code'] = postal_codes.astype(str).where(has_postal_code)

# 8️⃣ Save cleaned data to SQLite DB
# Write the cleaned frame to the 'sales' table of sales.db, replacing any
# existing table of that name and omitting the DataFrame index.
# NOTE(review): `conn` is left open here; close it once no later step needs it.
conn = sqlite3.connect('sales.db')
df.to_sql(name='sales', con=conn, index=False, if_exists='replace')

print(
    "\n✅ Data cleaning and wrangling complete!",
    f"Cleaned data shape: {df.shape}",
    sep="\n",
)


First 5 rows:
   Row ID        Order ID  Order Date   Ship Date       Ship Mode Customer ID  \
0       1  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
1       2  CA-2017-152156  08/11/2017  11/11/2017    Second Class    CG-12520   
2       3  CA-2017-138688  12/06/2017  16/06/2017    Second Class    DV-13045   
3       4  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   
4       5  US-2016-108966  11/10/2016  18/10/2016  Standard Class    SO-20335   

     Customer Name    Segment        Country             City       State  \
0      Claire Gute   Consumer  United States        Henderson    Kentucky   
1      Claire Gute   Consumer  United States        Henderson    Kentucky   
2  Darrin Van Huff  Corporate  United States      Los Angeles  California   
3   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   
4   Sean O'Donnell   Consumer  United States  Fort Lauderdale     Florida   

   Postal Code Region       Product 