In [None]:
import pandas as pd

# Read the raw sales export (the file name really does contain a space).
sales_df = pd.read_csv('sales 1.csv')

# Report how many rows were loaded, before any cleaning.
initial_count = len(sales_df)
print("Rows before cleaning:", initial_count)

# A sale is only usable when both measures are present, so discard rows
# that are missing 'SalesAmount' or 'Quantity'.
required_columns = ['SalesAmount', 'Quantity']
sales_df = sales_df.dropna(subset=required_columns)

# Report how many rows survived the cleaning step.
final_count = len(sales_df)
print("Rows after cleaning:", final_count)


Rows before cleaning: 10000
Rows after cleaning: 9397


In [None]:
# Read the customer records.
customers_df = pd.read_json('customers.json')

# Flag every repeat of a CustomerID (the first occurrence is not flagged),
# then count the flagged rows.
id_columns = ['CustomerID']
repeat_mask = customers_df.duplicated(subset=id_columns)
duplicate_count = repeat_mask.sum()

if duplicate_count == 0:
    print("No duplicates found. All CustomerID values are unique.")
else:
    print(f"Found {duplicate_count} duplicate CustomerID(s).")
    # keep=False marks *all* rows sharing an ID, so the first occurrences
    # are listed next to their repeats.
    shared_id_mask = customers_df.duplicated(subset=id_columns, keep=False)
    duplicates = customers_df.loc[shared_id_mask]
    print("Rows with duplicate CustomerIDs:")
    # NOTE(review): this prints every column of the frame, including SSN;
    # consider clearing the output before sharing the notebook.
    print(duplicates)


Found 100 duplicate CustomerID(s).
Rows with duplicate CustomerIDs:
     CustomerID    FirstName   LastName  Gender       Region          SSN
22        C0023        Grant   Bartlett    male         Ohio  914-9819-50
25        C0026        Jesse  Wilkerson    male           NY  929/20/0753
46        C0047         Seth       Hess    Male        Texas    075643618
47        C0048        Frank       Ross       M         Ohho  466.83.1539
49        C0050     Kimberly     Wright  female      Nw York  556/57/7221
...         ...          ...        ...     ...          ...          ...
1095      C0939       Sandra       None  female  Californiya  879.28.3301
1096      C0965        Scott       None       M         Ohho    226607396
1097      C0966  Christopher    Meadows       M         Ohio  249/65/1683
1098      C0971         Jeff     Nelson    male           NY    202775865
1099      C0989        Lucas     Conley       M           NY  657/00/1176

[200 rows x 6 columns]


In [None]:
# Load the customers and products data
customers_df = pd.read_json('customers.json')
products_df = pd.read_csv('products.csv')
print("Sales rows before filtering invalid IDs:", sales_df.shape[0])

# Filter sales data: keep rows where CustomerID exists in customers_df
# and ProductID exists in products_df
has_known_customer = sales_df['CustomerID'].isin(customers_df['CustomerID'])
has_known_product = sales_df['ProductID'].isin(products_df['ProductID'])
sales_df = sales_df[has_known_customer & has_known_product]

# Number of customers before removing duplicates
initial_count = len(customers_df)
print("Number of customers before cleaning:", initial_count)

# --- Clean Gender ---
# Collapse the abbreviated / lower-case spellings onto 'Male' and 'Female'.
# Values not listed here are left untouched.
customers_df['Gender'] = customers_df['Gender'].replace({
    'M': 'Male',
    'male': 'Male',
    'F': 'Female',
    'female': 'Female'
})

# --- Clean Region ---
# Map the misspellings and abbreviations listed here to the canonical region name.
customers_df['Region'] = customers_df['Region'].replace({
    'Texaz': 'Texas',
    'Ohho': 'Ohio',
    'New Yorkk': 'New York',
    'NY': 'New York',
    'Nw York': 'New York',
    'california': 'California',
    'Californiya': 'California'
})
# Replace nulls in Region with "Unknown"
customers_df['Region'] = customers_df['Region'].fillna("Unknown")

# --- Clean LastName ---
# Missing last names become an empty string, and so does a literal "Unknown"
# (same result as filling with "Unknown" and then blanking "Unknown").
customers_df['LastName'] = customers_df['LastName'].fillna("").replace("Unknown", "")

# --- Remove Duplicates ---
# Remove duplicate CustomerIDs, keeping only the first occurrence
customers_df = customers_df.drop_duplicates(subset=['CustomerID'], keep='first')

# --- Remove Unnecessary Columns ---
# Drop the SSN column if it exists
if 'SSN' in customers_df.columns:
    customers_df = customers_df.drop(columns=['SSN'])

# --- Merge Name Columns ---
# Combine FirstName and LastName into a new column "Name".
# FirstName is not null-filled anywhere above, and astype(str) would turn a
# missing value into the literal text "None"/"nan"; blank it first so that
# text can never leak into Name. strip() removes the stray separator when
# either part is empty.
first_names = customers_df['FirstName'].fillna("").astype(str)
last_names = customers_df['LastName'].astype(str)
customers_df['Name'] = (first_names + " " + last_names).str.strip()

# ------------------------------
# Final Output
# ------------------------------
final_count = len(customers_df)
print("Number of customers after cleaning:", final_count)
print("\nCleaned Customers Data:")
print(customers_df.head())

Sales rows before filtering invalid IDs: 9397
Number of customers before cleaning: 1100
Number of customers after cleaning: 1000

Cleaned Customers Data:
  CustomerID FirstName LastName Gender      Region             Name
0      C0001   Gregory   Miller   Male        Ohio   Gregory Miller
1      C0002    Marvin            Male  California           Marvin
2      C0003   Gregory    Smith   Male    New York    Gregory Smith
3      C0004    Edward    Davis   Male        Ohio     Edward Davis
4      C0005  Reginald   Dawson   Male       Texas  Reginald Dawson


In [4]:
# Parse Timestamp into datetimes; errors='coerce' turns anything unparseable into NaT.
sales_df['Timestamp'] = pd.to_datetime(sales_df['Timestamp'], errors='coerce')

# Build the NaT mask once and reuse it for both the count and the listing.
nat_mask = sales_df['Timestamp'].isna()
invalid_timestamp_count = nat_mask.sum()
print("Number of rows with invalid Timestamp format:", invalid_timestamp_count)

# Show the offending rows, if there are any, so they can be reviewed.
if nat_mask.any():
    print("Rows with invalid Timestamp:")
    print(sales_df.loc[nat_mask])

# SaleID should be unique: count every repeat of an ID that was already seen.
duplicate_saleid_count = sales_df.duplicated(subset=['SaleID']).sum()
print("Number of duplicate SaleID entries:", duplicate_saleid_count)


Number of rows with invalid Timestamp format: 0
Number of duplicate SaleID entries: 0


In [5]:
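# One-time setup: uncomment the next line to install the PyMySQL driver,
# which the mysql+pymysql engine URL in the following cell relies on.
#%pip install pymysql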

In [6]:
from sqlalchemy import create_engine

import os
from getpass import getpass
from urllib.parse import quote

# Connection settings come from the environment so that no credential is
# stored in the notebook. Every setting except the password falls back to the
# local development default; the password is prompted for when
# MYSQL_PASSWORD is not set.
username = os.environ.get('MYSQL_USER', 'root')
password = os.environ.get('MYSQL_PASSWORD') or getpass('MySQL password: ')
host = os.environ.get('MYSQL_HOST', 'localhost')
port = os.environ.get('MYSQL_PORT', '3306')
database = os.environ.get('MYSQL_DATABASE', 'case2')

# Create a MySQL engine using PyMySQL driver.
# User and password are percent-encoded because characters such as '@', ':'
# or '/' would otherwise be read as URL delimiters.
engine = create_engine(
    f"mysql+pymysql://{quote(username, safe='')}:{quote(password, safe='')}"
    f"@{host}:{port}/{database}"
)

# 1. Load Transformed Data (Fact Table)
# if_exists='replace' drops and recreates the table, so re-running this cell
# overwrites the previous load instead of appending to it.
sales_df.to_sql('fact_sales', engine, if_exists='replace', index=False)
print("Transformed sales data loaded into fact_sales table.")

# 2. Load Dimension Tables
# Load customers (dimension table for customers)
customers_df.to_sql('dim_customers', engine, if_exists='replace', index=False)
print("Customer data loaded into dim_customers table.")

# Load products (dimension table for products)
products_df.to_sql('dim_products', engine, if_exists='replace', index=False)
print("Product data loaded into dim_products table.")


Transformed sales data loaded into fact_sales table.
Customer data loaded into dim_customers table.
Product data loaded into dim_products table.
