In [2]:
import pandas as pd
import re
from datetime import datetime

In [3]:
# Function to calculate age
def calculate_age(birthdate, today=None):
    """Return the age in completed years for ``birthdate``.

    Parameters
    ----------
    birthdate : datetime-like or missing
        Object exposing ``year``, ``month`` and ``day``. Missing values
        (anything ``pd.isna`` reports as missing, e.g. ``None`` or ``NaT``)
        are accepted.
    today : datetime-like, optional
        Reference date the age is measured against. Defaults to
        ``datetime.today()``; pass a fixed date to make results reproducible.

    Returns
    -------
    int or None
        Completed years between ``birthdate`` and ``today``, or ``None`` when
        ``birthdate`` is missing.
    """
    if pd.isna(birthdate):
        return None
    if today is None:
        today = datetime.today()
    # One year is subtracted while this year's birthday is still ahead.
    birthday_passed = (today.month, today.day) >= (birthdate.month, birthdate.day)
    return today.year - birthdate.year - (0 if birthday_passed else 1)

# Function to extract numeric value from string
def extract_numeric(value):
    """Parse a number out of a formatted string such as ``"$1,234.50 "``.

    Parameters
    ----------
    value : str, number or missing
        Cell value to convert. Strings have every character except digits and
        ``.`` removed before conversion (so a sign is not preserved). Values
        that are already numeric are converted with ``float`` directly.

    Returns
    -------
    float
        The parsed number, or ``0.0`` when ``value`` is missing or contains no
        digits at all.

    Raises
    ------
    ValueError
        If the cleaned text contains digits but is still not a valid float
        (for example ``"1.2.3"``).
    """
    if not isinstance(value, str):
        # Missing cells (None / NaN / NaT) count as "no number".
        if pd.isna(value):
            return 0.0
        # Already-numeric cells need no text cleaning.
        try:
            return float(value)
        except (TypeError, ValueError):
            value = str(value)
    numeric_string = re.sub(r'[^\d.]', '', value)
    # Text such as "N.A." leaves only dots behind; treat it like an empty string.
    if not re.search(r'\d', numeric_string):
        return 0.0
    return float(numeric_string)

In [4]:
# Dictionary to store the DataFrames, keyed by file name without extension
dataframes = {}

# List of filenames (the ".csv" extension is appended when reading)
files = ['Customers', 'Exchange_Rates', 'Products', 'Sales', 'Stores']

# Folder that holds the raw CSV files
DATA_DIR = "Dataset-Copy1"

# Encodings to try, in order. 'iso-8859-1' is an alias of 'latin1', so listing
# both only repeated the same attempt. latin1 maps every byte value, so it acts
# as the catch-all fallback after utf-8.
ENCODINGS = ['utf-8', 'latin1']

# Try each encoding until one decodes the file
for file in files:
    for encoding in ENCODINGS:
        try:
            dataframes[file] = pd.read_csv(f"{DATA_DIR}/{file}.csv", encoding=encoding)
            break
        except UnicodeDecodeError:
            continue
    else:
        # Reached only when every encoding raised UnicodeDecodeError.
        print(f"Failed to load {file} with {', '.join(ENCODINGS)} encodings")

In [5]:
# Customers: snake_case the headers, parse birthdays, derive age, fill state codes
df_customer = dataframes['Customers']

customer_headers = df_customer.columns.str.lower()
df_customer.columns = customer_headers.str.replace(" ", "_")

# Unparseable birthdays become NaT, for which calculate_age returns None.
birthdays = pd.to_datetime(df_customer['birthday'], errors='coerce')
df_customer['birthday'] = birthdays
df_customer['age'] = birthdays.apply(calculate_age)

# Missing state codes are replaced by the literal string "NA".
# NOTE(review): presumably this restores codes that were parsed as missing on
# load — confirm against the raw file.
state_codes = df_customer['state_code']
df_customer['state_code'] = state_codes.fillna("NA")

In [6]:
# Products: snake_case the headers and add numeric cost / price columns
df_product = dataframes['Products']

product_headers = df_product.columns.str.lower()
df_product.columns = product_headers.str.replace(" ", "_")

# Each (new column, source column) pair is converted with extract_numeric.
for numeric_column, text_column in (
    ('cost_usd', 'unit_cost_usd'),
    ('price_usd', 'unit_price_usd'),
):
    df_product[numeric_column] = df_product[text_column].apply(extract_numeric)

In [7]:
# Sales: snake_case the headers, parse dates, impute missing delivery dates
df_sales = dataframes['Sales']
df_sales.columns = df_sales.columns.str.lower().str.replace(" ", "_")

# Unparseable dates become NaT.
for date_column in ('order_date', 'delivery_date'):
    df_sales[date_column] = pd.to_datetime(df_sales[date_column], errors='coerce')

# Keep only the rows where both dates are present
has_both_dates = df_sales['order_date'].notna() & df_sales['delivery_date'].notna()
filtered_df = df_sales[has_both_dates].copy()

# Whole days between order and delivery, and their mean over those rows
elapsed = filtered_df['delivery_date'] - filtered_df['order_date']
filtered_df['date_difference'] = elapsed.dt.days
average_difference = filtered_df['date_difference'].mean()

# Missing delivery dates become order_date + the mean lag. The mean is usually
# fractional, so imputed values carry a time-of-day component and the
# shipping_days below are truncated to whole days by .dt.days.
typical_lag = pd.Timedelta(days=average_difference)
estimated_delivery = df_sales['order_date'] + typical_lag
df_sales['delivery_date'] = df_sales['delivery_date'].fillna(estimated_delivery)

shipping_time = df_sales['delivery_date'] - df_sales['order_date']
df_sales['shipping_days'] = shipping_time.dt.days

In [8]:
# Stores: snake_case the headers, zero-fill missing floor area, parse opening dates
df_stores = dataframes['Stores']

store_headers = df_stores.columns.str.lower()
df_stores.columns = store_headers.str.replace(" ", "_")

# Stores without a recorded area get 0.0.
floor_area = df_stores['square_meters']
df_stores['square_meters'] = floor_area.fillna(0.0)

# Unparseable opening dates become NaT.
opening_dates = pd.to_datetime(df_stores['open_date'], errors='coerce')
df_stores['open_date'] = opening_dates

In [9]:
# Exchange rates: snake_case the headers, upper-case currency codes, parse dates
df_exchange_rates = dataframes['Exchange_Rates']

rate_headers = df_exchange_rates.columns.str.lower()
df_exchange_rates.columns = rate_headers.str.replace(" ", "_")

currency_codes = df_exchange_rates['currency']
df_exchange_rates['currency'] = currency_codes.str.upper()

# Unparseable dates become NaT.
rate_dates = pd.to_datetime(df_exchange_rates['date'], errors='coerce')
df_exchange_rates['date'] = rate_dates

In [10]:
# Build one wide table: sales joined to customers, stores, products and rates
def _uppercase_text(column):
    """Upper-case a column when its dtype is object; return others unchanged."""
    return column.str.upper() if column.dtype == "object" else column


# Every join is a left join, so all sales rows are kept. Clashing column names
# from stores / products receive the '_store' / '_product' suffix.
merged_df = (
    df_sales
    .merge(df_customer, how='left', on='customerkey')
    .merge(df_stores, how='left', on='storekey', suffixes=('', '_store'))
    .merge(df_product, how='left', on='productkey', suffixes=('', '_product'))
    .merge(
        df_exchange_rates,
        how='left',
        left_on=['order_date', 'currency_code'],
        right_on=['date', 'currency'],
    )
    .apply(_uppercase_text)
)

In [None]:
# Inspect the final dataframe: summary statistics.
# describe must be *called*; printing the bare attribute only showed the bound
# method's repr instead of the statistics.
merged_df.describe()

In [None]:
# Show the dtype of every column in the merged frame
merged_df.dtypes

In [11]:
import getpass
import os

import pymysql

# Connection settings come from the environment so that no credentials are
# stored in the notebook. If MYSQL_PASSWORD is unset, it is asked for
# interactively.
MYSQL_HOST = os.environ.get('MYSQL_HOST', '127.0.0.1')
MYSQL_USER = os.environ.get('MYSQL_USER', 'root')
MYSQL_PASSWORD = os.environ.get('MYSQL_PASSWORD') or getpass.getpass(
    f"MySQL password for {MYSQL_USER}@{MYSQL_HOST}: "
)
DATABASE_NAME = 'Dataspark_Global_Electronics'


def _sql_type(dtype):
    """Map a pandas dtype to the MySQL column type used for it (TEXT by default)."""
    if pd.api.types.is_bool_dtype(dtype):
        return 'BOOL'
    if pd.api.types.is_integer_dtype(dtype):
        return 'INT'
    if pd.api.types.is_float_dtype(dtype):
        return 'FLOAT'
    if pd.api.types.is_datetime64_any_dtype(dtype):
        return 'DATETIME'
    return 'TEXT'


def _sql_value(value):
    """Convert one DataFrame cell into a plain Python value the driver can bind."""
    if pd.isna(value):
        return None  # NaN / NaT / None are stored as SQL NULL
    if isinstance(value, pd.Timestamp):
        return value.to_pydatetime(warn=False)
    if hasattr(value, 'item'):
        return value.item()  # numpy scalar -> Python scalar
    return value


# Connection to MySQL
myconnection = pymysql.connect(host=MYSQL_HOST, user=MYSQL_USER, password=MYSQL_PASSWORD)

try:
    # Create the database
    with myconnection.cursor() as cursor:
        cursor.execute(f"CREATE DATABASE IF NOT EXISTS {DATABASE_NAME}")

    tables = {
        'Customers': df_customer,
        'Stores': df_stores,
        'Products': df_product,
        'Exchange_Rates': df_exchange_rates,
        'Sales': df_sales,
        'Merged_df': merged_df,
    }

    # Loop through each dataframe and create / fill its table
    for table_name, df in tables.items():
        # Convert columns to uppercase if they are of object type
        df = df.apply(lambda x: x.str.upper() if x.dtype == "object" else x)

        # Column definitions are built per column, so a column whose name
        # happens to contain e.g. "object" is not rewritten; names are
        # backtick-quoted.
        column_defs = ", ".join(
            "`{}` {}".format(str(name).replace("`", "``"), _sql_type(dtype))
            for name, dtype in zip(df.columns, df.dtypes)
        )
        placeholders = ", ".join(["%s"] * len(df.columns))
        rows = [
            tuple(_sql_value(cell) for cell in row)
            for row in df.itertuples(index=False, name=None)
        ]

        # NOTE: CREATE TABLE IF NOT EXISTS keeps an existing table, so
        # re-running this cell appends the same rows again.
        with myconnection.cursor() as cursor:
            cursor.execute(
                f"CREATE TABLE IF NOT EXISTS {DATABASE_NAME}.{table_name} ({column_defs})"
            )
            # Values are bound as parameters rather than interpolated into the
            # SQL text, so quotes and missing values cannot break the statement.
            if rows:
                cursor.executemany(
                    f"INSERT INTO {DATABASE_NAME}.{table_name} VALUES ({placeholders})",
                    rows,
                )
        # One commit per table instead of one per row.
        myconnection.commit()
finally:
    myconnection.close()

In [11]:
# Output CSV name for each cleaned frame, keyed by the frame's variable name
file_names = {
    'df_customer': 'customer_data.csv',
    'df_stores': 'stores_data.csv',
    'df_product': 'product_data.csv',
    'df_exchange_rates': 'exchange_rates_data.csv',
    'df_sales': 'sales_data.csv'
}

# Write every cleaned frame to its CSV file, without the index column
frames_to_export = (
    ('df_customer', df_customer),
    ('df_stores', df_stores),
    ('df_product', df_product),
    ('df_exchange_rates', df_exchange_rates),
    ('df_sales', df_sales),
)
for frame_key, frame in frames_to_export:
    frame.to_csv(file_names[frame_key], index=False)