### Importing necessary libraries

In [121]:
import pandas as pd 
import numpy as np 
from sqlalchemy import create_engine 
!pip install psycopg2-binary 
from dotenv import load_dotenv
import os
import psycopg2




### Importing Dataset

In [7]:
# Load dataset
Sales_intelligence = pd.read_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Raw dataset\Sales Dataset.csv')
Sales_intelligence.head()

Unnamed: 0,Order ID,Amount,Profit,Quantity,Category,Sub-Category,PaymentMode,Order Date,CustomerName,State,City,Year-Month
0,B-26776,9726,1275,5,Electronics,Electronic Games,UPI,2023-06-27,David Padilla,Florida,Miami,2023-06
1,B-26776,9726,1275,5,Electronics,Electronic Games,UPI,2024-12-27,Connor Morgan,Illinois,Chicago,2024-12
2,B-26776,9726,1275,5,Electronics,Electronic Games,UPI,2021-07-25,Robert Stone,New York,Buffalo,2021-07
3,B-26776,4975,1330,14,Electronics,Printers,UPI,2023-06-27,David Padilla,Florida,Miami,2023-06
4,B-26776,4975,1330,14,Electronics,Printers,UPI,2024-12-27,Connor Morgan,Illinois,Chicago,2024-12


### Data Cleaning and Transformation

In [8]:
# Check for missing values
Sales_intelligence.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Order ID      1194 non-null   object
 1   Amount        1194 non-null   int64 
 2   Profit        1194 non-null   int64 
 3   Quantity      1194 non-null   int64 
 4   Category      1194 non-null   object
 5   Sub-Category  1194 non-null   object
 6   PaymentMode   1194 non-null   object
 7   Order Date    1194 non-null   object
 8   CustomerName  1194 non-null   object
 9   State         1194 non-null   object
 10  City          1194 non-null   object
 11  Year-Month    1194 non-null   object
dtypes: int64(3), object(9)
memory usage: 112.1+ KB


In [11]:
# Split 'Year-Month' into 'Year' and 'Month'
Sales_intelligence[['Year', 'Month']] = Sales_intelligence['Year-Month'].str.split('-', expand=True)

# Drop the original 'Year-Month' column
Sales_intelligence.drop(columns=['Year-Month'], inplace=True)

# Convert to integer types
Sales_intelligence['Year'] = Sales_intelligence['Year'].astype(int)
Sales_intelligence['Month'] = Sales_intelligence['Month'].astype(int)


In [18]:
# Convert the 'Order Date' column to datetime
Sales_intelligence['Order Date'] = pd.to_datetime(Sales_intelligence['Order Date'], format='%Y-%m-%d')


In [19]:
# Convert the 'Amount' and 'Price' columns to floats types
Sales_intelligence["Amount"] = Sales_intelligence["Amount"].astype(float)
Sales_intelligence["Profit"] = Sales_intelligence["Profit"].astype(float)


In [64]:
# Rename columns 
Sales_intelligence.rename(columns={
    'Order Date': 'Order_date',
    'Sub-Category': 'Sub_category',
    'PaymentMode': 'Payment_mode',
    'Order ID': 'Order_ID',
    'CustomerName': 'Customer_name'
}, inplace=True)

In [65]:
Sales_intelligence.head()
Sales_intelligence.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1194 entries, 0 to 1193
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype         
---  ------         --------------  -----         
 0   Order_ID       1194 non-null   object        
 1   Amount         1194 non-null   float64       
 2   Profit         1194 non-null   float64       
 3   Quantity       1194 non-null   int64         
 4   Category       1194 non-null   object        
 5   Sub_category   1194 non-null   object        
 6   Payment_mode   1194 non-null   object        
 7   Order_date     1194 non-null   datetime64[ns]
 8   Customer_name  1194 non-null   object        
 9   State          1194 non-null   object        
 10  City           1194 non-null   object        
 11  Year           1194 non-null   int32         
 12  Month          1194 non-null   int32         
dtypes: datetime64[ns](1), float64(2), int32(2), int64(1), object(7)
memory usage: 112.1+ KB


### Operational Database Tables (Normalized)
### Creating Tables

In [38]:
# Products Table
Products_df = Sales_intelligence[['Category', 'Sub_category']].copy().drop_duplicates().reset_index(drop=True)
# Assign a unique product_ID
Products_df['Product_ID'] = Products_df.index + 1 
Products_df = Products_df[['Product_ID', 'Category', 'Sub_category']]
Products_df.head()

Unnamed: 0,Product_ID,Category,Sub_category
0,1,Electronics,Electronic Games
1,2,Electronics,Printers
2,3,Office Supplies,Pens
3,4,Electronics,Laptops
4,5,Furniture,Tables


In [40]:
# Location Table
Locations_df = Sales_intelligence[['City', 'State']].copy().drop_duplicates().reset_index(drop=True)
# Assign a unique Location_ID
Locations_df['Location_ID'] = Locations_df.index + 1 
Locations_df = Locations_df[['Location_ID', 'City', 'State']]
Locations_df.head()

Unnamed: 0,Location_ID,City,State
0,1,Miami,Florida
1,2,Chicago,Illinois
2,3,Buffalo,New York
3,4,Orlando,Florida
4,5,Los Angeles,California


In [50]:
# Customers Table
Customers_df = Sales_intelligence[['Customer_name', 'City', 'State']].drop_duplicates().reset_index(drop=True)
Customers_df = Customers_df.merge(Locations_df, on=['City', 'State'], how='left')
Customers_df['Customer_ID'] = Customers_df.index + 1
Customers_df =Customers_df[['Customer_ID', 'Customer_name', 'Location_ID']]
Customers_df.head()


Unnamed: 0,Customer_ID,Customer_name,Location_ID
0,1,David Padilla,1
1,2,Connor Morgan,2
2,3,Robert Stone,3
3,4,John Fields,4
4,5,Clayton Smith,1


In [53]:
# Products Table
Products_df = Sales_intelligence[['Category', 'Sub_category']].drop_duplicates().reset_index(drop=True)
Products_df['Product_ID'] = Products_df.index + 1
Products_df = Products_df[['Product_ID', 'Category', 'Sub_category']]
Products_df.head()


Unnamed: 0,Product_ID,Category,Sub_category
0,1,Electronics,Electronic Games
1,2,Electronics,Printers
2,3,Office Supplies,Pens
3,4,Electronics,Laptops
4,5,Furniture,Tables


In [59]:
# Payments Table
Payments_df = Sales_intelligence[['Payment_mode']].drop_duplicates().reset_index(drop=True)
Payments_df['Payment_ID'] = Payments_df.index + 1
Payments_df = Payments_df[['Payment_ID', 'Payment_mode']]
Payments_df.head()

Unnamed: 0,Payment_ID,Payment_mode
0,1,UPI
1,2,Debit Card
2,3,EMI
3,4,Credit Card
4,5,COD


In [66]:
# Orders Table
# Merge all Pimary_IDs
Orders_df = Sales_intelligence.copy()
Orders_df = Orders_df.merge(Customers_df, on='Customer_name', how='left')
Orders_df = Orders_df.merge(Products_df, on=['Category', 'Sub_category'], how='left')
Orders_df = Orders_df.merge(Payments_df, on='Payment_mode', how='left')

# Final structure
Orders_df = Orders_df[['Order_ID', 'Customer_ID', 'Product_ID', 'Amount', 'Payment_ID', 
                       'Order_date', 'Quantity', 'Amount', 'Profit']]
Orders_df.head()


Unnamed: 0,Order_ID,Customer_ID,Product_ID,Amount,Payment_ID,Order_date,Quantity,Amount.1,Profit
0,B-26776,1,1,9726.0,1,2023-06-27,5,9726.0,1275.0
1,B-26776,2,1,9726.0,1,2024-12-27,5,9726.0,1275.0
2,B-26776,3,1,9726.0,1,2021-07-25,5,9726.0,1275.0
3,B-26776,1,2,4975.0,1,2023-06-27,14,4975.0,1330.0
4,B-26776,2,2,4975.0,1,2024-12-27,14,4975.0,1330.0


### Save Tables to CSV

In [104]:
os.makedirs(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Clean dataset', exist_ok=True)


In [105]:
Products_df.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Clean dataset\Products.csv', index=False)
Locations_df.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Clean dataset\Locations.csv', index=False)
Customers_df.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Clean dataset\Customers.csv', index=False)
Payments_df.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Clean dataset\Payments.csv', index=False)
Orders_df.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Clean dataset\Orders.csv', index=False)



### Data Warehouse Tables (Star Schema)
### DIMENSION TABLES

In [70]:
# Dim_Customer

Dim_Customer = Customers_df[['Customer_ID', 'Customer_name']]
Dim_Customer.head()


Unnamed: 0,Customer_ID,Customer_name
0,1,David Padilla
1,2,Connor Morgan
2,3,Robert Stone
3,4,John Fields
4,5,Clayton Smith


In [72]:
# Dim_Product
Dim_Product = Products_df[['Product_ID', 'Category', 'Sub_category']]
Dim_Product.head()

Unnamed: 0,Product_ID,Category,Sub_category
0,1,Electronics,Electronic Games
1,2,Electronics,Printers
2,3,Office Supplies,Pens
3,4,Electronics,Laptops
4,5,Furniture,Tables


In [73]:
# Dim_Location
Dim_Location = Locations_df[['Location_ID', 'City', 'State']]
Dim_Location.head()


Unnamed: 0,Location_ID,City,State
0,1,Miami,Florida
1,2,Chicago,Illinois
2,3,Buffalo,New York
3,4,Orlando,Florida
4,5,Los Angeles,California


In [74]:
# Dim_Payment
Dim_Payment = Payments_df[['Payment_ID', 'Payment_mode']]
Dim_Payment.head()

Unnamed: 0,Payment_ID,Payment_mode
0,1,UPI
1,2,Debit Card
2,3,EMI
3,4,Credit Card
4,5,COD


In [78]:
# Dim_Date
Dim_Date = Sales_intelligence[['Order_date']].drop_duplicates().reset_index(drop=True)
Dim_Date['Date_ID'] = Dim_Date.index + 1
Dim_Date['Year'] = Dim_Date['Order_date'].dt.year
Dim_Date['Month'] = Dim_Date['Order_date'].dt.month
Dim_Date['Day'] = Dim_Date['Order_date'].dt.day
Dim_Date = Dim_Date[['Date_ID', 'Order_date', 'Year', 'Month', 'Day']]
Dim_Date.head()


Unnamed: 0,Date_ID,Order_date,Year,Month,Day
0,1,2023-06-27,2023,6,27
1,2,2024-12-27,2024,12,27
2,3,2021-07-25,2021,7,25
3,4,2024-05-11,2024,5,11
4,5,2021-10-09,2021,10,9


In [81]:
# Fact_Sales
Fact_Sales = Sales_intelligence.copy()
# Merge all Pimary_IDs
Fact_Sales = Fact_Sales.merge(Customers_df[['Customer_name', 'Customer_ID']], on='Customer_name', how='left')
Fact_Sales = Fact_Sales.merge(Products_df, on=['Category', 'Sub_category'], how='left')
Fact_Sales = Fact_Sales.merge(Payments_df, on='Payment_mode', how='left')
Fact_Sales = Fact_Sales.merge(Locations_df, on=['City', 'State'], how='left')
Fact_Sales = Fact_Sales.merge(Dim_Date, on='Order_date', how='left')


Fact_Sales = Fact_Sales[[
    'Order_ID', 'Date_ID', 'Customer_ID', 'Product_ID',
    'Location_ID', 'Payment_ID', 'Quantity', 'Amount', 'Profit'
]]
Fact_Sales.head()


Unnamed: 0,Order_ID,Date_ID,Customer_ID,Product_ID,Location_ID,Payment_ID,Quantity,Amount,Profit
0,B-26776,1,1,1,1,1,5,9726.0,1275.0
1,B-26776,2,2,1,2,1,5,9726.0,1275.0
2,B-26776,3,3,1,3,1,5,9726.0,1275.0
3,B-26776,1,1,2,1,1,14,4975.0,1330.0
4,B-26776,2,2,2,2,1,14,4975.0,1330.0


In [82]:
Fact_Sales.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1209 entries, 0 to 1208
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Order_ID     1209 non-null   object 
 1   Date_ID      1209 non-null   int64  
 2   Customer_ID  1209 non-null   int64  
 3   Product_ID   1209 non-null   int64  
 4   Location_ID  1209 non-null   int64  
 5   Payment_ID   1209 non-null   int64  
 6   Quantity     1209 non-null   int64  
 7   Amount       1209 non-null   float64
 8   Profit       1209 non-null   float64
dtypes: float64(2), int64(6), object(1)
memory usage: 85.1+ KB


### Save Tabls to CSV

In [111]:
os.makedirs(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Dimension Tables', exist_ok=True)


Dim_Product.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Dimension Tables\Dim_Products.csv', index=False)
Dim_Customer.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Dimension Tables\Dim_Customers.csv', index=False)
Dim_Date.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Dimension Tables\Dim_Date.csv', index=False)
Dim_Location.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Dimension Tables\Dim_Locations.csv', index=False)
Dim_Payment.to_csv(r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\Dataset\Dimension Tables\Dim_Payments.csv', index=False)


### Creating Tables on Postgres

In [170]:

import os
from dotenv import load_dotenv

load_dotenv(dotenv_path=r'C:\Users\user\Desktop\Sales_intelligence\Sales_intelligence\.env')

True

In [173]:
def get_db_connection():
    connection = psycopg2.connect(
        host = 'localhost',
        database = 'Sales_intelligence',
        port = '5432',
        user = 'postgres',
        password = os.getenv('PASSWORD')
    )
    return connection

In [174]:
conn = get_db_connection()

In [None]:
# Create SQL tables
def create_tables():
    conn = get_db_connection()
    cursor = conn.cursor()
    create_table_query = ''' 
                             CREATE SCHEMA IF NOT EXISTS Sales;
                             
                             DROP TABLE IF EXISTS Sales.Customers CASCADE;
                             DROP TABLE IF EXISTS Sales.Products CASCADE;
                             DROP TABLE IF EXISTS Sales.Shipping_address CASCADE;
                             DROP TABLE IF EXISTS yanki.Orders CASCADE;
                             DROP TABLE IF EXISTS yanki.Payment_method CASCADE;
                             
                             CREATE TABLE IF NOT EXISTS yanki.Customers(
                                Customer_ID UUID PRIMARY KEY, 
                                Customer_Name TEXT,
                                Email TEXT,
                                Phone_Number TEXT
                             );
                             
                             CREATE TABLE IF NOT EXISTS yanki.Products(
                                Product_ID UUID PRIMARY KEY,
                                Product_Name TEXT,
                                Brand TEXT,
                                Category TEXT,
                                Price FLOAT 
                             );
                             
                             
                            CREATE TABLE IF NOT EXISTS yanki.Shipping_address(
                               shipping_ID SERIAL PRIMARY KEY, 
                               customer_ID UUID,
                               Shipping_Address TEXT,
                               City TEXT,
                               State TEXT, 
                               Country TEXT, 
                               Postal_Code TEXT,
                               FOREIGN KEY (customer_ID) REFERENCES yanki.customers(customer_ID) 
                               
                            );
                            
                            CREATE TABLE IF NOT EXISTS yanki.order(
                                order_ID UUID PRIMARY KEY,
                                Customer_ID UUID,
                                Product_ID UUID,  
                                Quantity INTEGER,
                                Total_Price FLOAT,
                                Order_Date TIMESTAMP,
                                FOREIGN KEY (customer_ID) REFERENCES yanki.customers(customer_ID),
                                FOREIGN KEY (product_ID) REFERENCES yanki.products(product_ID)  
                            );
                            
                            CREATE TABLE IF NOT EXISTS yanki.payment_method(
                                order_ID UUID,
                                Payment_Method TEXT,
                                Transaction_Status TEXT,
                                FOREIGN KEY (order_ID) REFERENCES yanki.order(order_ID)
                            )
                            
    '''
    cursor.execute(create_table_query)
    conn.commit()
    cursor.close()
    conn.close()

In [24]:
Sales_intelligence.columns

Index(['Order_ID', 'Amount', 'Profit', 'Quantity', 'Category', 'Sub_category',
       'Payment_mode', 'Order Date', 'Customer_name', 'State', 'City', 'Year',
       'Month'],
      dtype='object')