# Importing Data

In [300]:
import pandas as pd

df = pd.read_csv('Sample-Superstore - Sample-Superstore.tsv', delimiter = "\t")

In [301]:
for col in df:
    df.rename(columns={col: col.lower().replace(' ', '').replace('-', '')}, inplace=True)


In [340]:
df.head(0)

Unnamed: 0,rowid,orderid,orderdate,shipdate,shipmode,customerid,customername,segment,country,city,...,postalcode,region,productid,category,subcategory,productname,sales,quantity,discount,profit


In [303]:
# Select relevant columns for each table
orders_cols = ['orderid', 'orderdate', 'shipdate', 'shipmode', 'customerid', 'productid']
customers_cols = ['customerid', 'customername', 'segment']
addresses_cols = ['customerid', 'country', 'city', 'state', 'postalcode', 'region']
products_cols = ['productid', 'category', 'subcategory', 'productname']
sales_cols = ['sales', 'orderid', 'quantity', 'productid', 'discount', 'profit']
prices_cols = ['productid', 'orderdate']

In [304]:
# Create new dataframes for each table
orders = df.loc[:, orders_cols].drop_duplicates(subset='orderid', keep='first').reset_index(drop=True)
customers = df.loc[:, customers_cols].drop_duplicates(subset='customerid', keep='first').reset_index(drop=True)
addresses = df.loc[:, addresses_cols].drop_duplicates(subset='customerid', keep='first').reset_index(drop=True)
products = df.loc[:, products_cols].drop_duplicates(subset='productid', keep='first').reset_index(drop=True)
sales = df.loc[:, sales_cols].reset_index(drop=True)
prices = df.loc[:, prices_cols].drop_duplicates(subset='productid', keep='first').reset_index(drop=True)

In [305]:
import sqlite3
con = sqlite3.connect("Sample-Superstore_01.db")
cur = con.cursor()
print("Connected to the database")

Connected to the database


In [306]:
cur.execute("DROP TABLE IF EXISTS Customers")
cur.execute("DROP TABLE IF EXISTS Addresses")
cur.execute("DROP TABLE IF EXISTS Orders")
cur.execute("DROP TABLE IF EXISTS Products")
cur.execute("DROP TABLE IF EXISTS Sales")
cur.execute("DROP TABLE IF EXISTS Prices")

<sqlite3.Cursor at 0x2cd671f0730>

In [307]:
cur.executescript("""
CREATE TABLE Customers (
  customerid TEXT PRIMARY KEY,
  customername TEXT,
  segment TEXT
);
""")

cur.execute("""
CREATE TABLE Orders (
  orderid TEXT PRIMARY KEY,
  orderdate TEXT,
  shipdate TEXT,
  shipmode TEXT,
  customerid TEXT,
  productid TEXT,
  FOREIGN KEY (customerid) REFERENCES Customer (customerid)
);
""")

cur.execute("""
CREATE TABLE Products (
  productid TEXT PRIMARY KEY,
  category TEXT,
  subcategory TEXT,
  productname TEXT
);
""")

cur.execute("""
CREATE TABLE Sales (
  orderid TEXT,
  productid TEXT,
  sales FLOAT64,
  quantity INTEGER,
  discount FLOAT64,
  profit FLOAT64,
  FOREIGN KEY (orderid) REFERENCES Orders (orderid)
  FOREIGN KEY (productid) REFERENCES "Products" (productid)
);
""")

cur.execute("""
CREATE TABLE Addresses (
  customerid TEXT PRIMARY KEY,
  country TEXT,
  city TEXT,
  state TEXT,
  postalcode INTEGER,
  region TEXT,
  FOREIGN KEY (customerid) REFERENCES Customer (customerid)
);
""")

cur.execute("""
CREATE TABLE Prices (
  productid TEXT,
  orderdate TEXT,
  FOREIGN KEY (productid) REFERENCES Product (productid),
  FOREIGN KEY (orderdate) REFERENCES Orders (orderdate)
);
""")
con.commit()


In [308]:
orders.info()
customers.info()
addresses.info()
products.info()
sales.info()
prices.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5009 entries, 0 to 5008
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   orderid     5009 non-null   object
 1   orderdate   5009 non-null   object
 2   shipdate    5009 non-null   object
 3   shipmode    5009 non-null   object
 4   customerid  5009 non-null   object
 5   productid   5009 non-null   object
dtypes: object(6)
memory usage: 234.9+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 793 entries, 0 to 792
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   customerid    793 non-null    object
 1   customername  793 non-null    object
 2   segment       793 non-null    object
dtypes: object(3)
memory usage: 18.7+ KB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 793 entries, 0 to 792
Data columns (total 6 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  

In [309]:
sales.head()

Unnamed: 0,sales,orderid,quantity,productid,discount,profit
0,261.96,CA-2016-152156,2,FUR-BO-10001798,0.0,41.9136
1,731.94,CA-2016-152156,3,FUR-CH-10000454,0.0,219.582
2,14.62,CA-2016-138688,2,OFF-LA-10000240,0.0,6.8714
3,957.5775,US-2015-108966,5,FUR-TA-10000577,0.45,-383.031
4,22.368,US-2015-108966,2,OFF-ST-10000760,0.2,2.5164


In [310]:
orders.to_sql('Orders', con, if_exists='append', index=False)
customers.to_sql('Customers', con, if_exists='append', index=False)
addresses.to_sql('Addresses', con, if_exists='append', index=False)
products.to_sql('Products', con, if_exists='append', index=False)
sales.to_sql('Sales', con, if_exists='append', index=False)
prices.to_sql('Prices', con, if_exists='append', index=False)

1862

In [311]:
def run(cmd):
    r = cur.execute(cmd)
    display(r.fetchall())

In [312]:
# close the connection
# con.close()

# Assignments


Are they the 3 most profitable products as well?
What are the 3 best-seller products in each product segment? (Quantity-wise)

What are the top 3 worst-selling products in every category? (Quantity-wise)

How many unique customers per month are there for the year 2016. (There's a catch here: contrary to other 'heavier' RDBMS, SQLite does not support the functions YEAR() or MONTH() to extract the year or the month in a date. You will have to create two new columns: year and month.)

What is the product generating the maximum sales revenue?

In [326]:
run("""
SELECT productname, SUM(sales)
FROM Sales
LEFT JOIN Products ON Sales.productid = Products.productid
GROUP BY Products.productid
ORDER BY SUM(sales) DESC LIMIT 1
;""")

[('Canon imageCLASS 2200 Advanced Copier', 61599.824)]

What is the category generating the maximum sales revenue?

In [333]:
run("""
SELECT category, SUM(sales)
FROM Sales
LEFT JOIN Products ON Sales.productid = Products.productid
GROUP BY Products.category
ORDER BY SUM(sales) DESC LIMIT 1
;""")

[('Technology', 836154.0329999966)]

What about the profit in this category?


In [343]:
run("""
SELECT Products.category, SUM(profit)
FROM Sales
LEFT JOIN Products ON Sales.productid = Products.productid
GROUP BY Products.category
ORDER BY SUM(sales) DESC LIMIT 1
;""")

[('Technology', 145454.9480999999)]

Are they making a loss in any categroies?

In [342]:
run("""
SELECT Products.category, SUM(profit) as total_profit
FROM Sales
LEFT JOIN Products ON Sales.productid = Products.productid
GROUP BY Products.category
HAVING total_profit < 0
;""")

[]

What are 5 states generating the maximum and minimum sales revenue?

In [366]:
run("""
SELECT Addresses.state, SUM(Sales.profit)
FROM Sales
LEFT JOIN Orders ON Sales.orderid = Orders.orderid
LEFT JOIN Addresses ON Orders.customerid = Addresses.customerid
GROUP BY Addresses.state
ORDER BY SUM(sales) DESC LIMIT 5
;""")

[('California', 59398.31250000002),
 ('New York', 58177.834100000066),
 ('Texas', 20528.91100000002),
 ('Pennsylvania', 13604.935000000007),
 ('Washington', 24405.796599999983)]

In [370]:
run("""
SELECT Addresses.state, SUM(Sales.profit)
FROM Sales
LEFT JOIN Orders ON Sales.orderid = Orders.orderid
LEFT JOIN Addresses ON Orders.customerid = Addresses.customerid
GROUP BY Addresses.state
ORDER BY SUM(sales) LIMIT 5
;""")

[('Nevada', 278.06780000000003),
 ('Maryland', 436.64919999999984),
 ('Kansas', 139.20080000000002),
 ('District of Columbia', 490.95669999999996),
 ('South Dakota', 682.5541999999999)]

What are the 3 products in each product segment with the highest sales?

In [380]:
run("""
SELECT Customers.segment, Products.productname
FROM Sales
JOIN Orders ON Sales.orderid = Orders.orderid
JOIN Customers ON Orders.customerid = Customers.customerid
LEFT JOIN Products ON Sales.productid = Products.productid
GROUP BY Customers.segment
ORDER BY SUM(sales) LIMIT 3
;""")

[('Home Office',
  'Holmes Replacement Filter for HEPA Air Cleaner, Very Large Room, HEPA Filter'),
 ('Corporate', 'Self-Adhesive Address Labels for Typewriters by Universal'),
 ('Consumer', 'Bush Somerset Collection Bookcase')]