## Introduction to SQL & SELECT Queries

## Task 1: Understand Tables and Fields

### 1. Setup and Import Libraries

In [12]:
# Setup and Import Libraries
import sqlite3
import pandas as pd

### 2. Connect to the SQLite Database

In [5]:
# Connect to the database
# sqlite3.connect() on a plain path silently creates an empty database file when
# the path does not exist, which only surfaces later as "no such table" errors.
# Opening through a URI with mode=rw keeps read/write access to an existing file
# but raises sqlite3.OperationalError right here if chinook.db is missing from
# the working directory.
conn = sqlite3.connect("file:chinook.db?mode=rw", uri=True)

### 3. List all tables

In [7]:
# Read the name of every table from SQLite's catalog (sqlite_master) and
# print the resulting DataFrame under a short heading.
tables = pd.read_sql_query(
    sql="SELECT name FROM sqlite_master WHERE type='table';",
    con=conn,
)
print("Tables in the database:", tables, sep="\n")

Tables in the database:
               name
0            albums
1   sqlite_sequence
2           artists
3         customers
4         employees
5            genres
6          invoices
7     invoice_items
8       media_types
9         playlists
10   playlist_track
11           tracks
12     sqlite_stat1


### 4. View Table Structures using PRAGMA

In [20]:
# View Table Structures using PRAGMA
def show_table_structure(table_name):
    """Print a heading and display the column metadata of one table.

    Runs ``PRAGMA table_info`` on the notebook-level ``conn`` and shows the
    resulting DataFrame (one row per column: cid, name, type, notnull,
    dflt_value, pk) with ``display``.

    Parameters
    ----------
    table_name : str
        Name of the table to inspect.
    """
    print(f"\nStructure of table: {table_name}")
    # Quote the name as an SQL identifier (doubling any embedded double quote)
    # so that names containing spaces, punctuation or reserved words still work
    # and the interpolated text cannot be read as additional SQL.
    quoted_name = '"' + str(table_name).replace('"', '""') + '"'
    df = pd.read_sql_query(f"PRAGMA table_info({quoted_name});", conn)
    display(df)

# Show the column layout of each table listed in the `tables` DataFrame,
# in the order the names appear in its 'name' column.
for table in tables["name"].tolist():
    show_table_structure(table)


Structure of table: albums


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,AlbumId,INTEGER,1,,1
1,1,Title,NVARCHAR(160),1,,0
2,2,ArtistId,INTEGER,1,,0



Structure of table: sqlite_sequence


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,name,,0,,0
1,1,seq,,0,,0



Structure of table: artists


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,ArtistId,INTEGER,1,,1
1,1,Name,NVARCHAR(120),0,,0



Structure of table: customers


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,CustomerId,INTEGER,1,,1
1,1,FirstName,NVARCHAR(40),1,,0
2,2,LastName,NVARCHAR(20),1,,0
3,3,Company,NVARCHAR(80),0,,0
4,4,Address,NVARCHAR(70),0,,0
5,5,City,NVARCHAR(40),0,,0
6,6,State,NVARCHAR(40),0,,0
7,7,Country,NVARCHAR(40),0,,0
8,8,PostalCode,NVARCHAR(10),0,,0
9,9,Phone,NVARCHAR(24),0,,0



Structure of table: employees


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,EmployeeId,INTEGER,1,,1
1,1,LastName,NVARCHAR(20),1,,0
2,2,FirstName,NVARCHAR(20),1,,0
3,3,Title,NVARCHAR(30),0,,0
4,4,ReportsTo,INTEGER,0,,0
5,5,BirthDate,DATETIME,0,,0
6,6,HireDate,DATETIME,0,,0
7,7,Address,NVARCHAR(70),0,,0
8,8,City,NVARCHAR(40),0,,0
9,9,State,NVARCHAR(40),0,,0



Structure of table: genres


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,GenreId,INTEGER,1,,1
1,1,Name,NVARCHAR(120),0,,0



Structure of table: invoices


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,InvoiceId,INTEGER,1,,1
1,1,CustomerId,INTEGER,1,,0
2,2,InvoiceDate,DATETIME,1,,0
3,3,BillingAddress,NVARCHAR(70),0,,0
4,4,BillingCity,NVARCHAR(40),0,,0
5,5,BillingState,NVARCHAR(40),0,,0
6,6,BillingCountry,NVARCHAR(40),0,,0
7,7,BillingPostalCode,NVARCHAR(10),0,,0
8,8,Total,"NUMERIC(10,2)",1,,0



Structure of table: invoice_items


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,InvoiceLineId,INTEGER,1,,1
1,1,InvoiceId,INTEGER,1,,0
2,2,TrackId,INTEGER,1,,0
3,3,UnitPrice,"NUMERIC(10,2)",1,,0
4,4,Quantity,INTEGER,1,,0



Structure of table: media_types


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,MediaTypeId,INTEGER,1,,1
1,1,Name,NVARCHAR(120),0,,0



Structure of table: playlists


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,PlaylistId,INTEGER,1,,1
1,1,Name,NVARCHAR(120),0,,0



Structure of table: playlist_track


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,PlaylistId,INTEGER,1,,1
1,1,TrackId,INTEGER,1,,2



Structure of table: tracks


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,TrackId,INTEGER,1,,1
1,1,Name,NVARCHAR(200),1,,0
2,2,AlbumId,INTEGER,0,,0
3,3,MediaTypeId,INTEGER,1,,0
4,4,GenreId,INTEGER,0,,0
5,5,Composer,NVARCHAR(220),0,,0
6,6,Milliseconds,INTEGER,1,,0
7,7,Bytes,INTEGER,0,,0
8,8,UnitPrice,"NUMERIC(10,2)",1,,0



Structure of table: sqlite_stat1


Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,tbl,,0,,0
1,1,idx,,0,,0
2,2,stat,,0,,0


## Task 2: SELECT Queries

###  1. Query Full Table customers

In [25]:
# Load every row and column of the 'customers' table into a DataFrame.
customers_df = pd.read_sql_query(sql="SELECT * FROM customers;", con=conn)
# Preview only: n=5 is the same as the default head().
customers_df.head(n=5)

Unnamed: 0,CustomerId,FirstName,LastName,Company,Address,City,State,Country,PostalCode,Phone,Fax,Email,SupportRepId
0,1,Luís,Gonçalves,Embraer - Empresa Brasileira de Aeronáutica S.A.,"Av. Brigadeiro Faria Lima, 2170",São José dos Campos,SP,Brazil,12227-000,+55 (12) 3923-5555,+55 (12) 3923-5566,luisg@embraer.com.br,3
1,2,Leonie,Köhler,,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174,+49 0711 2842222,,leonekohler@surfeu.de,5
2,3,François,Tremblay,,1498 rue Bélanger,Montréal,QC,Canada,H2G 1A7,+1 (514) 721-4711,,ftremblay@gmail.com,3
3,4,Bjørn,Hansen,,Ullevålsveien 14,Oslo,,Norway,0171,+47 22 44 22 22,,bjorn.hansen@yahoo.no,4
4,5,František,Wichterlová,JetBrains s.r.o.,Klanova 9/506,Prague,,Czech Republic,14700,+420 2 4172 5555,+420 2 4172 5555,frantisekw@jetbrains.com,4


### 2. Select Specific Columns from Table employees

In [32]:
# Select only the FirstName and LastName columns from the 'employees' table
# (the column names must match the schema reported by PRAGMA table_info).
employees_df = pd.read_sql_query("SELECT FirstName, LastName FROM employees;", conn)
employees_df.head()  # Display first few rows

Unnamed: 0,FirstName,LastName
0,Andrew,Adams
1,Nancy,Edwards
2,Jane,Peacock
3,Margaret,Park
4,Steve,Johnson


## Task 3: Filtering with WHERE

### 1. Use > operator

In [43]:
# Invoices with a Total greater than 10
# (strict comparison: an invoice whose Total is exactly 10 is not returned)
query1 = "SELECT * FROM invoices WHERE Total > 10;"
df1 = pd.read_sql_query(query1, conn)
df1.head()

Unnamed: 0,InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
0,5,23,2009-01-11 00:00:00,69 Salem Street,Boston,MA,USA,2113.0,13.86
1,12,2,2009-02-11 00:00:00,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174.0,13.86
2,19,40,2009-03-14 00:00:00,"8, Rue Hanovre",Paris,,France,75002.0,13.86
3,26,19,2009-04-14 00:00:00,1 Infinite Loop,Cupertino,CA,USA,95014.0,13.86
4,33,57,2009-05-15 00:00:00,"Calle Lira, 198",Santiago,,Chile,,13.86


### 2. Use != operator

In [45]:
# Keep invoices billed to any country other than the USA.
# Rows whose BillingCountry is NULL are not returned either, because
# NULL != 'USA' does not evaluate to true in SQL.
query2 = "SELECT * FROM invoices WHERE BillingCountry != 'USA';"
df2 = pd.read_sql_query(sql=query2, con=conn)
df2.head(n=5)

Unnamed: 0,InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
0,1,2,2009-01-01 00:00:00,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174,1.98
1,2,4,2009-01-02 00:00:00,Ullevålsveien 14,Oslo,,Norway,0171,3.96
2,3,8,2009-01-03 00:00:00,Grétrystraat 63,Brussels,,Belgium,1000,5.94
3,4,14,2009-01-06 00:00:00,8210 111 ST NW,Edmonton,AB,Canada,T6G 2C7,8.91
4,6,37,2009-01-19 00:00:00,Berger Straße 10,Frankfurt,,Germany,60316,0.99


### 3. Use BETWEEN operator

In [49]:
# Keep invoices whose Total lies in the range 10..15.
# BETWEEN is inclusive at both ends, i.e. Total >= 10 AND Total <= 15.
query3 = "SELECT * FROM invoices WHERE Total BETWEEN 10 AND 15;"
df3 = pd.read_sql_query(sql=query3, con=conn)
df3.head(n=5)

Unnamed: 0,InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
0,5,23,2009-01-11 00:00:00,69 Salem Street,Boston,MA,USA,2113.0,13.86
1,12,2,2009-02-11 00:00:00,Theodor-Heuss-Straße 34,Stuttgart,,Germany,70174.0,13.86
2,19,40,2009-03-14 00:00:00,"8, Rue Hanovre",Paris,,France,75002.0,13.86
3,26,19,2009-04-14 00:00:00,1 Infinite Loop,Cupertino,CA,USA,95014.0,13.86
4,33,57,2009-05-15 00:00:00,"Calle Lira, 198",Santiago,,Chile,,13.86


### 4. Use LIKE operator

In [53]:
# Pattern match on FirstName: '%' stands for any run of characters, so 'A%'
# selects names beginning with the letter A. (SQLite's LIKE ignores ASCII case
# by default, so a leading lowercase 'a' would match as well.)
query4 = "SELECT * FROM customers WHERE FirstName LIKE 'A%';"
df4 = pd.read_sql_query(sql=query4, con=conn)
df4.head(n=5)

Unnamed: 0,CustomerId,FirstName,LastName,Company,Address,City,State,Country,PostalCode,Phone,Fax,Email,SupportRepId
0,7,Astrid,Gruber,,"Rotenturmstraße 4, 1010 Innere Stadt",Vienne,,Austria,1010,+43 01 5134505,,astrid.gruber@apple.at,5
1,11,Alexandre,Rocha,Banco do Brasil S.A.,"Av. Paulista, 2022",São Paulo,SP,Brazil,01310-200,+55 (11) 3055-3278,+55 (11) 3055-8131,alero@uol.com.br,5
2,32,Aaron,Mitchell,,696 Osborne Street,Winnipeg,MB,Canada,R3L 2B9,+1 (204) 452-6452,,aaronmitchell@yahoo.ca,4


### 5. Use IN operator

In [56]:
# Keep employees whose City equals one of the values listed inside IN (...).
query5 = "SELECT * FROM employees WHERE City IN ('Calgary', 'Edmonton');"
df5 = pd.read_sql_query(sql=query5, con=conn)
df5.head(n=5)

Unnamed: 0,EmployeeId,LastName,FirstName,Title,ReportsTo,BirthDate,HireDate,Address,City,State,Country,PostalCode,Phone,Fax,Email
0,1,Adams,Andrew,General Manager,,1962-02-18 00:00:00,2002-08-14 00:00:00,11120 Jasper Ave NW,Edmonton,AB,Canada,T5K 2N1,+1 (780) 428-9482,+1 (780) 428-3457,andrew@chinookcorp.com
1,2,Edwards,Nancy,Sales Manager,1.0,1958-12-08 00:00:00,2002-05-01 00:00:00,825 8 Ave SW,Calgary,AB,Canada,T2P 2T3,+1 (403) 262-3443,+1 (403) 262-3322,nancy@chinookcorp.com
2,3,Peacock,Jane,Sales Support Agent,2.0,1973-08-29 00:00:00,2002-04-01 00:00:00,1111 6 Ave SW,Calgary,AB,Canada,T2P 5M5,+1 (403) 262-3443,+1 (403) 262-6712,jane@chinookcorp.com
3,4,Park,Margaret,Sales Support Agent,2.0,1947-09-19 00:00:00,2003-05-03 00:00:00,683 10 Street SW,Calgary,AB,Canada,T2P 5G3,+1 (403) 263-4423,+1 (403) 263-4289,margaret@chinookcorp.com
4,5,Johnson,Steve,Sales Support Agent,2.0,1965-03-03 00:00:00,2003-10-17 00:00:00,7727B 41 Ave,Calgary,AB,Canada,T3B 1Y7,1 (780) 836-9987,1 (780) 836-9543,steve@chinookcorp.com


## Task 4: Sorting and Limiting

### 1. ORDER BY Total ASC (Ascending)

In [60]:
# All invoices ordered from the smallest Total to the largest;
# only the first five rows of the sorted result are previewed.
query_asc = "SELECT * FROM invoices ORDER BY Total ASC;"
df_asc = pd.read_sql_query(sql=query_asc, con=conn)
df_asc.head(n=5)

Unnamed: 0,InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
0,6,37,2009-01-19 00:00:00,Berger Straße 10,Frankfurt,,Germany,60316,0.99
1,13,16,2009-02-19 00:00:00,1600 Amphitheatre Parkway,Mountain View,CA,USA,94043-1351,0.99
2,20,54,2009-03-22 00:00:00,110 Raeburn Pl,Edinburgh,,United Kingdom,EH4 1HH,0.99
3,27,33,2009-04-22 00:00:00,5112 48 Street,Yellowknife,NT,Canada,X1A 1N6,0.99
4,34,12,2009-05-23 00:00:00,"Praça Pio X, 119",Rio de Janeiro,RJ,Brazil,20040-020,0.99


### 2. ORDER BY Total DESC (Descending)

In [64]:
# All invoices ordered from the largest Total to the smallest;
# only the first five rows of the sorted result are previewed.
query_desc = "SELECT * FROM invoices ORDER BY Total DESC;"
df_desc = pd.read_sql_query(sql=query_desc, con=conn)
df_desc.head(n=5)

Unnamed: 0,InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
0,404,6,2013-11-13 00:00:00,Rilská 3174/6,Prague,,Czech Republic,14300,25.86
1,299,26,2012-08-05 00:00:00,2211 W Berry Street,Fort Worth,TX,USA,76110,23.86
2,96,45,2010-02-18 00:00:00,Erzsébet krt. 58.,Budapest,,Hungary,H-1073,21.86
3,194,46,2011-04-28 00:00:00,3 Chatham Street,Dublin,Dublin,Ireland,,21.86
4,89,7,2010-01-18 00:00:00,"Rotenturmstraße 4, 1010 Innere Stadt",Vienne,,Austria,1010,18.86


### 3. LIMIT – Show Top 5 Invoices by Amount

In [70]:
# Top 5 invoices with highest Total
# Several invoices share the same Total, and SQL leaves the relative order of
# tied rows unspecified. InvoiceId is therefore added as a tie-breaker so that
# the five rows kept by LIMIT are the same on every run.
query_top5 = "SELECT * FROM invoices ORDER BY Total DESC, InvoiceId ASC LIMIT 5;"
df_top5 = pd.read_sql_query(query_top5, conn)
df_top5

Unnamed: 0,InvoiceId,CustomerId,InvoiceDate,BillingAddress,BillingCity,BillingState,BillingCountry,BillingPostalCode,Total
0,404,6,2013-11-13 00:00:00,Rilská 3174/6,Prague,,Czech Republic,14300,25.86
1,299,26,2012-08-05 00:00:00,2211 W Berry Street,Fort Worth,TX,USA,76110,23.86
2,96,45,2010-02-18 00:00:00,Erzsébet krt. 58.,Budapest,,Hungary,H-1073,21.86
3,194,46,2011-04-28 00:00:00,3 Chatham Street,Dublin,Dublin,Ireland,,21.86
4,89,7,2010-01-18 00:00:00,"Rotenturmstraße 4, 1010 Innere Stadt",Vienne,,Austria,1010,18.86


## Task 5: Basic Aggregations

### 1. COUNT Total Records

In [74]:
# COUNT(*) returns the number of rows in 'invoices'; the single value comes
# back in a one-row DataFrame under the alias InvoiceCount.
query_count = "SELECT COUNT(*) AS InvoiceCount FROM invoices;"
df_count = pd.read_sql_query(sql=query_count, con=conn)
df_count

Unnamed: 0,InvoiceCount
0,412


### 2. SUM of Total Sales

In [77]:
# SUM(Total) adds up the Total of every invoice, giving overall sales;
# the result is a one-row DataFrame with the column TotalSales.
query_sum = "SELECT SUM(Total) AS TotalSales FROM invoices;"
df_sum = pd.read_sql_query(sql=query_sum, con=conn)
df_sum

Unnamed: 0,TotalSales
0,2328.6


### 3. AVG of Invoice Totals

In [80]:
# AVG(Total) is the mean invoice Total across all invoices;
# the result is a one-row DataFrame with the column AvgSaleAmount.
query_avg = "SELECT AVG(Total) AS AvgSaleAmount FROM invoices;"
df_avg = pd.read_sql_query(sql=query_avg, con=conn)
df_avg

Unnamed: 0,AvgSaleAmount
0,5.651942


### 4. GROUP BY Country (Total Sales by Region)

In [83]:
# Per-country invoice statistics: one output row per BillingCountry with the
# invoice count, the summed Total and the mean Total, ordered so the country
# with the highest summed Total comes first. Only the top five are previewed.
query_groupby = """
SELECT BillingCountry, COUNT(*) AS NumInvoices, 
       SUM(Total) AS TotalSales, 
       AVG(Total) AS AvgSale
FROM invoices
GROUP BY BillingCountry
ORDER BY TotalSales DESC;
"""
df_groupby = pd.read_sql_query(sql=query_groupby, con=conn)
df_groupby.head(n=5)

Unnamed: 0,BillingCountry,NumInvoices,TotalSales,AvgSale
0,USA,91,523.06,5.747912
1,Canada,56,303.96,5.427857
2,France,35,195.1,5.574286
3,Brazil,35,190.1,5.431429
4,Germany,28,156.48,5.588571
