In [None]:
# Uncomment the line below if sqlalchemy and jupysql are not installed yet.
#!pip install sqlalchemy jupysql

##  SQL vs Pandas

In [None]:
import pandas as pd
import numpy as np
import sqlalchemy
from pathlib import Path
%load_ext sql

In [None]:
# Run this cell to connect to database
# The path is relative to the current working directory.
DB_PATH = Path('starwars.db')

# SQLite silently creates an empty database when the file is missing, which
# would only surface later as confusing "no such table" errors - fail early.
if not DB_PATH.is_file():
    raise FileNotFoundError(f"Database file not found: {DB_PATH.resolve()}")

engine = sqlalchemy.create_engine(f'sqlite:///{DB_PATH}')
connection = engine.connect()

In [None]:
# Hand the SQLAlchemy engine created above to the %sql magic
# (presumably so the %sql / %%sql cells below run against this database).
%sql engine


In [None]:
%%sql
-- List every table recorded in the SQLite schema catalog
SELECT *
FROM sqlite_master
WHERE type = 'table';

In [None]:
%%sql
-- Display the first 5 rows of characters table
-- (SQL comments start with two dashes; a leading hash is not valid SQL)
SELECT *
FROM characters
LIMIT 5;


In [None]:
# Pull the whole characters table into pandas so the same data can be
# queried with both SQL and DataFrame operations.
characters_query = "SELECT * FROM characters"
characters_df = pd.read_sql(characters_query, connection)

characters_df

In [None]:
# Inspect the dtype pandas assigned to each column when reading the table
characters_df.dtypes

##  Let's compare some SQL to Pandas commands

Demo 1 - Select Data (columns)

In [None]:
# Column selection in SQL, naming only the columns to return
%sql SELECT name, species FROM characters

and in Pandas

In [None]:
# Column selection in pandas, keeping every row and only the listed columns
characters_df.loc[:, ['name', 'species']]

Demo 2 - Filtering

In [None]:
%%sql
-- First attempt, comparing the raw height column directly
SELECT *
FROM characters
WHERE species = 'Human'
  AND height > 200

Oops, that didn't work - apparently height is stored as a string, so the `height > 200` condition didn't filter the rows numerically.

One approach is to ask SQL to treat that column as an integer

`CAST()` is a SQL function that converts one data type into another. This is useful when a column is stored as text (TEXT or VARCHAR), but you need to perform numeric operations on it.


In [None]:
%%sql
-- Same filter, but height is converted to an integer before the comparison
SELECT *
FROM characters
WHERE species = 'Human'
  AND CAST(height AS INTEGER) > 200;

In [None]:
# Show the DataFrame again before checking the dtype of height on the pandas side
characters_df

In Pandas we also need to check the type of height - if it wasn't numeric in SQL, it probably isn't in Pandas either.

In [None]:
# Look up the dtype of the height column only
characters_df.dtypes['height']

Convert height to numeric. With `errors='coerce'`, any value that cannot be parsed as a number becomes `NaN`.

In [None]:
# Parse height as numbers; values that cannot be parsed become NaN (errors='coerce')
height_numeric = pd.to_numeric(characters_df['height'], errors='coerce')
characters_df['height'] = height_numeric

# Confirm the column now has a numeric dtype
characters_df['height'].dtype

In [None]:
# Build one boolean mask per condition, then keep the rows where both hold
is_tall = characters_df['height'] > 200
is_human = characters_df['species'] == 'Human'
characters_df[is_tall & is_human]

Demo 3 - aggregate data

Average height by species



In [None]:
%%sql
-- Average height per species.
-- SQLite casts non-numeric text to 0 rather than NULL, which would drag the
-- average down and disagree with the pandas result. Only values that start
-- with a digit are cast, everything else becomes NULL, and AVG ignores NULLs.
SELECT species,
       AVG(CASE WHEN height GLOB '[0-9]*' THEN CAST(height AS INTEGER) END) AS avg_height
FROM characters
GROUP BY species;

And in Pandas

In [None]:
# Group the rows by species, then average the height column within each group
height_by_species = characters_df.groupby('species')['height']
species_avg_height = height_by_species.mean()

species_avg_height

Demo 4 - Joining tables

4.1 Let's join two tables characters and planets

4.2 Let's look for Humans from the desert

In [None]:
%%sql
-- Pair each character with the planet whose name matches their homeworld
SELECT c.name,
       p.name AS homeworld
FROM characters AS c
INNER JOIN planets AS p
    ON c.homeworld = p.name;

In [None]:
%%sql
-- Same join, now also returning species plus planet climate, population and terrain
SELECT c.name,
       c.homeworld,
       c.species,
       p.climate,
       p.population,
       p.terrain
FROM characters AS c
INNER JOIN planets AS p
    ON c.homeworld = p.name;

In [None]:
%%sql
-- Humans whose homeworld terrain is desert (both comparisons ignore case)
SELECT c.name,
       c.homeworld,
       c.species,
       p.climate,
       p.population,
       p.terrain
FROM characters AS c
INNER JOIN planets AS p
    ON c.homeworld = p.name
WHERE LOWER(p.terrain) = 'desert'
  AND LOWER(c.species) = 'human';

In [None]:
# Load the full planets table into its own DataFrame for the pandas join
planets_df = pd.read_sql("SELECT * FROM planets", connection)


In [None]:
# Inner-join characters to planets by matching homeworld to the planet name.
# Both frames have a "name" column, so pandas suffixes them as name_x / name_y.
character_columns = characters_df[['name', 'species', 'homeworld']]
charactersXplanets = character_columns.merge(
    planets_df,
    how='inner',
    left_on='homeworld',
    right_on='name',
)
charactersXplanets

Get a subset of the humans from the desert from merged data


In [None]:
# Compare case-insensitively so this matches the SQL query, which wraps both
# columns in LOWER(). Missing values compare as False and are filtered out.
is_human = charactersXplanets['species'].str.lower() == 'human'
is_desert = charactersXplanets['terrain'].str.lower() == 'desert'
charactersXplanets[is_human & is_desert]
