In [22]:
pip install pandas mysql-connector-python

Note: you may need to restart the kernel to use updated packages.


In [23]:
import pandas as pd
import re
from data201 import db_connection
from mysql.connector import errorcode

In [24]:
conn = db_connection(config_file = 'joinforce.ini')
cursor = conn.cursor()

In [25]:
cursor.execute('DROP TABLE IF EXISTS aq_pollutant;')
cursor.execute('DROP TABLE IF EXISTS air_quality;')
cursor.execute('DROP TABLE IF EXISTS pollutant;')
cursor.execute('DROP TABLE IF EXISTS location;')
#cursor.execute('DROP TABLE IF EXISTS district;')

In [26]:
# Create District table (One-to-One relationship with Location)
# Primary key: district_id

sql = ( """
    CREATE TABLE IF NOT EXISTS district (
        district_id INT NOT NULL,
        district_name VARCHAR(100) NOT NULL,
        PRIMARY KEY (district_id)
    )""")

cursor.execute(sql);

In [27]:
# Create Location table (One-to-One relationship with District)
#doesn't work if district_id is used as primary key for multiple tables:https://stackoverflow.com/questions/55631622/can-i-use-one-same-primary-key-in-two-different-tables
# Primary key: location_id
# Foreign key: district_id
    
sql = ( """
    CREATE TABLE location (
        location_id INT NOT NULL,
        longitude FLOAT,
        latitude FLOAT,
        district_id INT NOT NULL,
        PRIMARY KEY (location_id),
        CONSTRAINT fk_location_district FOREIGN KEY (district_id) REFERENCES district(district_id)
    )""")

cursor.execute(sql);

In [28]:
# Create Pollutant table(one-to-many relationship with District))
# Primary key: pollutant_id

sql = ("""
    CREATE TABLE IF NOT EXISTS pollutant (
        pollutant_id INT NOT NULL,
        measure VARCHAR(50) NOT NULL,
        PRIMARY KEY (pollutant_id)
    )""")

cursor.execute(sql);

In [29]:
# Create AirQuality table (Many-to-Many relationship between AirQuality and Pollutant)
# Primary key: aq_id
# Foreign key: district_id

sql = ("""
    CREATE TABLE IF NOT EXISTS air_quality (
        aq_id VARCHAR(70) NOT NULL,
        date DATE NOT NULL,
        measure VARCHAR(50) NOT NULL,
        data_value FLOAT NOT NULL,
        district_id INT NOT NULL,
        PRIMARY KEY (aq_id),
        CONSTRAINT fk_airquality_district FOREIGN KEY (district_id) REFERENCES district(district_id)
    )""")

cursor.execute(sql);

In [30]:
# Create AQPollutant table 
# Primary keys: aq_id, pollutant_id
# Foreign key: aq_id → References air_quality(aq_id), pollutant_id → References pollutant(pollutant_id)

sql = ("""
    CREATE TABLE IF NOT EXISTS aq_pollutant (
        pollutant_id INT NOT NULL,
        aq_id VARCHAR(70) NOT NULL,
        PRIMARY KEY (pollutant_id, aq_id),
        CONSTRAINT fk_aqpollutant_pollutant FOREIGN KEY (pollutant_id) REFERENCES pollutant(pollutant_id),
        CONSTRAINT fk_aqpollutant_airquality FOREIGN KEY (aq_id) REFERENCES air_quality(aq_id)
    )""")

cursor.execute(sql)

In [31]:

sql = ("""
      CREATE TABLE IF NOT EXISTS air_quality_category (
      aq_id VARCHAR(70) NOT NULL,
      date DATE NOT NULL,
      measure VARCHAR(50) NOT NULL,
      data_value FLOAT NOT NULL,
      district_id INT NOT NULL,
      air_quality_category VARCHAR(50), -- The new column for transformed data
      PRIMARY KEY (aq_id),
      CONSTRAINT fk_airquality_category_district FOREIGN KEY (district_id) REFERENCES district(district_id)
    )""")

cursor.execute(sql)
cursor.close()
conn.close()
print("MySQL connection is closed.")

MySQL connection is closed.


In [32]:
#Insert data into the district table
try:
    conn = db_connection(config_file='joinforce.ini')
    cursor = conn.cursor()

    sql = """
        INSERT INTO joinforce_db.district (district_id, district_name) VALUES
            (1, 'San Jose'),
            (2, 'Los Angeles'),
            (3, 'New York'),
            (4, 'Chicago'),
            (5, 'Houston')
        ON DUPLICATE KEY UPDATE district_name = VALUES(district_name);
    """
    
    cursor.execute(sql)
    conn.commit()
    print("Data inserted successfully!")

except Exception as e:
    print(f"Error inserting data: {e}")

finally:
    cursor.close()
    conn.close()


Data inserted successfully!


In [33]:
#Insert data into the location table
try:
    conn = db_connection(config_file='joinforce.ini')
    cursor = conn.cursor()

    sql = """
        INSERT INTO joinforce_db.location (location_id, longitude, latitude, district_id) VALUES
            (1, -121.8863, 37.3382, 1),
            (2, -118.2437, 34.0522, 2),
            (3, -74.0060, 40.7128, 3),
            (4, -87.6298, 41.8781, 4),
            (5, -95.3698, 29.7604, 5)
        ON DUPLICATE KEY UPDATE longitude = VALUES(longitude), latitude = VALUES(latitude);
    """

    cursor.execute(sql)
    conn.commit()
    print("Data inserted successfully into location table!")

except Exception as e:
    print(f"Error inserting data into location: {e}")

finally:
    cursor.close()
    conn.close()


Data inserted successfully into location table!


In [34]:
#Insert data into the pollutant table
try:
    conn = db_connection(config_file='joinforce.ini')
    cursor = conn.cursor()

    sql = """
        INSERT INTO joinforce_db.pollutant (pollutant_id, measure) VALUES
            (1, 'PM2.5'),
            (2, 'PM10'),
            (3, 'CO'),
            (4, 'NO2'),
            (5, 'SO2')
        ON DUPLICATE KEY UPDATE measure = VALUES(measure);
    """

    cursor.execute(sql)
    conn.commit()
    print("Data inserted successfully into pollutant table!")

except Exception as e:
    print(f"Error inserting data into pollutant: {e}")

finally:
    cursor.close()
    conn.close()


Data inserted successfully into pollutant table!


In [35]:
#Insert data into the air_quality table
try:
    conn = db_connection(config_file='joinforce.ini')
    cursor = conn.cursor()

    sql = """
        INSERT INTO joinforce_db.air_quality (aq_id, date, measure, data_value, district_id) VALUES
            ('AQ1', '2025-02-14', 'PM2.5', 35.6, 1),
            ('AQ2', '2025-02-14', 'PM10', 50.2,2),
            ('AQ3', '2025-02-14', 'CO', 0.8, 3),
            ('AQ4', '2025-02-14', 'NO2', 25.5, 4),
            ('AQ5', '2025-02-14', 'SO2', 15.0, 5);
        
    """

    cursor.execute(sql)
    conn.commit()
    print("Data inserted successfully into air_quality table!")

except Exception as e:
    print(f"Error inserting data into air_quality: {e}")

finally:
    cursor.close()
    conn.close()


Data inserted successfully into air_quality table!


In [36]:
#Insert data into the aq_pollutant table
try:
    conn = db_connection(config_file='joinforce.ini')
    cursor = conn.cursor()

    sql = """
        INSERT INTO joinforce_db.aq_pollutant (pollutant_id, aq_id) VALUES
            (1, 'AQ1'),
            (2, 'AQ2'),
            (3, 'AQ3'),
            (4, 'AQ4'),
            (5, 'AQ5')
        ON DUPLICATE KEY UPDATE pollutant_id = VALUES(pollutant_id);
    """

    cursor.execute(sql)
    conn.commit()
    print("Data inserted successfully into aq_pollutant table!")

except Exception as e:
    print(f"Error inserting data into aq_pollutant: {e}")

finally:
    cursor.close()
    conn.close()


Data inserted successfully into aq_pollutant table!


In [37]:
#INSERT INTO SELECT with CASE
try:
    conn = db_connection(config_file='joinforce.ini')
    cursor = conn.cursor()

    sql = """
        INSERT INTO air_quality_category (aq_id, date, measure, data_value, district_id, air_quality_category)
SELECT 
    aq.aq_id,
    aq.date,
    aq.measure,
    aq.data_value,
    aq.district_id,
    CASE
        WHEN aq.data_value < 20 THEN 'Excellent'
        WHEN aq.data_value >= 20 AND aq.data_value <= 50 THEN 'Fair'
        ELSE 'Unhealthy'
    END AS air_quality_category
FROM air_quality aq;
    """

    cursor.execute(sql)
    conn.commit()
    print("Data inserted successfully into air_quality table!")

except Exception as e:
    print(f"Error inserting data into air_quality: {e}")

finally:
    cursor.close()
    conn.close()


Error inserting data into air_quality: 1062 (23000): Duplicate entry 'AQ1' for key 'air_quality_category.PRIMARY'


In [38]:
# Reconnect to DB
conn = db_connection(config_file='joinforce.ini')

# Query to fetch the rows of every table
try:
    if conn.is_connected():
        cursor = conn.cursor()

        tables = ["district", "location", "pollutant", "air_quality", "aq_pollutant","air_quality_category"]
        dataframes = {}

        for table in tables:
            query = f"SELECT * FROM {table};"
            cursor.execute(query)
            rows = cursor.fetchall()

            # Get column names from cursor description
            column_names = [desc[0] for desc in cursor.description]

            # Create a pandas DataFrame for each table
            df = pd.DataFrame(rows, columns=column_names)
            dataframes[table] = df

            # Print DataFrame
            print(f"\n{table.upper()} TABLE:")
            print(df)

except Exception as e:
    print(f"Error while executing queries: {e}")

finally:
    cursor.close()
    conn.close()
    print("MySQL connection is closed.")


DISTRICT TABLE:
   district_id district_name
0            1      San Jose
1            2   Los Angeles
2            3      New York
3            4       Chicago
4            5       Houston

LOCATION TABLE:
   location_id  longitude  latitude  district_id
0            1  -121.8860   37.3382            1
1            2  -118.2440   34.0522            2
2            3   -74.0060   40.7128            3
3            4   -87.6298   41.8781            4
4            5   -95.3698   29.7604            5

POLLUTANT TABLE:
   pollutant_id measure
0             1   PM2.5
1             2    PM10
2             3      CO
3             4     NO2
4             5     SO2

AIR_QUALITY TABLE:
  aq_id        date measure  data_value  district_id
0   AQ1  2025-02-14   PM2.5        35.6            1
1   AQ2  2025-02-14    PM10        50.2            2
2   AQ3  2025-02-14      CO         0.8            3
3   AQ4  2025-02-14     NO2        25.5            4
4   AQ5  2025-02-14     SO2        15.0            

Doing Join Operations on above Tables as below:

In [39]:
# Doing Left Outer Join, Right Outer Join, and Full Outer Join
queries = {
     #Display the region ID, region name, and number of stores in the region for all regions
    "1. Left Outer Join": """
        SELECT d.district_name,l.location_id, l.longitude, l.latitude
        FROM joinforce_db.district d
        LEFT OUTER JOIN joinforce_db.location l
        ON d.district_id= l.district_id;
    """,
    "2. Right Outer Join": """
        SELECT d.district_name, a.measure, a.data_value
        FROM joinforce_db.district d
        Right OUTER JOIN joinforce_db.air_quality a
        ON d.district_id= a.district_id;
    """,
     "3. Full Outer Join": """
        SELECT a.aq_id, p.measure
        FROM joinforce_db.pollutant p
        Right OUTER JOIN joinforce_db.aq_pollutant a
        ON p.pollutant_id= a.pollutant_id;
    """
}
# Reconnect to DB
conn = db_connection(config_file='joinforce.ini')
try:
    if conn.is_connected():
        cursor = conn.cursor()

        for query_name, sql_query in queries.items():
            cursor.execute(sql_query)
            rows = cursor.fetchall()

            # Get column names from cursor description
            column_names = [desc[0] for desc in cursor.description]

            # Create DataFrame
            df_result = pd.DataFrame(rows, columns=column_names)

            # Display DataFrame
            print(f"\n {query_name} Results:")
            print(df_result.head(40))  # Show first 40 rows

except Exception as e:
    print(f"Error while executing queries: {e}")

finally:
    cursor.close()
    conn.close()
    print("MySQL connection is closed.")


 1. Left Outer Join Results:
  district_name  location_id  longitude  latitude
0      San Jose            1  -121.8860   37.3382
1   Los Angeles            2  -118.2440   34.0522
2      New York            3   -74.0060   40.7128
3       Chicago            4   -87.6298   41.8781
4       Houston            5   -95.3698   29.7604

 2. Right Outer Join Results:
  district_name measure  data_value
0      San Jose   PM2.5        35.6
1   Los Angeles    PM10        50.2
2      New York      CO         0.8
3       Chicago     NO2        25.5
4       Houston     SO2        15.0

 3. Full Outer Join Results:
  aq_id measure
0   AQ1   PM2.5
1   AQ2    PM10
2   AQ3      CO
3   AQ4     NO2
4   AQ5     SO2
MySQL connection is closed.
