In [21]:
import pandas as pd
from sqlalchemy import create_engine
import numpy as np
import json
import requests
from io import StringIO
from azure.storage.blob import BlobServiceClient, BlobClient, ContainerClient
from math import ceil
import datetime
import calendar

In [2]:
# Load the CSV file
df = pd.read_csv('/Users/nicholasnoto/Desktop/CIS 4400/Group Project/cleaned_data.csv')

# Print the columns of the DataFrame
print("Columns in CSV file:", df.columns.tolist())

# Manually list the columns you expect based on your schema
expected_columns = ['school_id', 'schoolrating', 'schoolname']

# Check if the expected columns match the CSV columns
if all(column in df.columns for column in expected_columns):
    print("The CSV matches the expected schema!")
else:
    print("The CSV does not match the expected schema. Missing columns:", set(expected_columns) - set(df.columns))


Columns in CSV file: ['Name', 'City', 'State', 'Zip', 'County', 'IsPrivate', 'Level', 'LowGrade', 'HighGrade', 'RankStars', 'PupilTeacherRatio']
The CSV does not match the expected schema. Missing columns: {'school_id', 'schoolrating', 'schoolname'}


In [3]:
# Load the CSV file
df = pd.read_csv('/Users/nicholasnoto/Desktop/CIS 4400/Group Project/cleaned_data.csv')

# Rename the columns according to the schema mappings
df.rename(columns={'Name': 'schoolname', 'RankStars': 'schoolrating'}, inplace=True)

# Assuming you need a unique 'school_id' for each school
# Here, we create 'school_id' as a new column from index (for simplicity)
df['school_id'] = range(1, len(df) + 1)

# Select only the columns needed for the schema
df = df[['school_id', 'schoolrating', 'schoolname',]]

# Optionally, save the adjusted DataFrame to a new CSV for upload or verification
df.to_csv('/Users/nicholasnoto/Desktop/CIS 4400/Group Project/cleaned_schooldata.csv', index=False)

# Print the updated DataFrame to verify the adjustments
print(df.head())


   school_id schoolrating                                        schoolname
0          1            1                  Central Middle School of Science
1          2            1                                Chinook Elementary
2          3          NaN                 Family Partnership Charter School
3          4          NaN  Alaska State School for Deaf and Hard of Hearing
4          5          NaN                             Ursa Major Elementary


In [4]:
# Load the CSV file
df = pd.read_csv('/Users/nicholasnoto/Desktop/CIS 4400/Group Project/cleaned_schooldata.csv')

# Display the data types in the DataFrame
print("Original Data Types:")
print(df.dtypes)

# Convert data types to match the schema
df['school_id'] = pd.to_numeric(df['school_id'], errors='coerce')
df['schoolrating'] = pd.to_numeric(df['schoolrating'], errors='coerce', downcast='float')
df['schoolname'] = df['schoolname'].astype(str)

# Check and display the data types after conversion
print("\nData Types after Conversion:")
print(df.dtypes)

# Optionally, check for any NaN values that may have resulted from improper conversions
print("\nNaN Values Check:")
print(df.isna().sum())

# Ensure all entries for school_id and schoolrating are numeric and not NaN
# This is crucial because the database expects all these fields to be filled with numeric values
df.dropna(subset=['school_id', 'schoolrating'], inplace=True)

Original Data Types:
school_id        int64
schoolrating    object
schoolname      object
dtype: object

Data Types after Conversion:
school_id         int64
schoolrating    float32
schoolname       object
dtype: object

NaN Values Check:
school_id           0
schoolrating    13858
schoolname          0
dtype: int64


In [5]:
# Ensure schoolrating has at most 5 decimal places
df['schoolrating'] = df['schoolrating'].round(5)

# Print the first few rows to check the data
print(df.head())


   school_id  schoolrating                        schoolname
0          1           1.0  Central Middle School of Science
1          2           1.0                Chinook Elementary
5          6           4.0                 Aurora Elementary
6          7           1.0            Abbott Loop Elementary
7          8           1.0        Airport Heights Elementary


In [6]:
# Assuming 'df' is your updated DataFrame
df.to_csv('/Users/nicholasnoto/Desktop/CIS 4400/Group Project/cleaned_copy_schooldata.csv', index=False)

print("DataFrame has been saved to CSV successfully.")


DataFrame has been saved to CSV successfully.


In [12]:
# Database credentials and connection details
DATABASE_TYPE = 'postgresql'
DBAPI = 'psycopg2'
HOST = 'proj9.postgres.database.azure.com'
USER = 'group8'
PASSWORD = 'project9!'
DATABASE = 'postgres'  # Adjust if your actual database name is different
PORT = 5432

# Connection string for SQLAlchemy
DATABASE_CONNECTION = f"{DATABASE_TYPE}+{DBAPI}://{USER}:{PASSWORD}@{HOST}:{PORT}/{DATABASE}"

# Create a database engine
engine = create_engine(DATABASE_CONNECTION, echo=True)

# Load your DataFrame from the CSV
df = pd.read_csv('/Users/nicholasnoto/Desktop/CIS 4400/Group Project/cleaned_copy_schooldata.csv')

# Check that the DataFrame loads correctly
print("DataFrame loaded successfully:")
print(df.head())

DataFrame loaded successfully:
   school_id  schoolrating                        schoolname
0          1           1.0  Central Middle School of Science
1          2           1.0                Chinook Elementary
2          6           4.0                 Aurora Elementary
3          7           1.0            Abbott Loop Elementary
4          8           1.0        Airport Heights Elementary


In [13]:
# Insert the DataFrame into the Azure PostgreSQL database
# 'dim_schoolrating' is the table name as confirmed from DataGrip
df.to_sql('dim_schoolrating', con=engine, index=False, if_exists='append', method='multi')

2024-05-06 22:21:18,836 INFO sqlalchemy.engine.Engine select pg_catalog.version()
2024-05-06 22:21:18,838 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-06 22:21:18,896 INFO sqlalchemy.engine.Engine select current_schema()
2024-05-06 22:21:18,898 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-06 22:21:18,969 INFO sqlalchemy.engine.Engine show standard_conforming_strings
2024-05-06 22:21:18,970 INFO sqlalchemy.engine.Engine [raw sql] {}
2024-05-06 22:21:19,033 INFO sqlalchemy.engine.Engine select relname from pg_class c join pg_namespace n on n.oid=c.relnamespace where pg_catalog.pg_table_is_visible(c.oid) and relname=%(name)s
2024-05-06 22:21:19,035 INFO sqlalchemy.engine.Engine [generated in 0.00180s] {'name': 'dim_schoolrating'}
2024-05-06 22:21:19,112 INFO sqlalchemy.engine.Engine BEGIN (implicit)
2024-05-06 22:21:19,115 INFO sqlalchemy.engine.Engine 
CREATE TABLE dim_schoolrating (
	school_id BIGINT, 
	schoolrating FLOAT(53), 
	schoolname TEXT
)


2024-05-06 22:21:19,116 IN

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



2024-05-06 22:21:21,123 INFO sqlalchemy.engine.Engine COMMIT


20257