In [None]:
# storage.ipynb - Save with specified file names and validate

# Import libraries
import pandas as pd
import sqlite3

# Load the cleaned CSV
df2 = pd.read_csv('df2_clean.csv')

# Relational Storage: SQLite
sqlite_db_path = 'rankings.db'  # save as requested
conn = sqlite3.connect(sqlite_db_path)
try:
    # if_exists='replace' drops any existing table first, so re-running this
    # cell overwrites the previous contents instead of appending to them.
    df2.to_sql('university_rankings', conn, if_exists='replace', index=False)
finally:
    # Close the connection even when the write fails so the handle is not leaked.
    conn.close()
print("Data stored in SQLite database (rankings.db).")

# Big Data / Analytical Storage: Parquet
parquet_path = 'rankings.parquet'  # save as requested
# index=False keeps the DataFrame index out of the file, matching the SQLite copy.
df2.to_parquet(parquet_path, engine='pyarrow', index=False)
print("Data stored in Parquet format (rankings.parquet).")

# Validation function
def validate_storage(df_original, sqlite_db=sqlite_db_path, parquet_file=parquet_path):
    """Check that the SQLite and Parquet copies both match ``df_original``.

    Reads the ``university_rankings`` table back from ``sqlite_db`` and the
    Parquet file at ``parquet_file``, compares each against ``df_original``
    with ``DataFrame.equals``, and prints the result of each comparison.

    Note that ``DataFrame.equals`` also compares column dtypes, so a column
    whose type changes during the round trip is reported as a mismatch even
    if the values look the same.

    Args:
        df_original: The DataFrame that was written to both stores.
        sqlite_db: Path to the SQLite database file to read back.
        parquet_file: Path to the Parquet file to read back.

    Returns:
        True only if both read-back DataFrames equal ``df_original``.
    """
    # Read back from SQLite
    conn = sqlite3.connect(sqlite_db)
    try:
        df_sqlite = pd.read_sql('SELECT * FROM university_rankings', conn)
    finally:
        # Always release the connection, even if the query fails
        # (for example when the table does not exist).
        conn.close()

    # Read back from Parquet
    df_parquet = pd.read_parquet(parquet_file, engine='pyarrow')

    # Check equality
    sqlite_check = df_original.equals(df_sqlite)
    parquet_check = df_original.equals(df_parquet)

    print(f"SQLite Integrity Check: {sqlite_check}")
    print(f"Parquet Integrity Check: {parquet_check}")

    return sqlite_check and parquet_check

# Verify both stored copies against the in-memory data and report the outcome
is_valid = validate_storage(df2)
print(
    "Data successfully stored and verified in both rankings.db and rankings.parquet!"
    if is_valid
    else "Data mismatch detected!"
)