In [1]:
import json
import os
from getpass import getpass

import mysql.connector
import pandas as pd
from pymongo import MongoClient

In [None]:
# Healthcare schema
#
# Reference description of the relational tables. Each table maps to a dict with:
#   "pk"  -> list of primary-key column names
#   "fk"  -> {referenced table: key column name}
#   every other key -> a column name mapped to its MySQL column type

SQL = {
  "hospitals": {
    "pk": ["admissionid"],
    "fk": {
      "patients": "patientid",
      "insurance": "insuranceid"
    },
    "admissionid": "INT",
    "patientid": "INT",
    "insuranceid": "INT",
    "doctor": "VARCHAR(255)",
    "hospitalname": "VARCHAR(255)",
    "intakedate": "DATE",
    "dischargedate": "DATE",
    "roomnumber": "INT",
    "carelevel": "VARCHAR(50)",
    "testresults": "VARCHAR(50)"
  },

  "patients": {
    "pk": ["patientid"],
    "fk": {},
    "patientid": "INT",
    "patientname": "VARCHAR(255)",
    "age": "INT",
    "gender": "VARCHAR(10)",
    "bloodtype": "VARCHAR(5)",
    "disease": "VARCHAR(255)"
  },

  "insurance": {
    "pk": ["insuranceid"],
    "fk": {
      "patients": "patientid"
    },
    "insuranceid": "INT",
    "patientid": "INT",
    "insuranceprovider": "VARCHAR(255)",
    "billingcost": "DECIMAL(10, 2)",
    "benefit": "VARCHAR(50)"
  },

  "prescriptions": {
    "pk": ["prescriptionid"],
    "fk": {
      "patients": "patientid"
    },
    # Prescription ids in prescriptions_table.csv are strings such as 'P0001',
    # so an INT column cannot hold them.
    "prescriptionid": "VARCHAR(50)",
    "patientid": "INT",
    "medication": "VARCHAR(255)",
    "drug_quantity": "VARCHAR(50)"
  }
}


In [58]:
# Accumulates one entry per table as insert_csv_to_mysql creates it.
new_schema = {}

In [34]:
def show_tables_and_primary_keys(db_config):
    """List every table in the configured database together with its primary key.

    One line per table is printed, and the same information is returned.

    Args:
        db_config: Keyword arguments for ``mysql.connector.connect``. It must
            contain a ``'database'`` entry, which selects the schema to inspect.

    Returns:
        A list of ``(table_name, primary_keys)`` tuples ordered by table name.
        ``primary_keys`` is a ``', '``-separated string of column names, or
        ``None`` for a table without a primary key. When connecting or querying
        fails with a ``mysql.connector.Error`` the error is printed (not raised)
        and whatever was collected so far - normally an empty list - is returned.
    """
    # Bound before the try block so that both the `finally` clause and the
    # final `return` are safe even when connect() itself raises.
    conn = None
    cursor = None
    pk = []

    try:
        # Establish a connection to the MySQL database
        conn = mysql.connector.connect(**db_config)
        cursor = conn.cursor()

        # Query to retrieve tables and their primary keys. The LEFT JOINs keep
        # tables that have no PRIMARY KEY constraint (primary_keys is NULL).
        query = """
        SELECT
            tab.table_name,
            GROUP_CONCAT(kcu.column_name ORDER BY kcu.ordinal_position SEPARATOR ', ') AS primary_keys
        FROM
            information_schema.tables tab
        LEFT JOIN
            information_schema.table_constraints tco
            ON tab.table_schema = tco.table_schema
            AND tab.table_name = tco.table_name
            AND tco.constraint_type = 'PRIMARY KEY'
        LEFT JOIN
            information_schema.key_column_usage kcu
            ON tco.constraint_schema = kcu.constraint_schema
            AND tco.constraint_name = kcu.constraint_name
            AND tco.table_name = kcu.table_name
        WHERE
            tab.table_schema = %s
        GROUP BY
            tab.table_schema,
            tab.table_name
        ORDER BY
            tab.table_name;
        """

        # Execute the query
        cursor.execute(query, (db_config['database'],))

        # Fetch all results
        results = cursor.fetchall()

        # Print the results
        print(f"Tables and Primary Keys in database '{db_config['database']}':")
        for table_name, primary_keys in results:
            pk.append((table_name, primary_keys))
            if primary_keys:
                print(f"Table: {table_name}, Primary Key(s): {primary_keys}")
            else:
                print(f"Table: {table_name}, No Primary Key")

    except mysql.connector.Error as err:
        print(f"Error: {err}")
    finally:
        # Release whatever was actually opened
        if cursor is not None:
            cursor.close()
        if conn is not None and conn.is_connected():
            conn.close()

    return pk
# Usage example:
# db_config = {
#     'host': 'localhost',
#     'user': 'your_username',
#     'password': 'your_password',
#     'database': 'your_database_name'
# }
# show_tables_and_primary_keys(db_config)

In [None]:
import pandas as pd
import mysql.connector
import re

def normalize_column_names(df):
    """Lower-case the frame's column labels and replace non-word runs with '_'.

    The labels are rewritten on ``df`` itself, and that same frame is returned.
    A whitespace run and an adjacent punctuation run are replaced separately,
    so ``"a - b"`` becomes ``"a__b"``.
    """
    separator_run = re.compile(r'\s+|\W+')
    df.columns = [separator_run.sub('_', label.lower()) for label in df.columns]
    return df

def infer_mysql_datatype(dtype):
    """Return the MySQL column type used for a pandas/NumPy dtype.

    Integer, float, boolean and datetime dtypes map to INT, FLOAT, BOOLEAN and
    DATETIME respectively; anything else falls back to VARCHAR(255).
    """
    # Checked in order; the first predicate that accepts the dtype wins.
    type_checks = (
        (pd.api.types.is_integer_dtype, 'INT'),
        (pd.api.types.is_float_dtype, 'FLOAT'),
        (pd.api.types.is_bool_dtype, 'BOOLEAN'),
        (pd.api.types.is_datetime64_any_dtype, 'DATETIME'),
    )
    for accepts, mysql_type in type_checks:
        if accepts(dtype):
            return mysql_type
    return 'VARCHAR(255)'

# NOTE(review): insert_csv_to_mysql is defined again in a later cell, and that
# later definition shadows this one after Run All. Consider keeping one copy.
def insert_csv_to_mysql(csv_file, db_config, table_name, db_schema):
    """Load a CSV file into a MySQL table, creating the table interactively if needed.

    Column names are normalised with ``normalize_column_names``. When the table
    does not exist yet, column types are inferred from the DataFrame dtypes and
    the user is prompted (via ``input``) for the primary key and any foreign
    keys before ``CREATE TABLE`` is issued. Every CSV row is then inserted and
    the transaction is committed.

    Args:
        csv_file: Path of the CSV file to load.
        db_config: Keyword arguments for ``mysql.connector.connect``.
        table_name: Target table. It is interpolated into the SQL text as an
            identifier, so it must come from a trusted source.
        db_schema: Dict updated in place. When this call creates the table it
            gains ``db_schema[table_name]`` holding the column types, a ``"pk"``
            list and an ``"fk"`` dict mapping each referenced table to its
            primary-key column(s). It is left untouched when the table already
            exists or when table creation fails.

    Raises:
        ValueError: If a menu answer is not an integer or is out of range.
        mysql.connector.Error: If connecting or any SQL statement fails.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    df = normalize_column_names(df)

    # Establish a connection to the MySQL database
    conn = mysql.connector.connect(**db_config)
    cursor = None
    try:
        cursor = conn.cursor()

        # Check if the table exists. The name is passed as a parameter so a
        # quote inside table_name cannot break out of the string literal.
        cursor.execute("SHOW TABLES LIKE %s", (table_name,))
        table_exists = cursor.fetchone()

        if not table_exists:
            columns = df.columns.tolist()
            sql_types = {col: infer_mysql_datatype(dtype) for col, dtype in df.dtypes.items()}
            column_definitions = [f"{col} {sql_types[col]}" for col in columns]
            # Built locally and only published to db_schema once CREATE TABLE
            # has succeeded, so a failed run leaves no stale schema entry.
            table_schema = {**sql_types}

            # Ask user to select primary key (entry 0 is the synthetic id column)
            pk_columns = ["Create new auto-increment primary key"] + columns
            print("Select the primary key column:")
            for i, col in enumerate(pk_columns):
                print(f"{i}. {col}")
            pk_choice = int(input("Enter the number of the primary key column: "))
            if not 0 <= pk_choice < len(pk_columns):
                raise ValueError(
                    f"primary key choice must be between 0 and {len(pk_columns) - 1}, got {pk_choice}"
                )

            if pk_choice == 0:
                column_definitions.insert(0, "id INT AUTO_INCREMENT PRIMARY KEY")
                table_schema["pk"] = ["id"]
            else:
                # Menu numbers are shifted by one relative to `columns`
                table_schema["pk"] = [columns[pk_choice - 1]]
                column_definitions[pk_choice - 1] += " PRIMARY KEY"

            # Ask user if they want to add a foreign key
            table_schema["fk"] = {}
            fk_definition = ""
            add_fk = input("Do you want to add a foreign key? (y/n): ").lower() == 'y'
            if add_fk:
                fk_num = int(input("how many foreign keys present?"))
                print("select foreign table and its primary key")
                primary_keys = show_tables_and_primary_keys(db_config)
                for i, table in enumerate(primary_keys):
                    print(f"{i}. {table}")

                for _ in range(fk_num):
                    fk_choice = int(input("Enter the number of the table and pk associated: "))
                    if not 0 <= fk_choice < len(primary_keys):
                        raise ValueError(
                            f"foreign table choice must be between 0 and {len(primary_keys) - 1}, got {fk_choice}"
                        )
                    fk_table, fk_ref_column = primary_keys[fk_choice]
                    table_schema["fk"][fk_table] = fk_ref_column
                    fk_column = input("Enter the name of the foreign key column in current table: ")
                    fk_definition += f", FOREIGN KEY ({fk_column}) REFERENCES {fk_table}({fk_ref_column})"

            # Create table SQL
            create_table_sql = f"""
            CREATE TABLE {table_name} (
                {', '.join(column_definitions)}
                {fk_definition}
            )
            """
            cursor.execute(create_table_sql)
            db_schema[table_name] = table_schema
            print(f"Table {table_name} created successfully.")

        # The statement text is the same for every row, so build it once
        column_list = ', '.join(df.columns)
        placeholders = ', '.join(['%s'] * len(df.columns))
        insert_sql = f"INSERT INTO {table_name} ({column_list}) VALUES ({placeholders})"

        # Insert DataFrame records one by one; missing cells (NaN) become NULL
        for row in df.itertuples(index=False, name=None):
            cursor.execute(insert_sql, tuple(None if pd.isna(value) else value for value in row))

        # Commit the transaction
        conn.commit()
    finally:
        # Always release the cursor and connection, including on failure
        if cursor is not None:
            cursor.close()
        conn.close()

    print(f"Data inserted into {table_name} successfully.")

In [68]:
# NOTE(review): this redefines the insert_csv_to_mysql from an earlier cell and
# is the definition in effect after Run All. Consider keeping one copy.
def insert_csv_to_mysql(csv_file, db_config, table_name, db_schema):
    """Load a CSV file into a MySQL table, creating the table interactively if needed.

    Column names are normalised with ``normalize_column_names``. When the table
    does not exist yet, column types are inferred from the DataFrame dtypes and
    the user is prompted (via ``input``) for the primary key and any foreign
    keys before ``CREATE TABLE`` is issued. Every CSV row is then inserted and
    the transaction is committed.

    Args:
        csv_file: Path of the CSV file to load.
        db_config: Keyword arguments for ``mysql.connector.connect``.
        table_name: Target table. It is interpolated into the SQL text as an
            identifier, so it must come from a trusted source.
        db_schema: Dict updated in place. When this call creates the table it
            gains ``db_schema[table_name]`` holding the column types, a ``"pk"``
            list and an ``"fk"`` dict mapping each referenced table to its
            primary-key column(s). It is left untouched when the table already
            exists or when table creation fails.

    Raises:
        ValueError: If a menu answer is not an integer or is out of range.
        mysql.connector.Error: If connecting or any SQL statement fails.
    """
    # Read the CSV file into a DataFrame
    df = pd.read_csv(csv_file)
    df = normalize_column_names(df)

    # Establish a connection to the MySQL database
    conn = mysql.connector.connect(**db_config)
    cursor = None
    try:
        cursor = conn.cursor()

        # Check if the table exists. The name is passed as a parameter so a
        # quote inside table_name cannot break out of the string literal.
        cursor.execute("SHOW TABLES LIKE %s", (table_name,))
        table_exists = cursor.fetchone()

        if not table_exists:
            columns = df.columns.tolist()
            sql_types = {col: infer_mysql_datatype(dtype) for col, dtype in df.dtypes.items()}
            column_definitions = [f"{col} {sql_types[col]}" for col in columns]
            # Built locally and only published to db_schema once CREATE TABLE
            # has succeeded, so a failed run leaves no stale schema entry.
            table_schema = {**sql_types}

            # Ask user to select primary key (entry 0 is the synthetic id column)
            pk_columns = ["Create new auto-increment primary key"] + columns
            print("Select the primary key column:")
            for i, col in enumerate(pk_columns):
                print(f"{i}. {col}")
            pk_choice = int(input("Enter the number of the primary key column: "))
            if not 0 <= pk_choice < len(pk_columns):
                raise ValueError(
                    f"primary key choice must be between 0 and {len(pk_columns) - 1}, got {pk_choice}"
                )

            if pk_choice == 0:
                column_definitions.insert(0, "id INT AUTO_INCREMENT PRIMARY KEY")
                table_schema["pk"] = ["id"]
            else:
                # Menu numbers are shifted by one relative to `columns`
                table_schema["pk"] = [columns[pk_choice - 1]]
                column_definitions[pk_choice - 1] += " PRIMARY KEY"

            # Ask user if they want to add a foreign key
            table_schema["fk"] = {}
            fk_definition = ""
            add_fk = input("Do you want to add a foreign key? (y/n): ").lower() == 'y'
            if add_fk:
                fk_num = int(input("how many foreign keys present?"))
                print("select foreign table and its primary key")
                primary_keys = show_tables_and_primary_keys(db_config)
                for i, table in enumerate(primary_keys):
                    print(f"{i}. {table}")

                for _ in range(fk_num):
                    fk_choice = int(input("Enter the number of the table and pk associated: "))
                    if not 0 <= fk_choice < len(primary_keys):
                        raise ValueError(
                            f"foreign table choice must be between 0 and {len(primary_keys) - 1}, got {fk_choice}"
                        )
                    fk_table, fk_ref_column = primary_keys[fk_choice]
                    table_schema["fk"][fk_table] = fk_ref_column
                    fk_column = input("Enter the name of the foreign key column in current table: ")
                    fk_definition += f", FOREIGN KEY ({fk_column}) REFERENCES {fk_table}({fk_ref_column})"

            # Create table SQL
            create_table_sql = f"""
            CREATE TABLE {table_name} (
                {', '.join(column_definitions)}
                {fk_definition}
            )
            """
            cursor.execute(create_table_sql)
            db_schema[table_name] = table_schema
            print(f"Table {table_name} created successfully.")

        # The statement text is the same for every row, so build it once
        column_list = ', '.join(df.columns)
        placeholders = ', '.join(['%s'] * len(df.columns))
        insert_sql = f"INSERT INTO {table_name} ({column_list}) VALUES ({placeholders})"

        # Insert DataFrame records one by one; missing cells (NaN) become NULL
        for row in df.itertuples(index=False, name=None):
            cursor.execute(insert_sql, tuple(None if pd.isna(value) else value for value in row))

        # Commit the transaction
        conn.commit()
    finally:
        # Always release the cursor and connection, including on failure
        if cursor is not None:
            cursor.close()
        conn.close()

    print(f"Data inserted into {table_name} successfully.")

In [69]:
# Connection settings for the local MySQL instance. The password is taken from
# the MYSQL_PASSWORD environment variable, or prompted for when that is unset,
# so that it is not stored in the notebook.
db_config = {
    "user": "root",
    "password": os.environ.get("MYSQL_PASSWORD") or getpass("MySQL password: "),
    "host": "127.0.0.1",
    "database": "chatdb",
    "raise_on_warnings": True}

# Create and fill the `patients` table; its schema entry is added to new_schema.
insert_csv_to_mysql("patients_table.csv", db_config, "patients", new_schema)

Select the primary key column:
0. Create new auto-increment primary key
1. patientid
2. patientname
3. age
4. gender
5. bloodtype
6. disease
Table patients created successfully.
Data inserted into patients successfully.


In [70]:
# Display the schema recorded so far (after loading `patients`).
new_schema

{'patients': {'patientid': 'INT',
  'patientname': 'VARCHAR(255)',
  'age': 'INT',
  'gender': 'VARCHAR(255)',
  'bloodtype': 'VARCHAR(255)',
  'disease': 'VARCHAR(255)',
  'pk': ['patientid'],
  'fk': {}}}

In [71]:
# Reuses the db_config defined in the earlier cell instead of repeating the
# connection settings (and the password) here.
# Create and fill the `insurance` table; its schema entry is added to new_schema.
insert_csv_to_mysql("insurance_table.csv", db_config, "insurance", new_schema)

Select the primary key column:
0. Create new auto-increment primary key
1. insuranceid
2. patientid
3. insuranceprovider
4. billingcost
5. benefit
select foreign table and its primary key
Tables and Primary Keys in database 'chatdb':
Table: patients, Primary Key(s): patientid
0. ('patients', 'patientid')
Table insurance created successfully.
Data inserted into insurance successfully.


In [72]:
# Display the schema recorded so far (after loading `insurance`).
new_schema

{'patients': {'patientid': 'INT',
  'patientname': 'VARCHAR(255)',
  'age': 'INT',
  'gender': 'VARCHAR(255)',
  'bloodtype': 'VARCHAR(255)',
  'disease': 'VARCHAR(255)',
  'pk': ['patientid'],
  'fk': {}},
 'insurance': {'insuranceid': 'INT',
  'patientid': 'INT',
  'insuranceprovider': 'VARCHAR(255)',
  'billingcost': 'FLOAT',
  'benefit': 'VARCHAR(255)',
  'pk': ['insuranceid'],
  'fk': {'patients': 'patientid'}}}

In [73]:
# Reuses the db_config defined in the earlier cell instead of repeating the
# connection settings (and the password) here.
# Create and fill the `hospitals` table; its schema entry is added to new_schema.
insert_csv_to_mysql("hospitals_table.csv", db_config, "hospitals", new_schema)

Select the primary key column:
0. Create new auto-increment primary key
1. admissionid
2. patientid
3. insuranceid
4. doctor
5. hospitalname
6. intakedate
7. dischargedate
8. roomnumber
9. carelevel
10. testresults
select foreign table and its primary key
Tables and Primary Keys in database 'chatdb':
Table: insurance, Primary Key(s): insuranceid
Table: patients, Primary Key(s): patientid
0. ('insurance', 'insuranceid')
1. ('patients', 'patientid')
Table hospitals created successfully.
Data inserted into hospitals successfully.


In [74]:
# Display the schema recorded so far (after loading `hospitals`).
new_schema

{'patients': {'patientid': 'INT',
  'patientname': 'VARCHAR(255)',
  'age': 'INT',
  'gender': 'VARCHAR(255)',
  'bloodtype': 'VARCHAR(255)',
  'disease': 'VARCHAR(255)',
  'pk': ['patientid'],
  'fk': {}},
 'insurance': {'insuranceid': 'INT',
  'patientid': 'INT',
  'insuranceprovider': 'VARCHAR(255)',
  'billingcost': 'FLOAT',
  'benefit': 'VARCHAR(255)',
  'pk': ['insuranceid'],
  'fk': {'patients': 'patientid'}},
 'hospitals': {'admissionid': 'INT',
  'patientid': 'INT',
  'insuranceid': 'INT',
  'doctor': 'VARCHAR(255)',
  'hospitalname': 'VARCHAR(255)',
  'intakedate': 'VARCHAR(255)',
  'dischargedate': 'VARCHAR(255)',
  'roomnumber': 'INT',
  'carelevel': 'VARCHAR(255)',
  'testresults': 'VARCHAR(255)',
  'pk': ['admissionid'],
  'fk': {'insurance': 'insuranceid', 'patients': 'patientid'}}}

## MongoDB

In [97]:
import pandas as pd
import json

def csv_to_json_mongo(csv_file, save_file, primary_key):
    """Convert a CSV file into MongoDB-ready JSON documents.

    Each CSV row becomes one dict. Columns whose value is missing (NaN) are
    left out of that row's dict, and the value of ``primary_key`` is copied
    into ``_id``. The documents are written to ``save_file`` as an indented
    JSON array and are also returned.

    Args:
        csv_file: Path of the CSV file to read.
        save_file: Path of the JSON file to write.
        primary_key: Name of the CSV column whose value becomes ``_id``.

    Returns:
        The list of document dicts that was written to ``save_file``.

    Raises:
        KeyError: If ``primary_key`` is not a column of the CSV, or if a row
            has no value in that column. Nothing is written in either case.
    """
    # Read the CSV file using Pandas
    data = pd.read_csv(csv_file)
    if primary_key not in data.columns:
        raise KeyError(f"primary key column {primary_key!r} not found in {csv_file}")

    documents = []
    for row_number, record in enumerate(data.to_dict(orient='records')):
        # Drop NaN values so absent fields are simply missing from the document
        document = {key: value for key, value in record.items() if pd.notna(value)}
        if primary_key not in document:
            raise KeyError(
                f"row {row_number} of {csv_file} has no value for primary key {primary_key!r}"
            )
        document['_id'] = document[primary_key]
        documents.append(document)

    with open(save_file, 'w') as file:
        json.dump(documents, file, indent=4)

    return documents

In [100]:
# Export the patients CSV to patients.json, using patientid as the MongoDB _id.
csv_to_json_mongo("patients_table.csv", "patients.json", "patientid")

[{'patientid': 1,
  'patientname': 'BobbyJacksOn',
  'age': 30,
  'gender': 'Male',
  'bloodtype': 'B-',
  'disease': 'Cancer',
  '_id': 1},
 {'patientid': 2,
  'patientname': 'LesLieTErRy',
  'age': 62,
  'gender': 'Male',
  'bloodtype': 'A+',
  'disease': 'Obesity',
  '_id': 2},
 {'patientid': 3,
  'patientname': 'DaNnYsMitH',
  'age': 76,
  'gender': 'Female',
  'bloodtype': 'A-',
  'disease': 'Obesity',
  '_id': 3},
 {'patientid': 4,
  'patientname': 'andrEwwaTtS',
  'age': 28,
  'gender': 'Female',
  'bloodtype': 'O+',
  'disease': 'Diabetes',
  '_id': 4},
 {'patientid': 5,
  'patientname': 'adrIENNEbEll',
  'age': 43,
  'gender': 'Female',
  'bloodtype': 'AB+',
  'disease': 'Cancer',
  '_id': 5},
 {'patientid': 6,
  'patientname': 'EMILYJOHNSOn',
  'age': 36,
  'gender': 'Male',
  'bloodtype': 'A+',
  'disease': 'Asthma',
  '_id': 6},
 {'patientid': 7,
  'patientname': 'edwArDEDWaRDs',
  'age': 21,
  'gender': 'Female',
  'bloodtype': 'AB-',
  'disease': 'Diabetes',
  '_id': 7},


In [98]:
# Export the hospitals CSV to hospitals.json, using admissionid as the MongoDB _id.
csv_to_json_mongo("hospitals_table.csv", "hospitals.json", "admissionid")

[{'admissionid': 2001,
  'patientid': 1,
  'insuranceid': 1001,
  'doctor': 'MatthewSmith',
  'hospitalname': 'SonsandMiller',
  'intakedate': '01/31/2024',
  'dischargedate': '02/02/2024',
  'roomnumber': 328,
  'carelevel': 'Urgent',
  'testresults': 'Normal',
  '_id': 2001},
 {'admissionid': 2002,
  'patientid': 2,
  'insuranceid': 1002,
  'doctor': 'SamanthaDavies',
  'hospitalname': 'KimInc',
  'intakedate': '08/20/2019',
  'dischargedate': '08/26/2019',
  'roomnumber': 265,
  'carelevel': 'Emergency',
  'testresults': 'Inconclusive',
  '_id': 2002},
 {'admissionid': 2003,
  'patientid': 3,
  'insuranceid': 1003,
  'doctor': 'TiffanyMitchell',
  'hospitalname': 'CookPLC',
  'intakedate': '09/22/2022',
  'dischargedate': '10/07/2022',
  'roomnumber': 205,
  'carelevel': 'Emergency',
  'testresults': 'Normal',
  '_id': 2003},
 {'admissionid': 2004,
  'patientid': 4,
  'insuranceid': 1004,
  'doctor': 'KevinWells',
  'hospitalname': 'HernandezRogersandVang,',
  'intakedate': '11/18/2

In [101]:
# Export the insurance CSV to insurance.json, using insuranceid as the MongoDB _id.
csv_to_json_mongo("insurance_table.csv", "insurance.json", "insuranceid")

[{'insuranceid': 1001,
  'patientid': 1,
  'insuranceprovider': 'BlueCross',
  'billingcost': 18856.28131,
  'benefit': 'Senior',
  '_id': 1001},
 {'insuranceid': 1002,
  'patientid': 2,
  'insuranceprovider': 'Medicare',
  'billingcost': 33643.32729,
  'benefit': 'Standard',
  '_id': 1002},
 {'insuranceid': 1003,
  'patientid': 3,
  'insuranceprovider': 'Aetna',
  'billingcost': 27955.09608,
  'benefit': 'Standard',
  '_id': 1003},
 {'insuranceid': 1004,
  'patientid': 4,
  'insuranceprovider': 'Medicare',
  'billingcost': 37909.78241,
  'benefit': 'Premium',
  '_id': 1004},
 {'insuranceid': 1005,
  'patientid': 5,
  'insuranceprovider': 'Aetna',
  'billingcost': 14238.31781,
  'benefit': 'Premium',
  '_id': 1005},
 {'insuranceid': 1006,
  'patientid': 6,
  'insuranceprovider': 'UnitedHealthcare',
  'billingcost': 48145.11095,
  'benefit': 'Premium',
  '_id': 1006},
 {'insuranceid': 1007,
  'patientid': 7,
  'insuranceprovider': 'Medicare',
  'billingcost': 19580.87234,
  'benefit': '

In [102]:
# Export the prescriptions CSV to prescriptions.json, using prescriptionid as the MongoDB _id.
# NOTE(review): no visible cell loads prescriptions.json into MongoDB or MySQL - confirm whether that is intended.
csv_to_json_mongo("prescriptions_table.csv", "prescriptions.json", "prescriptionid")

[{'prescriptionid': 'P0001',
  'patientid': 1,
  'medication': 'Paracetamol',
  'drug_quantity': '500mg',
  '_id': 'P0001'},
 {'prescriptionid': 'P0002',
  'patientid': 2,
  'medication': 'Ibuprofen',
  'drug_quantity': '5ml',
  '_id': 'P0002'},
 {'prescriptionid': 'P0003',
  'patientid': 3,
  'medication': 'Aspirin',
  'drug_quantity': '10mg',
  '_id': 'P0003'},
 {'prescriptionid': 'P0004',
  'patientid': 4,
  'medication': 'Ibuprofen',
  'drug_quantity': '2ml',
  '_id': 'P0004'},
 {'prescriptionid': 'P0005',
  'patientid': 5,
  'medication': 'Penicillin',
  'drug_quantity': '2ml',
  '_id': 'P0005'},
 {'prescriptionid': 'P0006',
  'patientid': 6,
  'medication': 'Ibuprofen',
  'drug_quantity': '500mg',
  '_id': 'P0006'},
 {'prescriptionid': 'P0007',
  'patientid': 7,
  'medication': 'Paracetamol',
  'drug_quantity': '10mg',
  '_id': 'P0007'},
 {'prescriptionid': 'P0008',
  'patientid': 8,
  'medication': 'Paracetamol',
  'drug_quantity': '10mg',
  '_id': 'P0008'},
 {'prescriptionid': 

In [90]:
def insert_json_to_mongodb(json_file, mongo_uri, db_name, collection_name):
    """Insert the documents stored in a JSON file into a MongoDB collection.

    A file holding a JSON array of objects (the format written by
    ``csv_to_json_mongo``) is inserted exactly as stored, so fields that are
    absent from a document stay absent. Any other JSON layout is still read
    through pandas and converted to records, as before.

    Args:
        json_file: Path of the JSON file to load.
        mongo_uri: MongoDB connection string passed to ``MongoClient``.
        db_name: Name of the target database.
        collection_name: Name of the target collection.

    Returns:
        None. When the file contains no documents nothing is inserted and no
        connection is opened.
    """
    with open(json_file, 'r') as handle:
        payload = json.load(handle)

    if isinstance(payload, list) and all(isinstance(item, dict) for item in payload):
        # Use the records directly: a DataFrame round trip would fill every
        # field a record lacks with null.
        documents = payload
    else:
        # Other layouts (e.g. column-oriented JSON) keep the pandas conversion
        documents = json.loads(pd.read_json(json_file).to_json(orient='records'))

    if not documents:
        print(f"No documents found in {json_file}; nothing inserted.")
        return

    # Establish a connection to the MongoDB database
    client = MongoClient(mongo_uri)
    try:
        client[db_name][collection_name].insert_many(documents)
    finally:
        # Close the connection even when the insert fails
        client.close()

In [103]:
# Load patients.json into the "patients" collection of the local chatdb database.
conn_string = "mongodb://localhost:27017/"
db_name = "chatdb"
coll_name = "patients"
insert_json_to_mongodb("patients.json", conn_string, db_name, coll_name)

In [99]:
# Load hospitals.json into the "hospitals" collection of the local chatdb database.
conn_string = "mongodb://localhost:27017/"
db_name = "chatdb"
coll_name = "hospitals"
insert_json_to_mongodb("hospitals.json", conn_string, db_name, coll_name)

In [104]:
# Load insurance.json into the "insurance" collection of the local chatdb database.
conn_string = "mongodb://localhost:27017/"
db_name = "chatdb"
coll_name = "insurance"
insert_json_to_mongodb("insurance.json", conn_string, db_name, coll_name)

In [76]:
# Persist the schema collected by insert_csv_to_mysql as indented JSON.
with open('db_schema.json', 'w') as db_file:
    json.dump(new_schema, db_file, indent=4)

In [106]:
# Read a saved schema file back and display it as a sanity check.
# NOTE(review): this is an absolute, user-specific path and a different file name from the
# 'db_schema.json' written above - presumably a copy moved into the project; confirm, and
# prefer a path relative to the repository so the cell runs on other machines.
with open("/Users/vedanttibrewal/Documents/USC/lectures/sem_1/DSCI-551/project/chatDB-dsci551/src/chatdb/constants/sql_db_schema.json", 'r') as db_file:
    test = json.load(db_file)

test

{'patients': {'patientid': 'INT',
  'patientname': 'VARCHAR(255)',
  'age': 'INT',
  'gender': 'VARCHAR(255)',
  'bloodtype': 'VARCHAR(255)',
  'disease': 'VARCHAR(255)',
  'pk': ['patientid'],
  'fk': {}},
 'insurance': {'insuranceid': 'INT',
  'patientid': 'INT',
  'insuranceprovider': 'VARCHAR(255)',
  'billingcost': 'FLOAT',
  'benefit': 'VARCHAR(255)',
  'pk': ['insuranceid'],
  'fk': {'patients': 'patientid'}},
 'hospitals': {'admissionid': 'INT',
  'patientid': 'INT',
  'insuranceid': 'INT',
  'doctor': 'VARCHAR(255)',
  'hospitalname': 'VARCHAR(255)',
  'intakedate': 'VARCHAR(255)',
  'dischargedate': 'VARCHAR(255)',
  'roomnumber': 'INT',
  'carelevel': 'VARCHAR(255)',
  'testresults': 'VARCHAR(255)',
  'pk': ['admissionid'],
  'fk': {'insurance': 'insuranceid', 'patients': 'patientid'}}}