Step 1: Connect and check SQL Server tables & data types

In [12]:
import pyodbc
import pandas as pd

# Connection string for the SQL Server instance that hosts the Chinook_db database.
sql_server_conn_str = (
    r"DRIVER={ODBC Driver 17 for SQL Server};"
    r"SERVER=DESKTOP-NU99729\SQLEXPRESS;"
    r"DATABASE=Chinook_db;"
    r"Trusted_Connection=yes;"
)

# Tables whose column dtypes and first rows are inspected below.
sql_tables = [
    "Album", "Artist", "Customer", "Employee", "Genre",
    "Invoice", "InvoiceLine", "MediaType", "Playlist", "PlaylistTrack", "Track"
]

# A pyodbc connection used as a context manager is committed on exit but NOT
# closed, so `with pyodbc.connect(...)` would leave the handle open in the
# kernel. Close it explicitly instead; this cell only runs SELECTs, so there
# is nothing to commit.
conn = pyodbc.connect(sql_server_conn_str)
try:
    for table in sql_tables:
        # NOTE(review): the captured cell output shows a warning pointing at this
        # line - presumably pandas' warning about raw DBAPI (non-SQLAlchemy)
        # connections; confirm before deciding whether to switch connectors.
        df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows
        print(f"Table {table} sample data types:\n{df.dtypes}\n")
        print(df.head(), "\n")
finally:
    conn.close()


  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows


Table Album sample data types:
AlbumId     object
Title       object
ArtistId    object
dtype: object

  AlbumId                                  Title ArtistId
0     1.0  For Those About To Rock We Salute You      1.0
1     2.0                      Balls to the Wall      2.0
2     3.0                      Restless and Wild      2.0
3     4.0                      Let There Be Rock      1.0
4     5.0                               Big Ones      3.0 

Table Artist sample data types:
ArtistId    object
Name        object
dtype: object

  ArtistId               Name
0        1              AC/DC
1        2               None
2        3          Aerosmith
3        4  Alanis Morissette
4        5    Alice In Chains 

Table Customer sample data types:
CustomerId       int64
FirstName       object
LastName        object
Company         object
Address         object
City            object
State           object
Country         object
PostalCode      object
Phone           object
Fax             

  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows
  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows
  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows
  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows
  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows


Table Genre sample data types:
GenreId     int64
Name       object
dtype: object

   GenreId                Name
0        1                Rock
1        2                Jazz
2        3               Metal
3        4  Alternative & Punk
4        5       Rock And Roll 

Table Invoice sample data types:
InvoiceId              int64
CustomerId             int64
InvoiceDate           object
BillingAddress        object
BillingCity           object
BillingState          object
BillingCountry        object
BillingPostalCode     object
Total                float64
dtype: object

   InvoiceId  CustomerId          InvoiceDate           BillingAddress  \
0          1           2  2009-01-01 00:00:00  Theodor-Heuss-Straße 34   
1          2           4  2009-01-02 00:00:00         Ullevålsveien 14   
2          3           8  2009-01-03 00:00:00          Grétrystraat 63   
3          4          14  2009-01-06 00:00:00           8210 111 ST NW   
4          5          23  2009-01-11 00:00:00      

  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows
  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows


Table InvoiceLine sample data types:
InvoiceLineId      int64
InvoiceId          int64
TrackId            int64
UnitPrice        float64
Quantity           int64
dtype: object

   InvoiceLineId  InvoiceId  TrackId  UnitPrice  Quantity
0              1          1        2       0.99         1
1              2          1        4       0.99         1
2              3          2        6       0.99         1
3              4          2        8       0.99         1
4              5          2       10       0.99         1 

Table MediaType sample data types:
MediaTypeId     int64
Name           object
dtype: object

   MediaTypeId                         Name
0            1              MPEG audio file
1            2     Protected AAC audio file
2            3  Protected MPEG-4 video file
3            4     Purchased AAC audio file
4            5               AAC audio file 

Table Playlist sample data types:
PlaylistId     int64
Name          object
dtype: object

   PlaylistId        N

  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows
  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows
  df = pd.read_sql(f"SELECT TOP 5 * FROM {table}", conn)  # Only first 5 rows


Step 2: Create Snowflake database, schema, and stage

In [1]:
import os
import pyodbc
import pandas as pd
from snowflake.snowpark import Session
from dotenv import load_dotenv

# Run python-dotenv's loader before any environment variable is read below.
load_dotenv()

# Source side: connection string for the SQL Server instance hosting Chinook_db.
sql_server_conn_str = (
    r"DRIVER={ODBC Driver 17 for SQL Server};"
    r"SERVER=DESKTOP-NU99729\SQLEXPRESS;"
    r"DATABASE=Chinook_db;"
    r"Trusted_Connection=yes;"
)

# Target side: Snowpark session options, each taken from a SNOWFLAKE_* environment
# variable. A variable that is not set yields None for that option.
snowflake_params = {
    "account": os.environ.get("SNOWFLAKE_ACCOUNT"),
    "user": os.environ.get("SNOWFLAKE_USER"),
    "password": os.environ.get("SNOWFLAKE_PASSWORD"),
    "role": os.environ.get("SNOWFLAKE_ROLE"),
    "warehouse": os.environ.get("SNOWFLAKE_WAREHOUSE"),
    "database": os.environ.get("SNOWFLAKE_DATABASE"),
    "schema": os.environ.get("SNOWFLAKE_SCHEMA"),
}

# Names of the Snowflake objects created by create_db_schema_stage().
new_database = "CHINOOK_DATABASE"
new_schema = "ERD_SCHEMA"
new_stage = "DATA_INGESTION_STAGE"

# List of tables to migrate
sql_tables = [
    "Album",
    "Artist",
    "Customer",
    "Employee",
    "Genre",
    "Invoice",
    "InvoiceLine",
    "MediaType",
    "Playlist",
    "PlaylistTrack",
    "Track",
]

def create_db_schema_stage(session):
    """Create the target database, schema and CSV stage, then switch to them.

    Each object is created with IF NOT EXISTS, using the module-level names
    `new_database`, `new_schema` and `new_stage`. Afterwards the session's
    current database and schema are set to the newly ensured ones.

    Args:
        session: object exposing `sql(...).collect()`, `use_database` and
            `use_schema` (a Snowpark Session in this notebook).
    """
    database_ddl = f"CREATE DATABASE IF NOT EXISTS {new_database}"
    schema_ddl = f"CREATE SCHEMA IF NOT EXISTS {new_database}.{new_schema}"
    stage_ddl = f"""
        CREATE STAGE IF NOT EXISTS {new_database}.{new_schema}.{new_stage}
        FILE_FORMAT = (TYPE = 'CSV' FIELD_OPTIONALLY_ENCLOSED_BY = '\"' SKIP_HEADER = 1)
    """

    # Order matters: database before schema before stage.
    for statement in (database_ddl, schema_ddl, stage_ddl):
        session.sql(statement).collect()

    session.use_database(new_database)
    session.use_schema(new_schema)

def map_dtype_to_snowflake(dtype):
    """Translate a dtype (or its name) into a Snowflake column type name.

    The dtype is converted with `str()` and lower-cased, then matched by
    substring against an ordered rule list; the first rule that matches wins.
    Anything that matches no rule falls back to "STRING".

    Args:
        dtype: a pandas/numpy dtype or any object whose `str()` names a dtype.

    Returns:
        One of "NUMBER", "FLOAT", "BOOLEAN", "TIMESTAMP_NTZ" or "STRING".
    """
    lowered = str(dtype).lower()

    # (substrings to look for, resulting Snowflake type) - evaluated top to bottom.
    rules = (
        (("int",), "NUMBER"),
        (("float", "double"), "FLOAT"),
        (("bool",), "BOOLEAN"),
        (("datetime", "timestamp"), "TIMESTAMP_NTZ"),
    )
    for needles, snowflake_type in rules:
        if any(needle in lowered for needle in needles):
            return snowflake_type

    # Default fallback type
    return "STRING"

def create_table_from_df(session, df, table_name):
    """Create a Snowflake table whose columns mirror the DataFrame's columns.

    Column names have spaces replaced by underscores and are upper-cased; each
    column type comes from `map_dtype_to_snowflake`. The table name is
    upper-cased and the statement uses CREATE TABLE IF NOT EXISTS, so an
    already existing table is left as it is.

    Args:
        session: object exposing `sql(...).collect()`.
        df: DataFrame supplying column names and dtypes.
        table_name: name of the table to create.
    """
    column_specs = []
    for label, kind in df.dtypes.items():
        # Sanitize column name for Snowflake (uppercase, no spaces)
        identifier = label.replace(" ", "_").upper()
        column_specs.append(f"{identifier} {map_dtype_to_snowflake(kind)}")

    statement = f"CREATE TABLE IF NOT EXISTS {table_name.upper()} ({', '.join(column_specs)})"
    session.sql(statement).collect()

def export_to_csv(df, table_name):
    """Write the DataFrame to `<table_name>.csv` and return that file name.

    The file is written relative to the current working directory, with a
    header row and without the index column.

    Args:
        df: DataFrame to serialise.
        table_name: base name of the output file (".csv" is appended).

    Returns:
        The file name that was written.
    """
    csv_path = f"{table_name}.csv"
    df.to_csv(path_or_buf=csv_path, header=True, index=False)
    return csv_path

def upload_and_copy(session, table_name, csv_file):
    """Upload a CSV file to the ingestion stage and COPY it into its table.

    Two statements are issued in order: a PUT of `csv_file` (with
    AUTO_COMPRESS=TRUE) to the stage named by the module-level
    `new_database`/`new_schema`/`new_stage`, then a COPY INTO the upper-cased
    `table_name` from `<csv_file>.gz` on that stage.

    Args:
        session: object exposing `sql(...).collect()`.
        table_name: destination table name (upper-cased in the COPY statement).
        csv_file: path of the local CSV file, also used as the staged file name.
    """
    def _run(statement):
        # Results of both statements are fetched and then discarded.
        session.sql(statement).collect()

    # PUT command to upload CSV to stage
    upload_stmt = f"PUT file://{csv_file} @{new_database}.{new_schema}.{new_stage} AUTO_COMPRESS=TRUE"
    _run(upload_stmt)

    # COPY INTO to load data into table
    # NOTE(review): ON_ERROR = 'CONTINUE' together with the discarded COPY result
    # presumably means rejected rows go unreported - confirm this is intended.
    load_stmt = f"""
        COPY INTO {table_name.upper()}
        FROM @{new_database}.{new_schema}.{new_stage}/{csv_file}.gz
        FILE_FORMAT = (TYPE = 'CSV' FIELD_OPTIONALLY_ENCLOSED_BY='\"' SKIP_HEADER=1)
        ON_ERROR = 'CONTINUE'
    """
    _run(load_stmt)

def main():
    """Migrate every table listed in `sql_tables` from SQL Server to Snowflake.

    Opens the SQL Server connection and a Snowpark session, ensures the target
    database/schema/stage exist, and then for each table: reads all rows into a
    DataFrame, creates the matching Snowflake table, exports the rows to CSV,
    and uploads and copies that CSV into the table. Progress is printed per table.

    The SQL Server connection is always closed on exit, including when a step
    raises; the exception itself still propagates to the caller.
    """
    # A pyodbc connection used in a `with` statement is committed on exit but
    # NOT closed, so it is closed explicitly here. Only SELECTs are issued on
    # this connection, so skipping the implicit commit loses nothing.
    conn = pyodbc.connect(sql_server_conn_str)
    try:
        with Session.builder.configs(snowflake_params).create() as session:

            create_db_schema_stage(session)

            for table in sql_tables:
                print(f"Processing table: {table}")
                df = pd.read_sql(f"SELECT * FROM {table}", conn)

                # Create table based on DataFrame schema
                create_table_from_df(session, df, table)

                csv_file = export_to_csv(df, table)
                upload_and_copy(session, table, csv_file)

                print(f"Table {table} migrated successfully.")
    finally:
        conn.close()

if __name__ == "__main__":
    main()


Processing table: Album


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table Album migrated successfully.
Processing table: Artist


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table Artist migrated successfully.
Processing table: Customer


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table Customer migrated successfully.
Processing table: Employee


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table Employee migrated successfully.
Processing table: Genre


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table Genre migrated successfully.
Processing table: Invoice


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table Invoice migrated successfully.
Processing table: InvoiceLine


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table InvoiceLine migrated successfully.
Processing table: MediaType


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table MediaType migrated successfully.
Processing table: Playlist


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table Playlist migrated successfully.
Processing table: PlaylistTrack


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table PlaylistTrack migrated successfully.
Processing table: Track


  df = pd.read_sql(f"SELECT * FROM {table}", conn)


Table Track migrated successfully.
