1. Install virtualenv `pip install virtualenv`
2. Create a virtual environment in project root directory `virtualenv venv --python=python3.8.9` (Snowpark for Python requires Python 3.8.X)
3. Install Snowflake Connector for Python with Pandas `pip install snowflake-connector-python[pandas]`
4. Install Snowpark for Python with Pandas `pip install snowflake-snowpark-python[pandas]`
5. Install IPython `pip install ipython`
6. Install Jupyter Notebooks `pip install jupyter`

In [None]:
import pandas as pd

from configparser import ConfigParser
# from typing import PASS
from pandas_profiling import ProfileReport
from snowflake.snowpark import Session
# from snowflake.snowpark.functions import udf, split
# from snowflake.snowpark import functions as funct

In [None]:
# Credentials are kept in a local 'config.ini' file instead of being written into the notebook.
# NOTE: ConfigParser.read() skips files it cannot open and returns the list of
# files it actually parsed, so an empty list here means the file was not found.
cfg_path = '../config.ini'
config = ConfigParser()
config.read(cfg_path)

In [None]:
# Read every Snowflake connection setting from the [CONNECTION] section of the config
connection_cfg = config['CONNECTION']
account = connection_cfg['account']
user = connection_cfg['user']
password = connection_cfg['password']
role = connection_cfg['role']
warehouse = connection_cfg['warehouse']
database = connection_cfg['database']
schema = connection_cfg['schema']

# Bundle the settings into the mapping handed to the Snowpark session builder
connection_parameters = dict(
    account=account,
    user=user,
    password=password,
    role=role,
    warehouse=warehouse,
    database=database,
    schema=schema,
)

# Open the Snowpark session
session = Session.builder.configs(connection_parameters).create()

# Smoke test: fetch and print ten rows from movies_raw to confirm the connection works
print(session.sql("select * from movies_raw limit 10").collect())

In [None]:
def _fetch_as_pandas(table_name):
    """Read the named table through the Snowpark session and return it as a pandas DataFrame."""
    return session.table(table_name).to_pandas()


# Pull the three raw tables into pandas DataFrames
movies_raw = _fetch_as_pandas('movies_raw')
ratings_raw = _fetch_as_pandas('ratings_raw')
users_raw = _fetch_as_pandas('users_raw')

# Preview the first rows of movies_raw
movies_raw.head()

In [None]:
# Data transformations on movies_raw: move the release year out of MOVIE_TITLE into a
# new MOVIE_YEAR column, one-hot encode GENRE, and drop the original GENRE column.
# (TODO from the original author: wrap this in a function.)

# Split on the LAST " (" only. Splitting on every occurrence produces more than two
# columns as soon as one title contains an extra parenthesised part, and the
# two-column assignment then fails with "Columns must be same length as key".
movies_raw[['MOVIE_TITLE', 'MOVIE_YEAR']] = movies_raw['MOVIE_TITLE'].str.rsplit(' (', n=1, expand=True)

# Remove the closing parenthesis as a literal character. regex=False is spelled out
# because the default of `regex` differs between pandas versions, and an escaped
# pattern such as '\)' only matches ")" when it is interpreted as a regex.
movies_raw['MOVIE_YEAR'] = movies_raw['MOVIE_YEAR'].str.replace(')', '', regex=False)

# get_dummies splits each GENRE string on '|' (its default separator) and returns one
# indicator column per distinct value; these are joined back on the row index.
movies_raw = movies_raw.join(movies_raw['GENRE'].str.get_dummies(sep='|')).drop('GENRE', axis=1)

In [None]:
# Data-quality check: number of null entries in each column of movies_raw.
# (Planned checks per the original note: missing values, duplicate rows, nulls,
# duplicate columns -- to be wrapped in a function.)
movies_raw.isna().sum()

In [None]:
# Data-quality check on movies_raw: per-column non-null counts after exact duplicate
# rows are discarded. The result is only displayed; movies_raw itself is not modified.
movies_raw.drop_duplicates(keep='first').count()


In [None]:
# Data-quality check: per-column counts over the rows that have no missing value at all.
# The result is only displayed; movies_raw itself is not modified.
movies_raw.dropna(axis=0, how='any').count()

In [None]:
# Attach the user attributes to every rating: left join of ratings_raw onto
# users_raw, matching the ratings' USER_ID column against the users' USER_ID index.
ratings_base = ratings_raw.join(
    users_raw.set_index('USER_ID'),
    on='USER_ID',
    how='left',
)

In [None]:
# Attach the rating (and user) columns to every movie: left join of movies_raw onto
# ratings_base, matching the movies' MOVIE_ID column against the ratings' MOVIE_ID index.
movies_base = movies_raw.join(
    ratings_base.set_index('MOVIE_ID'),
    on='MOVIE_ID',
    how='left',
)

In [None]:
# Build a pandas-profiling report over movies_base (the movies/ratings/users join).
# As the last expression of the cell, the report object is presumably what the
# notebook renders as the cell output.
ProfileReport(movies_base)