# Libraries Import

In [43]:
import pandas as pd

# for connection with Azure SQL Database
import pyodbc

# for keeping credentials out of sight
import os
from dotenv import load_dotenv

# Credentials and Authorization

You will need to insert your database connection parameters and save as `sql-keys.env` local file.

```
DB_SERVER = "XXXXX"
DB_NAME = "XXXXX" 
DB_USERNAME = "XXXXX"
DB_PASSWORD = XXXXX
```
The following is the list of the connection parameters:
- *DB_SERVER*: database server address e.g., localhost or an IP address.
- *DB_NAME*: the name of the database that you want to connect.
- *DB_USERNAME*: the username used to authenticate.
- *DB_PASSWORD*: password used to authenticate.


In [50]:
# establish working directory path
# getcwd() returns current working directory
wdir_path = os.getcwd()

sql_path = os.path.join(wdir_path, "sql-keys.env") # absolute path of "sql-keys.env"
# load the credentials into os environment 
load_dotenv(sql_path)
#check if credentials loaded successfully
os.environ

# getting credentials information from "sql-keys.env"
server = os.getenv("DB_SERVER")
database = os.getenv("DB_NAME")
username = os.getenv("DB_USERNAME")
password = os.getenv("DB_PASSWORD")

driver = ''
# retriving ODBC Driver information in user's computer using Pyodbc
driver_names = [x for x in pyodbc.drivers() if x.endswith(' for SQL Server')]
if driver_names:
    driver = driver_names[0]
# if driver exists, create connection
if driver:
    # establish the connection
    conn = pyodbc.connect("DRIVER={" + driver + "};SERVER=" + server +
                      ";DATABASE=" + database + ";UID=" + username +
                      ";PWD=" + password)
    print(conn)
else:
    print("(No suitable driver found. Cannot connect.)")


<pyodbc.Connection object at 0x000002972419F1C0>


# Create a cusor object

In [58]:
cursor = conn.cursor()

# Database Tables Creation

In [83]:
def create_table(command, msg):
    '''
    Function to create table into Azure database
    
    Parameters:
        command (string): SQL statement to create new table in database
        msg (string): A string message to print that table is successfully created
    '''
    try:
        cursor.execute(command)
        conn.commit()     
    except Exception as e:
        # rollback current transaction if there is an error
        conn.rollback()
        raise e
    print(msg)

# create olist_sellers table
create_table("""
            IF OBJECT_ID('dbo.olist_sellers', 'U') IS NULL 
                CREATE TABLE olist_sellers
                (
                    seller_id VARCHAR(255) PRIMARY KEY NOT NULL, 
                    seller_zip_code_prefix INT, 
                    seller_city VARCHAR(255), 
                    seller_state VARCHAR(2)
                );
            """,
            "olist_sellers successfully created.")

# create olist_customers table
create_table("""
            IF OBJECT_ID('dbo.olist_customers', 'U') IS NULL 
                CREATE TABLE olist_customers 
                (
                    customer_id VARCHAR(255) PRIMARY KEY NOT NULL,
                    customer_unique_id VARCHAR(255) NOT NULL, 
                    customer_zip_code_prefix INT, 
                    customer_city VARCHAR(255), 
                    customer_state VARCHAR(2)
                );
            """,
            "olist_customers successfully created.")

# create olist_geolocation
create_table("""
            IF OBJECT_ID('dbo.olist_geolocation', 'U') IS NULL 
                CREATE TABLE olist_geolocation 
                (
                    geolocation_zip_code_prefix INT PRIMARY KEY NOT NULL,
                    geolocation_lat DEMICAL NOT NULL, 
                    geolocation_lng DECIMAL NOT NULL, 
                    geolocation_city VARCHAR(255) NOT NULL, 
                    geolocation_state VARCHAR(2) NOT NULL
                );
            """,
            "olist_geolocation successfully created.")

conn.commit()

olist_sellers successfully created.
olist_customers successfully created.


olist_order_reviews_dataset.csv

In [None]:
# review_id: unique review identifier
# order_id: unique order identifier
# review_score: Note ranging from 1 to 5 given by the customer on a satisfaction survey
# review_comment_title: Comment title from the review left by the customer, in Portuguese
# review_comment_message: Comment message from the review left by the customer, in Portuguese
# review_creation_date: Shows the date in which the satisfaction survey was sent to the customer
# review_answer_timestamp: Shows satisfaction survey answer timestamp

# Read in olist_order_reviews_dataset.csv as pandas dataframe and parse dates of 
# "review_creation_date" and "review_answer_timestamp" columns.
parse_dates = ["review_creation_date", "review_answer_timestamp"]
order_reviews_data = pd.read_csv("olist_order_reviews_dataset.csv",\
                                 infer_datetime_format = True, parse_dates = parse_dates)

# Drop duplicates from review_id column.
order_reviews_data.drop_duplicates(subset = ['review_id'], inplace = True)

# Preview the first 5 lines of the loaded data 
order_reviews_data.head()

In [None]:
# Count number of entries in each column
order_reviews_data.count()

In [None]:
# Count number of unique entries
order_reviews_data.nunique()

In [None]:
# Check for null values
order_reviews_data.isnull().sum()

In [None]:
# Drop columns with null values
order_reviews_data.drop(columns = ["review_comment_title", "review_comment_message"], inplace = True)

In [None]:
# Check datatypes
order_reviews_data.dtypes

In [None]:
# Change column to datetime
order_reviews_data['review_answer_timestamp'] = order_reviews_data['review_answer_timestamp'].dt.date
order_reviews_data.head()

In [None]:
# Check datatypes
order_reviews_data.dtypes

olist_order_payments_dataset.csv

In [None]:
# order_id: unique order identifier
# payment_sequential: a customer may pay an order with more than one payment method. If he does so, a sequence will be created to
# payment_type: method of payment chosen by the customer
# payment_installments: number of installments chosen by the customer
# payment_value: transaction value

# Boleto payments: Boleto is an official (regulated by the Central Bank of Brazil) payment method in Brazil. 
# To complete a transaction, customers receive a voucher stating the amount to pay for services or goods. 
# Customers then pay the boleto before its expiration date in one of several different methods, including at authorized agencies or banks, ATMs, or online bank portals. 
# You will receive payment confirmation after 1 business day, while funds will be available for payout 2 business days after payment confirmation.
# https://stripe.com/docs/payments/boleto 
# https://www.rapyd.net/blog/what-is-boleto-everything-you-need-to-know/ 

# Read in olist_order_payments_dataset.csv and make pandas dataframe
order_payment_data = pd.read_csv("C:/Users/Mavis Luo/Downloads/Final Year Project/Dataset/olist_order_payments_dataset.csv")

# Drop duplicates from order_id column
order_payment_data.drop_duplicates(subset = ['order_id'], inplace = True)

# Set index as order_id
order_payment_data.set_index("order_id", inplace = True)

# Print dataframe
order_payment_data.head()

In [None]:
# Count rows of each column
order_payment_data.count()

In [None]:
# Count number of unique entries
order_payment_data.nunique()

In [None]:
# Print dataframe
order_payment_data.head()

In [None]:
# Check for null values
order_payment_data.isnull().sum()

In [None]:
# Check datatypes
order_payment_data.dtypes

In [None]:
df_comments = olist_order_reviews.loc[:, ['review_score', 'review_comment_message']]
df_comments = df_comments.dropna(subset=['review_comment_message'])
df_comments = df_comments.reset_index(drop=True)
print(f'Dataset shape: {df_comments.shape}')
df_comments.columns = ['score', 'comment']
df_comments.head()