### Import Python Dependencies

In [1]:
import time
import numpy as np 
import pandas as pd
import sqlite3
import psycopg2
import os
import openpyxl as xl

### Load Datasets

In [2]:
# the current working directory is treated as the root of the assessment
root_dir = os.getcwd()

# folder under the root that holds every input file
dataset = os.path.join(root_dir, "dataset")

# path to the assessment database script
# NOTE(review): this is a .sql script, yet it is later handed to
# sqlite3.connect, which expects a database file - confirm this is intended
asssessment_db = os.path.join(dataset, "assessment_db_script.sql")

# paths to the excel workbooks, built in one pass from their file names
dsr_data, farms_data, producers_data, varieties_data, vessels_data = (
    os.path.join(dataset, workbook)
    for workbook in (
        "DSR.xlsx",
        "farms.xlsx",
        "producers.xlsx",
        "varieties.xlsx",
        "vessels.xlsx",
    )
)

In [26]:
class DataLoader(object):
    """
    Helper for opening the SQLite database and for reading excel
    workbooks into pandas dataframes.
    """

    def __init__(self):
        # the loader keeps no state; connections and dataframes are
        # handed straight back to the caller
        pass

    def connect(self, db_file):
        """
        Open a SQLite connection to ``db_file`` and create a cursor on it.

        args:
            db_file: path to the SQLite database file

        returns:
            tuple of (connection, cursor)

        raises:
            sqlite3.Error: when the database cannot be opened or the cursor
                cannot be created; the error is printed and then re-raised
        """

        try:
            # connect to the database
            conn = sqlite3.connect(db_file)
            cursor = conn.cursor()

        except sqlite3.Error as e:
            # The connection is made with sqlite3, so sqlite3.Error is the
            # exception family to catch here. Re-raise after reporting:
            # falling through would hit the return below with ``conn`` and
            # ``cursor`` unbound and mask the real failure.
            print("Error: Could not get the cursor to the database")
            print(e)
            raise

        print("Successfully connected to the database.")
        return conn, cursor

    def load_workbook(self, file_name):
        """
        Read an excel workbook into a dataframe.

        args:
            file_name: path of the excel file to read

        returns:
            dataframe produced by ``pd.read_excel`` (called with its
            default options)
        """

        return pd.read_excel(file_name)

### Connect to the Database

In [4]:
# instantiate the loader that provides the connection and workbook helpers
dataloader_object = DataLoader()

# take a timestamp so the connection time can be reported
start_time = time.time()

# open the database, keeping both the connection and its cursor
conn, cursor = dataloader_object.connect(asssessment_db)

# elapsed wall-clock time, converted from seconds to minutes
elapsed_minutes = round((time.time() - start_time) / 60, 5)
print("Time to connect to the DB: ", elapsed_minutes, "minutes")

Successfully connected to the database.
Time to connect to the DB:  6e-05 minutes


### Load Detailed Season Report (DSR) Raw Dataset

In [13]:
# take a timestamp so the load time can be reported
start_time = time.time()

# read the DSR workbook into a dataframe
dsr_df = dataloader_object.load_workbook(dsr_data)

# elapsed wall-clock time, converted from seconds to minutes
elapsed_minutes = round((time.time() - start_time) / 60, 5)
print("Time to load DSR data: ", elapsed_minutes, "minutes")

Time to connect to the DB:  1.27828 minutes


In [32]:
# preview the first rows of the raw DSR dataframe
# (head() returns 5 rows by default)
dsr_df.head()

Unnamed: 0,Year,ProducerID,FarmID,Sales Week,Pallet ID,Sale ID,Barcode,Grade Code,Mark Code,VarietyID,...,Total Export Cost,Total Weight,Pack Nett Weight,Nett Weight,Pack Gross Weight,Gross Weight,FOB,DIP,VAT,Return To Grower
0,2017,201100,45,24,201233,3124,9.600916001274861e+17,1,AMA,74,...,0.0,0,16.0,1280.0,16.95,1356.0,10.5,139.324307,-55.907373,10074.41044
1,2017,201100,45,25,207565,10375,9.60091600127457e+17,1,AMA,74,...,0.0,0,16.0,1280.0,16.95,1356.0,10.5,137.090203,-47.779393,9875.083022
2,2017,201100,45,25,100285,10375,9.600916001274861e+17,1,AMA,74,...,0.0,0,16.0,1280.0,16.95,1356.0,10.5,137.090203,52.220607,9875.083022
3,2017,201100,45,25,181685,7374,9.60091600127457e+17,1,AMA,74,...,0.0,0,16.0,1280.0,16.95,1356.0,10.5,141.074018,-40.300725,10122.17772
4,2017,201100,45,26,66238,6697,9.600916001274701e+17,1,AMA,700,...,0.0,0,16.0,1280.0,16.95,1356.0,7.0,92.381472,73.75377,6614.094181


In [33]:
# count the missing (null) entries in each column of the DSR dataframe
sum_of_null_values = dsr_df.isna().sum()

# print the per-column counts
print(sum_of_null_values)

Year                       0
ProducerID                 0
FarmID                     0
Sales Week                 0
Pallet ID                  0
Sale ID                    0
Barcode                    0
Grade Code                 0
Mark Code                  0
VarietyID                  0
Pack Code                  0
Count Code                 2
Exchange Rate              0
Currency                   0
QC                         0
Sequence Number            0
Production ID            209
Run Number             96551
Container No           55337
VesselID                   0
Port Of Discharge      42479
Target Market              0
Inventory Code           587
Trader ID                  0
Pallet Size                0
From Barcode             211
To Barcode                 0
Invoice ID                 0
Selected                 209
No Cartons                 0
Advance Price              0
Producer Adjustment        0
Advance Purchase          44
Purchase Price           164
Final Price   

### Load Farms Raw Dataset

In [34]:
# take a timestamp so the load time can be reported
start_time = time.time()

# read the farms workbook into a dataframe
farms_df = dataloader_object.load_workbook(farms_data)

# report the elapsed time in minutes (message wording aligned with the
# other load cells: "load", not "loads")
print("Time to load farms data: ",round((time.time() - start_time)/60,5),"minutes")

Time to connect to the DB:  0.00037 minutes


In [35]:
# preview the first rows of the raw farms dataframe
# (head() returns 5 rows by default)
farms_df.head()

Unnamed: 0,ID,Farm Name
0,45,Tel Dan
1,54,Urkish
2,62,Lehi
3,83,Shur
4,154,Havilah


### Load Producers Raw Dataset

In [36]:
# take a timestamp so the load time can be reported
start_time = time.time()

# read the producers workbook into a dataframe
producers_df = dataloader_object.load_workbook(producers_data)

# elapsed wall-clock time, converted from seconds to minutes
elapsed_minutes = round((time.time() - start_time) / 60, 5)
print("Time to load producers: ", elapsed_minutes, "minutes")

Time to connect to the DB:  0.00024 minutes


In [38]:
# preview the first rows of the raw producers dataframe
# (head() returns 5 rows by default)
producers_df.head()

Unnamed: 0,ID,Producer
0,201100,Alderaan
1,318820,Yavin IV
2,351986,Hoth
3,240822,Dagobah
4,11959,Bespin


### Load Varieties Raw Dataset

In [39]:
# take a timestamp so the load time can be reported
start_time = time.time()

# read the varieties workbook into a dataframe
varieties_df = dataloader_object.load_workbook(varieties_data)

# report the elapsed time in minutes; this cell times a workbook load,
# not a database connection, so the message says so
print("Time to load varieties data: ",round((time.time() - start_time)/60,5),"minutes")

Time to connect to the DB:  0.00058 minutes


In [40]:
# preview the first rows of the raw varieties dataframe
# (head() returns 5 rows by default)
varieties_df.head()

Unnamed: 0,ID,Variety Group,Variety Code,Commodity Code
0,45,AGN,AGN,OR
1,965,ALN,ALN,OR
2,937,ANL,ANL,OR
3,610,BAR,BAR,GR
4,686,BIN,ELL,SC


### Load Vessels Raw Dataset

In [41]:
# take a timestamp so the load time can be reported
start_time = time.time()

# read the vessels workbook into a dataframe
vessels_df = dataloader_object.load_workbook(vessels_data)

# report the elapsed time in minutes; this cell times a workbook load,
# not a database connection, so the message says so
print("Time to load vessels data: ",round((time.time() - start_time)/60,5),"minutes")

Time to connect to the DB:  0.00094 minutes
