### Import Libraries

In [1]:
import pyodbc
import pandas as pd
import os
import time

In [2]:
# root directory of the assessment
root_dir = os.getcwd()     

# this is path to the dataset
dataset = os.path.join(root_dir, "dataset")

# load excel dataset
dsr_data = os.path.join(dataset, "DSR.xlsx")
farms_data = os.path.join(dataset, "farms.xlsx")
producers_data = os.path.join(dataset, "producers.xlsx")
varieties_data = os.path.join(dataset, "varieties.xlsx")
vessels_data = os.path.join(dataset, "vessels.xlsx")

# loop through all the drivers we have access to
for driver in pyodbc.drivers():
    print(driver)

SQL Server
SQL Server Native Client 11.0
ODBC Driver 17 for SQL Server
SQL Server Native Client RDA 11.0


### Connecting to the Databases

In [6]:
class ConnectDB(object):
    """
    This is a blueprint for codnnecting to the local databases
    """
    
    def __init__(self):
        self.server = "DESKTOP-E5PL80T\SQLEXPRESS"
        self.database = "Atom_Assessment_2021"
        self.driver =  "ODBC Driver 17 for SQL Server"
    
    def get_connection(self):
        """
        This method connects to the database
        :returns:
            cur, conn
        """

        try:
            # connect to the database
            conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server}; \
                   SERVER=' + self.server + '; \
                   DATABASE=' + self.database +';\
                   Trusted_Connection=yes;')
            cursor = conn.cursor()
        
        except Exception as e:
                
            print("Error: Could not get the cursor to the database")
            print(e)
        
        return conn, cursor      

In [7]:
# start time
start_time = time.time()

# define db connect object
connect_db_object = ConnectDB()

# establish the connection 
conn, cursor = connect_db_object.get_connection()
print("Time to connect to the DB: ",round((time.time() - start_time)/60,5),"minutes")

Time to connect to the DB:  0.00022 minutes


### Display Table Names from the Server

In [12]:
# sql quecry for displaying all tables from the databases
cursor.execute("SELECT table_name FROM information_schema.tables;")

# fetch all tables
tables  = cursor.fetchall()

# iterate through the database and display all table names
for table_name in tables:
    print(table_name)

('DSR', )
('Farms', )
('Producers', )
('Varieties', )
('Vessels', )


### Load Excel Dataset

In [13]:
class LoadExcelDataset(object):
    """
    This is a blueprint for loading dataset
    """
    def __init__(self):
        pass
    
    def get_excel_data(self, file_name):
        """
        This method loads excel dataset
        
        args:
            file_name: the name of the file
            excel_file: file to import into the sql database
            
        returns: 
            dataframe
        """
        
        df = pd.read_excel(file_name)
                
        return df
    
    def load_to_db(self, conn, df):
        """
        This method loads excel data to the dabase
        
        """
        
        
        

### Load Detailed Season Report (DSR) Raw Dataset

In [14]:
# excel data object
excel_data_object = LoadExcelDataset()

# load dsr dataframe 
dsr_df = excel_data_object.get_excel_data(dsr_data)

print("Time to load DSR data: ",round((time.time() - start_time)/60,5),"minutes")

Time to load DSR data:  104.87363 minutes


In [14]:
# visualize DSR raw dataset
dsr_df.head()

Unnamed: 0,Year,ProducerID,FarmID,Sales Week,Pallet ID,Sale ID,Barcode,Grade Code,Mark Code,VarietyID,...,Total Export Cost,Total Weight,Pack Nett Weight,Nett Weight,Pack Gross Weight,Gross Weight,FOB,DIP,VAT,Return To Grower
0,2017,201100,45,24,201233,3124,960091600127485952,1,AMA,74,...,0.0,0,16.0,1280.0,16.95,1356.0,10.5,139.324307,-55.907373,10074.41044
1,2017,201100,45,25,207565,10375,960091600127457024,1,AMA,74,...,0.0,0,16.0,1280.0,16.95,1356.0,10.5,137.090203,-47.779393,9875.083022
2,2017,201100,45,25,100285,10375,960091600127485952,1,AMA,74,...,0.0,0,16.0,1280.0,16.95,1356.0,10.5,137.090203,52.220607,9875.083022
3,2017,201100,45,25,181685,7374,960091600127457024,1,AMA,74,...,0.0,0,16.0,1280.0,16.95,1356.0,10.5,141.074018,-40.300725,10122.17772
4,2017,201100,45,26,66238,6697,960091600127469952,1,AMA,700,...,0.0,0,16.0,1280.0,16.95,1356.0,7.0,92.381472,73.75377,6614.094181


In [15]:
# compute the sum of null values in the dataframe
sum_of_null_values = dsr_df.isnull().sum()

print(sum_of_null_values)

Year                       0
ProducerID                 0
FarmID                     0
Sales Week                 0
Pallet ID                  0
Sale ID                    0
Barcode                    0
Grade Code                 0
Mark Code                  0
VarietyID                  0
Pack Code                  0
Count Code                 2
Exchange Rate              0
Currency                   0
QC                         0
Sequence Number            0
Production ID            209
Run Number             96551
Container No           55337
VesselID                   0
Port Of Discharge      42479
Target Market              0
Inventory Code           587
Trader ID                  0
Pallet Size                0
From Barcode             211
To Barcode                 0
Invoice ID                 0
Selected                 209
No Cartons                 0
Advance Price              0
Producer Adjustment        0
Advance Purchase          44
Purchase Price           164
Final Price   

### Load Farms Raw Dataset

In [16]:
# start time
start_time = time.time()

# loads farms raw dataset
farms_df  = excel_data_object.get_excel_data(farms_data )

print("Time to loads farms data: ",round((time.time() - start_time)/60,5),"minutes")

Time to loads farms data:  0.00057 minutes


In [17]:
# visualize farms raw dataset
farms_df.head()

Unnamed: 0,ID,Farm Name
0,45,Tel Dan
1,54,Urkish
2,62,Lehi
3,83,Shur
4,154,Havilah


### Load Producers Raw Dataset

In [18]:
# start time
start_time = time.time()

# loads producers raw dataset
producers_df  =  excel_data_object.get_excel_data(producers_data)

print("Time to load producers: ",round((time.time() - start_time)/60,5),"minutes")

Time to load producers:  0.00043 minutes


In [19]:
# visualize producers raw dataset
producers_df.head()

Unnamed: 0,ID,Producer
0,201100,Alderaan
1,318820,Yavin IV
2,351986,Hoth
3,240822,Dagobah
4,11959,Bespin


### Load Varieties Raw Dataset

In [20]:
# start time
start_time = time.time()

# loads varieties raw dataset
varieties_df  = excel_data_object.get_excel_data(varieties_data)

print("Time to connect to the DB: ",round((time.time() - start_time)/60,5),"minutes")

Time to connect to the DB:  0.00048 minutes


In [21]:
# visualize varieties raw dataset
varieties_df.head()

Unnamed: 0,ID,Variety Group,Variety Code,Commodity Code
0,45,AGN,AGN,OR
1,965,ALN,ALN,OR
2,937,ANL,ANL,OR
3,610,BAR,BAR,GR
4,686,BIN,ELL,SC


### Load Vessels Raw Dataset

In [22]:
# start time
start_time = time.time()

# loads vessels raw dataset
vessels_df = excel_data_object.get_excel_data(vessels_data)

print("Time to connect to the DB: ",round((time.time() - start_time)/60,5),"minutes")

Time to connect to the DB:  0.0009 minutes


In [23]:
# visualize varieties raw dataset
vessels_df.head()

Unnamed: 0,ID,Vessel
0,4271,ADRIAN SCHULTE
1,1347,ALEXANDRA
2,2653,ALMAVIVA
3,4496,ALS Venus
4,6744,AMANDA D


In [8]:
# define server name and the database name
server = "DESKTOP-E5PL80T\SQLEXPRESS"
database = "Atom_Assessment_2021"
#self.driver =  "ODBC Driver 17 for SQL Server"

# define connection string
conn = pyodbc.connect('DRIVER={ODBC Driver 17 for SQL Server}; \
                       SERVER=' + server + '; \
                       DATABASE=' + database +';\
                       Trusted_Connection=yes;')

# create the connection cursor
cursor = conn.cursor()


<pyodbc.Connection at 0x1faf6b972a0>