### Import Dependencies

In [15]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

from sqlalchemy import create_engine
from dotenv import load_dotenv            # pip install python-dotenv
import os

import psycopg2                           # pip install psycopg2-binary

### Load Data

In [16]:
# Configure the connection to PostgreSQL

# load_dotenv() makes variables from a local .env file available via os.getenv
load_dotenv()
db_password  = os.getenv("db_password")
if db_password is None:
    # Fail fast: without this guard the f-string below would embed the literal
    # text "None" as the password and the error would only surface later,
    # on the first query against the database.
    raise RuntimeError(
        "Environment variable 'db_password' is not set; "
        "define it in your .env file or shell before running this notebook."
    )

# NOTE: if the password contains characters such as '@', ':' or '/', it must be
# URL-encoded (e.g. with urllib.parse.quote_plus) before being placed in the URL.
db_string = f"postgresql://postgres:{db_password}@127.0.0.1:5432/Winedemic"

# create_engine only builds the Engine object; no connection is opened until
# the engine is first used, so the message below does not prove connectivity.
engine = create_engine(db_string)

print("Python is connected to PostgreSQL: ", engine)

Python is connected to PostgreSQL:  Engine(postgresql://postgres:***@127.0.0.1:5432/Winedemic)


In [17]:
# Read the complete yearly tables from PostgreSQL, one DataFrame per year
# (2019, 2020 and 2021), all through the shared `engine`.
df_2019 = pd.read_sql_table("complete_2019", engine)
df_2020 = pd.read_sql_table("complete_2020", engine)
df_2021 = pd.read_sql_table("complete_2021", engine)

In [18]:
# Preliminary data: read the 2020 annual CSV export and preview its first rows
file = '../data_frame/Complete_annual_dataframes/Complete_2020.csv'
df = pd.read_csv(filepath_or_buffer=file)
df.head(n=5)

Unnamed: 0.1,Unnamed: 0,Order Number,Company Name,Ship Date,City,State,Zip,Shipping Service,Created Date,Weight,Item/Bottle Count
0,0,0,Ecom3,1/2/20,Plantation,FL,33325,UPS,1/1/20,9.9,3.0
1,1,1,Ecom3,1/2/20,Lafayette,CA,94549,UPS,12/31/19,35.0,10.0
2,2,2,Ecom3,1/2/20,New York,NY,10022,FEH,1/1/20,29.0,8.0
3,3,3,Ecom3,1/3/20,New York,NY,10006,FEH,1/2/20,21.5,6.0
4,4,4,Ecom3,1/2/20,Walkersville,MD,21793,FXG,12/31/19,41.0,12.0


### Preprocess data for ML Model:
- Target: Item/Bottle Count
- Features: Company Name, City, State, Zip, Shipping Service, Created Date (split into Year, Month, Day)
    - Columns to Drop:
        - Unnamed: 0
        - Order Number
        - Ship Date
        - Weight
    - Custom-encode Company Name (each company is mapped to a fixed integer, 1 through 6)
    - Columns to Encode:
        - City
        - State
        - Shipping Service
    - Split Created Date into Year, Month, Day columns (model cannot handle datetime datatype)

### Create Preprocessing Function

In [19]:
def data_preprocessing(df):
    """Turn a raw annual shipment DataFrame into an all-numeric frame for the ML model.

    The input frame is left untouched; all work is done on a copy, so the
    function can be called repeatedly on the same raw data.

    Steps:
      1. Map 'Company Name' to a fixed integer code (1 through 6).
      2. Label-encode 'City', 'State' and 'Shipping Service'. The encoders are
         fitted on the data passed in, so the resulting codes are only
         comparable within a single call.
      3. Parse 'Created Date' and split it into 'Year', 'Month' and 'Day'.
      4. Drop 'Unnamed: 0', 'Order Number', 'Ship Date', 'Weight' and
         'Created Date'.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data containing every column named in the steps above.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns, the added 'Year', 'Month' and
        'Day' columns, and the dropped columns removed.

    Raises
    ------
    KeyError
        If 'Company Name' holds a value that is not one of the six known
        companies, or if a required column is missing.
    """
    # Work on a copy so the caller's DataFrame is never modified in place.
    df = df.copy()

    # Label each company as integers 1 through 6
    company_num = {
        "Ecom1": 1,
        "Ecom2": 2,
        "Ecom3": 3,
        "Winery1": 4,
        "Winery2": 5,
        "Winery3": 6,
    }
    company_codes = df['Company Name'].map(company_num)
    # .map yields NaN for anything missing from the dictionary (including
    # missing values); report all offenders at once instead of the first one.
    unknown = df.loc[company_codes.isna(), 'Company Name'].unique()
    if len(unknown) > 0:
        raise KeyError(f"Unknown 'Company Name' value(s): {list(unknown)}")
    df['Company Name'] = company_codes.astype('int64')

    # Encode 'City', 'State', and 'Shipping Service' columns.
    # fit_transform refits on every call, so each column gets its own codes.
    for column in ('City', 'State', 'Shipping Service'):
        df[column] = LabelEncoder().fit_transform(df[column])

    # Convert 'Created Date' into three columns (Year, Month, Day).
    # infer_datetime_format is deprecated since pandas 2.0 (format inference
    # is the default behaviour there), so it is no longer passed.
    created = pd.to_datetime(df['Created Date'])
    df['Year'] = created.dt.year
    df['Month'] = created.dt.month
    df['Day'] = created.dt.day

    # Drop identifier columns, unused columns, and the raw date column that
    # has now been replaced by Year/Month/Day.
    df = df.drop(
        columns=['Unnamed: 0', 'Order Number', 'Ship Date', 'Weight', 'Created Date']
    )

    return df

In [20]:
# Rebind `df` from the raw CSV frame to its preprocessed version.
# NOTE: re-running this cell without first re-running the CSV load cell raises
# KeyError, because 'Company Name' then already holds integer codes.
df = data_preprocessing(df)

In [21]:
# Preview the first rows of the preprocessed frame
df.head()

Unnamed: 0,Company Name,City,State,Zip,Shipping Service,Item/Bottle Count,Year,Month,Day
0,3,7981,9,33325,14,3.0,2020,1,1
1,3,5188,4,94549,14,10.0,2019,12,31
2,3,6939,34,10022,1,8.0,2020,1,1
3,3,6939,34,10006,1,6.0,2020,1,2
4,3,10666,20,21793,4,12.0,2019,12,31


In [23]:
# Optional export: uncomment the next line to save the preprocessed frame as CSV.
# df.to_csv(r'preprocessed_data.csv', index=False)