In [None]:
from urllib.parse import quote

import pandas as pd
from sqlalchemy import create_engine, inspect

from config import password, username

# Extract

In [None]:
# Extract: read the customer records from the CSV file into a DataFrame
# and preview the first rows.
csv_path = "../Resources/customers.csv"
customer_df = pd.read_csv(filepath_or_buffer=csv_path)
customer_df.head(n=5)

In [None]:
# Extract: read the address records from the JSON file into a DataFrame
# and preview the first rows.
json_path = "../Resources/addresses.json"
address_df = pd.read_json(path_or_buf=json_path)
address_df.head(n=5)

# Transform

## EDA
* This is only a starting point — you will need to explore the data far more thoroughly than I have here

In [None]:
# Print the column names, non-null counts and dtypes of the customer frame.
customer_df.info()

In [None]:
# Print the column names, non-null counts and dtypes of the address frame.
address_df.info()

## Data Cleaning
* This is only a starting point — you will need to clean the data far more thoroughly than I have here

In [None]:
# Split the comma-separated `name` column into `last_name` / `first_name`
# (the text before the first comma is treated as the last name).
#
# * n=1 splits on the first comma only, so a value containing an extra comma
#   still yields exactly two columns; an unbounded split would produce three
#   and make the two-column assignment raise.
# * Stripping whitespace keeps a value such as "Smith, John" from producing
#   " John", which would never match the customer names used as merge keys.
address_df[["last_name", "first_name"]] = (
    address_df["name"]
    .str.split(",", n=1, expand=True)
    .apply(lambda part: part.str.strip())
)
address_df.head()

In [None]:
# Inner-join customers to addresses on the (first_name, last_name) pair,
# then preview the combined frame.
merged_df = customer_df.merge(
    address_df,
    how="inner",
    on=["first_name", "last_name"],
)
merged_df.head(n=5)

## Get data into separate dataframes to match tables
* Make sure to include primary keys!

In [None]:
# Largest number of rows sharing one (first_name, last_name) pair;
# a result of 1 means no name pair is repeated.
max(merged_df.value_counts(subset=["first_name", "last_name"]).tolist())

In [None]:
# Move the row index into a regular "index" column (renamed to "customer_id"
# in the next step). The guard makes the cell safe to re-run: without it, a
# second execution would insert another index column and leave the frame with
# conflicting id columns.
if not {"index", "customer_id"} & set(merged_df.columns):
    merged_df.reset_index(inplace=True)

In [None]:
# Give the former index column its primary-key name.
merged_df = merged_df.rename(columns={"index": "customer_id"})

In [None]:
# Preview the merged frame, now carrying the customer_id column.
merged_df.head(n=5)

In [None]:
# Carve the merged frame into one DataFrame per target table; each keeps
# customer_id so the rows can be related back to the customer.
names_df = merged_df.loc[:, ["customer_id", "first_name", "last_name"]]
location_df = merged_df.loc[:, ["customer_id", "address", "us_state"]]
email_df = merged_df.loc[:, ["customer_id", "email"]]

In [None]:
# Largest number of rows sharing one (address, us_state) pair;
# a result of 1 means no location is repeated.
max(location_df.value_counts(subset=["address", "us_state"]).tolist())

In [None]:
# Add a "location_id" key taken from the row index.
# The frame is rebuilt from merged_df rather than from location_df itself, so
# re-running this cell gives the same result instead of stacking a second
# "location_id" column onto the previous output.
location_df = (
    merged_df[["customer_id", "address", "us_state"]]
    .reset_index()
    .rename(columns={"index": "location_id"})
)
location_df.head()

In [None]:
# Largest number of rows sharing one email value;
# a result of 1 means no email is repeated.
max(email_df["email"].value_counts().tolist())

In [None]:
# Add an "email_id" key taken from the row index.
# The frame is rebuilt from merged_df rather than from email_df itself, so
# re-running this cell gives the same result instead of stacking a second
# "email_id" column onto the previous output.
email_df = (
    merged_df[["customer_id", "email"]]
    .reset_index()
    .rename(columns={"index": "email_id"})
)
email_df.head()

# Load

## Connect to local database

In [None]:
# Build the connection URL for the local customer_db Postgres database.
# The credentials are percent-encoded so that characters such as "@", ":" or
# "/" in the username or password are not mistaken for URL delimiters when
# SQLAlchemy parses the string.
rds_connection_string = (
    f"{quote(str(username), safe='')}:{quote(str(password), safe='')}"
    "@localhost:5432/customer_db"
)
engine = create_engine(f'postgresql://{rds_connection_string}')

## Check for tables

In [None]:
# List the tables visible through this engine.
# Engine.table_names() was deprecated in SQLAlchemy 1.4 and removed in 2.0;
# the Inspector API is the supported replacement.
inspect(engine).get_table_names()

## Export data from dataframes to tables

In [None]:
# Append the name rows to the customer_name table; the DataFrame index is
# not written as a column.
names_df.to_sql(
    name='customer_name',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the location rows to the customer_location table; the DataFrame
# index is not written as a column.
location_df.to_sql(
    name='customer_location',
    con=engine,
    if_exists='append',
    index=False,
)

In [None]:
# Append the email rows to the customer_email table; the DataFrame index is
# not written as a column.
email_df.to_sql(
    name='customer_email',
    con=engine,
    if_exists='append',
    index=False,
)

## Confirm data has been added by querying tables
* NOTE: can also check using pgAdmin

In [None]:
# Read the customer_name table back from the database and preview it.
(
    pd.read_sql_query(sql='select * from customer_name', con=engine)
    .head(n=5)
)

In [None]:
# Read the customer_location table back from the database and preview it.
(
    pd.read_sql_query(sql='select * from customer_location', con=engine)
    .head(n=5)
)

In [None]:
# Read the customer_email table back from the database and preview it.
(
    pd.read_sql_query(sql='select * from customer_email', con=engine)
    .head(n=5)
)