# Data Staging
### Extract, Transform, Load (ETL)

In [41]:
import pandas as pd

In [42]:
# Read the csv file into a pandas dataframe
df= pd.read_csv('Cookie Company Financials.csv')

In [43]:
df

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Month Name,Year
0,Canada,Chocolate Chip,292.0,$5.00,$2.00,01-02-2019,2,February,2019
1,Mexico,Chocolate Chip,974.0,$5.00,$2.00,01-02-2019,2,February,2019
2,Canada,Chocolate Chip,2518.0,$5.00,$2.00,01-06-2019,6,June,2019
3,Germany,Chocolate Chip,1006.0,$5.00,$2.00,01-06-2019,6,June,2019
4,Germany,Chocolate Chip,367.0,$5.00,$2.00,01-07-2019,7,July,2019
...,...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826.0,$6.00,$2.75,01-05-2019,5,May,2019
696,France,White Chocolate Macadamia Nut,663.0,$6.00,$2.75,01-09-2019,9,September,2019
697,United States,White Chocolate Macadamia Nut,2574.0,$6.00,$2.75,01-11-2018,11,November,2018
698,United States,White Chocolate Macadamia Nut,2438.0,$6.00,$2.75,01-12-2018,12,December,2018


In [44]:
# Print a Series with the data type of each column
print(df.dtypes)

Country                object
Product                object
Units Sold            float64
Revenue per cookie     object
Cost per cookie        object
Date                   object
Month Number            int64
Month Name             object
Year                    int64
dtype: object


**Transform**

In [45]:
# Convert the 'Date' column to a datetime object
df['Date'] = pd.to_datetime(df['Date'])

df[:25]

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Month Name,Year
0,Canada,Chocolate Chip,292.0,$5.00,$2.00,2019-01-02,2,February,2019
1,Mexico,Chocolate Chip,974.0,$5.00,$2.00,2019-01-02,2,February,2019
2,Canada,Chocolate Chip,2518.0,$5.00,$2.00,2019-01-06,6,June,2019
3,Germany,Chocolate Chip,1006.0,$5.00,$2.00,2019-01-06,6,June,2019
4,Germany,Chocolate Chip,367.0,$5.00,$2.00,2019-01-07,7,July,2019
5,Mexico,Chocolate Chip,883.0,$5.00,$2.00,2019-01-08,8,August,2019
6,France,Chocolate Chip,549.0,$5.00,$2.00,2018-01-09,9,September,2018
7,Mexico,Chocolate Chip,788.0,$5.00,$2.00,2018-01-09,9,September,2018
8,Mexico,Chocolate Chip,2472.0,$5.00,$2.00,2019-01-09,9,September,2019
9,United States,Chocolate Chip,1143.0,$5.00,$2.00,2019-01-10,10,October,2019


In [46]:
# Convert the 'Units Sold' column to an integer
df['Units Sold'] = df['Units Sold'].astype(int)

df[:25]

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Month Name,Year
0,Canada,Chocolate Chip,292,$5.00,$2.00,2019-01-02,2,February,2019
1,Mexico,Chocolate Chip,974,$5.00,$2.00,2019-01-02,2,February,2019
2,Canada,Chocolate Chip,2518,$5.00,$2.00,2019-01-06,6,June,2019
3,Germany,Chocolate Chip,1006,$5.00,$2.00,2019-01-06,6,June,2019
4,Germany,Chocolate Chip,367,$5.00,$2.00,2019-01-07,7,July,2019
5,Mexico,Chocolate Chip,883,$5.00,$2.00,2019-01-08,8,August,2019
6,France,Chocolate Chip,549,$5.00,$2.00,2018-01-09,9,September,2018
7,Mexico,Chocolate Chip,788,$5.00,$2.00,2018-01-09,9,September,2018
8,Mexico,Chocolate Chip,2472,$5.00,$2.00,2019-01-09,9,September,2019
9,United States,Chocolate Chip,1143,$5.00,$2.00,2019-01-10,10,October,2019


Removing $ asign and change data types

In [47]:
# The type of the 'Revenue per cookie' column is object. Convert it to a float
# And remove the dollar sign
df['Revenue per cookie'] = df['Revenue per cookie'].str.replace('$','').astype('float')

In [48]:
df['Cost per cookie'] = df['Cost per cookie'].str.replace('$','').astype('float')

In [49]:
df[:25]

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Month Name,Year
0,Canada,Chocolate Chip,292,5.0,2.0,2019-01-02,2,February,2019
1,Mexico,Chocolate Chip,974,5.0,2.0,2019-01-02,2,February,2019
2,Canada,Chocolate Chip,2518,5.0,2.0,2019-01-06,6,June,2019
3,Germany,Chocolate Chip,1006,5.0,2.0,2019-01-06,6,June,2019
4,Germany,Chocolate Chip,367,5.0,2.0,2019-01-07,7,July,2019
5,Mexico,Chocolate Chip,883,5.0,2.0,2019-01-08,8,August,2019
6,France,Chocolate Chip,549,5.0,2.0,2018-01-09,9,September,2018
7,Mexico,Chocolate Chip,788,5.0,2.0,2018-01-09,9,September,2018
8,Mexico,Chocolate Chip,2472,5.0,2.0,2019-01-09,9,September,2019
9,United States,Chocolate Chip,1143,5.0,2.0,2019-01-10,10,October,2019


**Drop Unrequired columns**

In [50]:
df = df.drop(columns = ['Month Name'])

In [51]:
df

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Year
0,Canada,Chocolate Chip,292,5.0,2.00,2019-01-02,2,2019
1,Mexico,Chocolate Chip,974,5.0,2.00,2019-01-02,2,2019
2,Canada,Chocolate Chip,2518,5.0,2.00,2019-01-06,6,2019
3,Germany,Chocolate Chip,1006,5.0,2.00,2019-01-06,6,2019
4,Germany,Chocolate Chip,367,5.0,2.00,2019-01-07,7,2019
...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826,6.0,2.75,2019-01-05,5,2019
696,France,White Chocolate Macadamia Nut,663,6.0,2.75,2019-01-09,9,2019
697,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,2018-01-11,11,2018
698,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,2018-01-12,12,2018


**Generate measures/facts**

In [52]:
df['Revenue'] = df['Units Sold'] * df['Revenue per cookie']
df['Cost'] = df['Units Sold'] * df['Cost per cookie']
df['Profit'] = df['Revenue'] - df['Cost']

In [53]:
df

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month Number,Year,Revenue,Cost,Profit
0,Canada,Chocolate Chip,292,5.0,2.00,2019-01-02,2,2019,1460.0,584.00,876.00
1,Mexico,Chocolate Chip,974,5.0,2.00,2019-01-02,2,2019,4870.0,1948.00,2922.00
2,Canada,Chocolate Chip,2518,5.0,2.00,2019-01-06,6,2019,12590.0,5036.00,7554.00
3,Germany,Chocolate Chip,1006,5.0,2.00,2019-01-06,6,2019,5030.0,2012.00,3018.00
4,Germany,Chocolate Chip,367,5.0,2.00,2019-01-07,7,2019,1835.0,734.00,1101.00
...,...,...,...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826,6.0,2.75,2019-01-05,5,2019,16956.0,7771.50,9184.50
696,France,White Chocolate Macadamia Nut,663,6.0,2.75,2019-01-09,9,2019,3978.0,1823.25,2154.75
697,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,2018-01-11,11,2018,15444.0,7078.50,8365.50
698,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,2018-01-12,12,2018,14628.0,6704.50,7923.50


In [54]:
df = df.rename({'Month Number':'Month'}, axis='columns')

In [55]:
df.head()

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month,Year,Revenue,Cost,Profit
0,Canada,Chocolate Chip,292,5.0,2.0,2019-01-02,2,2019,1460.0,584.0,876.0
1,Mexico,Chocolate Chip,974,5.0,2.0,2019-01-02,2,2019,4870.0,1948.0,2922.0
2,Canada,Chocolate Chip,2518,5.0,2.0,2019-01-06,6,2019,12590.0,5036.0,7554.0
3,Germany,Chocolate Chip,1006,5.0,2.0,2019-01-06,6,2019,5030.0,2012.0,3018.0
4,Germany,Chocolate Chip,367,5.0,2.0,2019-01-07,7,2019,1835.0,734.0,1101.0


In [56]:
# Print the unique values of the 'Country' column
unique_countries = df['Country'].unique()

print(unique_countries)

['Canada' 'Mexico' 'Germany' 'France' 'United States']


**Extract country population**

In [57]:
df_country = pd.read_csv('Country Population.csv')

In [58]:
# Filter the DataFrame to include only the rows for the unique countries
filtered_country_df = df_country[df_country['Country'].isin(unique_countries)]

In [59]:
filtered_country_df

Unnamed: 0,Country,Population
2,United States,335893238
9,Mexico,129406736
17,Germany,84607016
19,France,68373000
35,Canada,40528396


In [60]:
# Reset the index of the filtered DataFrame
filtered_country_df = filtered_country_df.reset_index(drop=True)


In [61]:
filtered_country_df

Unnamed: 0,Country,Population
0,United States,335893238
1,Mexico,129406736
2,Germany,84607016
3,France,68373000
4,Canada,40528396


## Dataset Integration (Enrich Sales Data)

In [62]:
result = pd.merge(df, df_country, how="left", on=["Country"]) # Merge the two DataFrames on the 'Country' column
result

Unnamed: 0,Country,Product,Units Sold,Revenue per cookie,Cost per cookie,Date,Month,Year,Revenue,Cost,Profit,Population
0,Canada,Chocolate Chip,292,5.0,2.00,2019-01-02,2,2019,1460.0,584.00,876.00,40528396
1,Mexico,Chocolate Chip,974,5.0,2.00,2019-01-02,2,2019,4870.0,1948.00,2922.00,129406736
2,Canada,Chocolate Chip,2518,5.0,2.00,2019-01-06,6,2019,12590.0,5036.00,7554.00,40528396
3,Germany,Chocolate Chip,1006,5.0,2.00,2019-01-06,6,2019,5030.0,2012.00,3018.00,84607016
4,Germany,Chocolate Chip,367,5.0,2.00,2019-01-07,7,2019,1835.0,734.00,1101.00,84607016
...,...,...,...,...,...,...,...,...,...,...,...,...
695,France,White Chocolate Macadamia Nut,2826,6.0,2.75,2019-01-05,5,2019,16956.0,7771.50,9184.50,68373000
696,France,White Chocolate Macadamia Nut,663,6.0,2.75,2019-01-09,9,2019,3978.0,1823.25,2154.75,68373000
697,United States,White Chocolate Macadamia Nut,2574,6.0,2.75,2018-01-11,11,2018,15444.0,7078.50,8365.50,335893238
698,United States,White Chocolate Macadamia Nut,2438,6.0,2.75,2018-01-12,12,2018,14628.0,6704.50,7923.50,335893238


**Check for null values after merge**

In [63]:
result.isnull().sum()

Country               0
Product               0
Units Sold            0
Revenue per cookie    0
Cost per cookie       0
Date                  0
Month                 0
Year                  0
Revenue               0
Cost                  0
Profit                0
Population            0
dtype: int64

In [64]:
result.dtypes

Country                       object
Product                       object
Units Sold                     int64
Revenue per cookie           float64
Cost per cookie              float64
Date                  datetime64[ns]
Month                          int64
Year                           int64
Revenue                      float64
Cost                         float64
Profit                       float64
Population                    object
dtype: object

**Extract covid data**

In [65]:
df_covid = pd.read_csv('covid-data.csv')

In [66]:
df_covid

Unnamed: 0,iso_code,continent,location,date,total_cases,new_cases,new_cases_smoothed,total_deaths,new_deaths,new_deaths_smoothed,...,male_smokers,handwashing_facilities,hospital_beds_per_thousand,life_expectancy,human_development_index,population,excess_mortality_cumulative_absolute,excess_mortality_cumulative,excess_mortality,excess_mortality_cumulative_per_million
0,AFG,Asia,Afghanistan,2020-01-05,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
1,AFG,Asia,Afghanistan,2020-01-06,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
2,AFG,Asia,Afghanistan,2020-01-07,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
3,AFG,Asia,Afghanistan,2020-01-08,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
4,AFG,Asia,Afghanistan,2020-01-09,,0.0,,,0.0,,...,,37.746,0.5,64.83,0.511,41128772.0,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
373400,ZWE,Africa,Zimbabwe,2024-01-17,266202.0,0.0,12.143,5737.0,0.0,0.429,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
373401,ZWE,Africa,Zimbabwe,2024-01-18,266202.0,0.0,12.143,5737.0,0.0,0.429,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
373402,ZWE,Africa,Zimbabwe,2024-01-19,266202.0,0.0,12.143,5737.0,0.0,0.429,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,
373403,ZWE,Africa,Zimbabwe,2024-01-20,266202.0,0.0,12.143,5737.0,0.0,0.429,...,30.7,36.791,1.7,61.49,0.571,16320539.0,,,,


In [67]:
df_covid.columns

Index(['iso_code', 'continent', 'location', 'date', 'total_cases', 'new_cases',
       'new_cases_smoothed', 'total_deaths', 'new_deaths',
       'new_deaths_smoothed', 'total_cases_per_million',
       'new_cases_per_million', 'new_cases_smoothed_per_million',
       'total_deaths_per_million', 'new_deaths_per_million',
       'new_deaths_smoothed_per_million', 'reproduction_rate', 'icu_patients',
       'icu_patients_per_million', 'hosp_patients',
       'hosp_patients_per_million', 'weekly_icu_admissions',
       'weekly_icu_admissions_per_million', 'weekly_hosp_admissions',
       'weekly_hosp_admissions_per_million', 'total_tests', 'new_tests',
       'total_tests_per_thousand', 'new_tests_per_thousand',
       'new_tests_smoothed', 'new_tests_smoothed_per_thousand',
       'positive_rate', 'tests_per_case', 'tests_units', 'total_vaccinations',
       'people_vaccinated', 'people_fully_vaccinated', 'total_boosters',
       'new_vaccinations', 'new_vaccinations_smoothed',
       't

**Transform**

In [68]:
# Select the desired columns
df_covid = df_covid[['location','total_cases','date']]

In [69]:
print(df_covid.dtypes)

location        object
total_cases    float64
date            object
dtype: object


In [70]:
df['Country'].unique()

array(['Canada', 'Mexico', 'Germany', 'France', 'United States'],
      dtype=object)

In [71]:
# Check the number of null values
df_covid['total_cases'].isnull().sum() 

38736

In [72]:
# Drop the null values
df_cases = df_covid[['location','total_cases','date']]
df_cases = df_cases.dropna()

In [73]:
df_cases

Unnamed: 0,location,total_cases,date
56,Afghanistan,1.0,2020-03-01
57,Afghanistan,1.0,2020-03-02
58,Afghanistan,1.0,2020-03-03
59,Afghanistan,1.0,2020-03-04
60,Afghanistan,1.0,2020-03-05
...,...,...,...
373400,Zimbabwe,266202.0,2024-01-17
373401,Zimbabwe,266202.0,2024-01-18
373402,Zimbabwe,266202.0,2024-01-19
373403,Zimbabwe,266202.0,2024-01-20


In [74]:
# to convert date to datetime
df_cases['date'] = pd.to_datetime(df_cases['date'])

In [75]:
df_cases

Unnamed: 0,location,total_cases,date
56,Afghanistan,1.0,2020-03-01
57,Afghanistan,1.0,2020-03-02
58,Afghanistan,1.0,2020-03-03
59,Afghanistan,1.0,2020-03-04
60,Afghanistan,1.0,2020-03-05
...,...,...,...
373400,Zimbabwe,266202.0,2024-01-17
373401,Zimbabwe,266202.0,2024-01-18
373402,Zimbabwe,266202.0,2024-01-19
373403,Zimbabwe,266202.0,2024-01-20


In [76]:
df_cases.reset_index(inplace=True) # reset index
df_cases = df_cases.drop(columns=['index']) # drop the old index column
df_cases

Unnamed: 0,location,total_cases,date
0,Afghanistan,1.0,2020-03-01
1,Afghanistan,1.0,2020-03-02
2,Afghanistan,1.0,2020-03-03
3,Afghanistan,1.0,2020-03-04
4,Afghanistan,1.0,2020-03-05
...,...,...,...
334664,Zimbabwe,266202.0,2024-01-17
334665,Zimbabwe,266202.0,2024-01-18
334666,Zimbabwe,266202.0,2024-01-19
334667,Zimbabwe,266202.0,2024-01-20


In [77]:
df_cases['Month'] = df_cases['date'].dt.month # extract month
df_cases['Year'] = df_cases['date'].dt.year # extract year

In [78]:
df_cases

Unnamed: 0,location,total_cases,date,Month,Year
0,Afghanistan,1.0,2020-03-01,3,2020
1,Afghanistan,1.0,2020-03-02,3,2020
2,Afghanistan,1.0,2020-03-03,3,2020
3,Afghanistan,1.0,2020-03-04,3,2020
4,Afghanistan,1.0,2020-03-05,3,2020
...,...,...,...,...,...
334664,Zimbabwe,266202.0,2024-01-17,1,2024
334665,Zimbabwe,266202.0,2024-01-18,1,2024
334666,Zimbabwe,266202.0,2024-01-19,1,2024
334667,Zimbabwe,266202.0,2024-01-20,1,2024
