# Clean Data for Power BI

In [67]:
from pathlib import Path

import pandas as pd

# Load the sales data from Sales.csv (path is relative to the working directory).
sales_data = pd.read_csv('./Sales.csv')

# Sanity-check the load: column index, then the first five rows, then the row count.
print(sales_data.columns, sales_data.head(), sep='\n')
print(f"Number of rows: {len(sales_data)}")

Index(['ProductID', 'Date', 'Zip', 'Units', 'Revenue'], dtype='object')
   ProductID        Date    Zip  Units   Revenue
0       1076  2015-01-20  72638      1  254.5725
1       1076  2015-01-21  47577      1  254.5725
2       1076  2015-01-28  34653      1  254.5725
3       1076  2015-01-31  84014      1  254.5725
4       1076  2015-02-01  75070      1  254.5725
Number of rows: 4198753


In [68]:
# Total the Revenue column, then render it with thousands separators and
# two decimal places for display. (The printed label is in Spanish.)
total_revenue = sales_data['Revenue'].sum()
formatted_revenue = format(total_revenue, ',.2f')
print(f"Total de ganancias (Revenue): ${formatted_revenue}")

Total de ganancias (Revenue): $1,697,110,399.53


In [69]:
# Load the 'product' and 'geo' dimension sheets from the workbook.
# Each pair is (sheet name, 0-based row that holds the column headers);
# the rows above the header row are skipped by read_excel.
product_df, geo_df = (
    pd.read_excel('./bi_dimensions.xlsx', sheet_name=sheet, header=header_row)
    for sheet, header_row in (('product', 1), ('geo', 3))
)

In [70]:
# The 'manufacturer' sheet stores one field per row (ids, names, logos) rather
# than one per column, so it is read without a header row and reshaped by hand.
manu_raw = pd.read_excel('./bi_dimensions.xlsx', sheet_name='manufacturer', header=None)
# Drop rows, then columns, that are completely empty.
manu_raw = manu_raw.dropna(how='all', axis=0)
manu_raw = manu_raw.dropna(how='all', axis=1)

# Positional rows after trimming:
#   0 -> placeholder headers ('Column1', ...)
#   1 -> 'ManufacturerID' followed by the ids
#   2 -> manufacturer names
#   3 -> logo URLs
# The first cell of each row is skipped; values start at the second cell.
ids, names, logos = (manu_raw.iloc[row, 1:] for row in (1, 2, 3))

# Assemble one row per manufacturer; .values discards the source column
# labels so the resulting frame gets a fresh 0..n-1 index.
manufacturer_df = pd.DataFrame(dict(zip(
    ('ManufacturerID', 'Manufacturer', 'Logo'),
    (ids.values, names.values, logos.values),
)))

In [71]:
# Preview the first five rows of the product table.
product_df.head(n=5)

Unnamed: 0,ProductID,Product,Category,ManufacturerID,Price
0,1,Abbas MA-01|All Season,Mix,1,USD 412.13
1,2,Abbas MA-02|All Season,,1,USD 329.78
2,3,Abbas MA-03|All Season,,1,USD 963.38
3,4,Abbas MA-04|All Season,,1,USD 828.98
4,5,Abbas MA-05|All Season,,1,USD 745.5


In [72]:
# Strip the 'USD' currency prefix from Price and convert the column to float.
#
# The dtype guard makes this cell safe to re-run: once Price has been
# converted to a numeric dtype, the .str accessor would raise AttributeError,
# so the parse is skipped when there is nothing left to clean.
# regex=False makes 'USD' a literal match regardless of the pandas version's
# default for str.replace.
if not pd.api.types.is_numeric_dtype(product_df['Price']):
    product_df['Price'] = (
        product_df['Price']
        .str.replace('USD', '', regex=False)
        .str.strip()
        .astype(float)
    )

# NOTE(review): in the preview, Category is populated only on the first row and
# blank on the rows after it. This looks like merged cells in the source sheet;
# if so, a forward fill (product_df['Category'].ffill()) is probably needed
# before export -- confirm against the workbook.
product_df.head()

Unnamed: 0,ProductID,Product,Category,ManufacturerID,Price
0,1,Abbas MA-01|All Season,Mix,1,412.13
1,2,Abbas MA-02|All Season,,1,329.78
2,3,Abbas MA-03|All Season,,1,963.38
3,4,Abbas MA-04|All Season,,1,828.98
4,5,Abbas MA-05|All Season,,1,745.5


In [73]:
# Preview the first five rows of the reshaped manufacturer table.
manufacturer_df.head(n=5)

Unnamed: 0,ManufacturerID,Manufacturer,Logo
0,1,Abbas,https://raw.githubusercontent.com/CharlesSterl...
1,2,Aliqui,https://raw.githubusercontent.com/CharlesSterl...
2,3,Barba,https://raw.githubusercontent.com/CharlesSterl...
3,4,Currus,https://raw.githubusercontent.com/CharlesSterl...
4,5,Fama,https://raw.githubusercontent.com/CharlesSterl...


In [74]:
# Preview the first five rows of the geography table.
geo_df.head(n=5)

Unnamed: 0,Zip,City,State,Region,District,Country
0,22654,"Star Tannery, VA, USA",VA,East,District #07,USA
1,22655,"Stephens City, VA, USA",VA,East,District #07,USA
2,22656,"Stephenson, VA, USA",VA,East,District #07,USA
3,22657,"Strasburg, VA, USA",VA,East,District #07,USA
4,22660,"Toms Brook, VA, USA",VA,East,District #07,USA


In [75]:
# Export the three dimension tables as CSV files, without the DataFrame index.
output_dir = Path('./output')
# to_csv does not create missing parent directories, so make sure the target
# folder exists; exist_ok keeps the cell safe to re-run.
output_dir.mkdir(parents=True, exist_ok=True)

product_df.to_csv(output_dir / 'product.csv', index=False)
manufacturer_df.to_csv(output_dir / 'manufacturer.csv', index=False)
geo_df.to_csv(output_dir / 'geo.csv', index=False)