In [17]:
from pathlib import Path

import pandas as pd

# Directory holding the raw CSV files, relative to the working directory.
# Defined once so the location can be changed in a single place.
RAW_DATA_DIR = Path('../data/raw')

# Load each raw table; these frames are kept exactly as read from disk.
brands = pd.read_csv(RAW_DATA_DIR / 'brands.csv')
orderlines = pd.read_csv(RAW_DATA_DIR / 'orderlines.csv')
orders = pd.read_csv(RAW_DATA_DIR / 'orders.csv')
products = pd.read_csv(RAW_DATA_DIR / 'products.csv')

# Work on independent copies so the frames loaded above stay unmodified
# and exploration can be restarted without re-reading the files.
brands_df = brands.copy()
orderlines_df = orderlines.copy()
orders_df = orders.copy()
products_df = products.copy()

In [18]:
# Dimensions of the brands table as a (rows, columns) tuple.
brands_df.shape

(187, 2)

In [19]:
# Unpack the (rows, columns) shape tuple in one step, then report each dimension.
nrows, ncols = brands_df.shape
print("The number of rows is", nrows)
print("The number of columns is", ncols)

The number of rows is 187
The number of columns is 2


In [20]:
# Total number of cells (rows x columns) in the brands table.
brands_df.size

374

In [21]:
# Sanity check: `size` counts every cell, so it must equal rows x columns.
len(brands_df.index) * len(brands_df.columns) == brands_df.size

True

In [22]:
# Number of axes of the object; 2 for a DataFrame (rows and columns).
brands_df.ndim

2

In [23]:
# Preview up to the first 20 rows to see what the short/long values look like.
brands_df.head(20)

Unnamed: 0,short,long
0,8MO,8Mobility
1,ACM,Acme
2,ADN,Adonit
3,AII,Aiino
4,AKI,Akitio
5,ALL,Allocacoc
6,AP2,Apple
7,APP,Apple
8,BAN,Band&Strap
9,BEA,Beats


In [24]:
# Preview the last rows (tail() defaults to 5) to check the end of the table.
brands_df.tail()

Unnamed: 0,short,long
182,XOO,Xoopar
183,XRI,X-Rite
184,XTO,Xtorm
185,ZAG,ZaggKeys
186,ZEP,Zepp


In [25]:
# Column dtypes, non-null counts and memory usage in a single summary.
brands_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 187 entries, 0 to 186
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   short   187 non-null    object
 1   long    187 non-null    object
dtypes: object(2)
memory usage: 3.1+ KB


In [26]:
# Summary statistics; for these object columns that is count, unique, top and freq.
brands_df.describe()

Unnamed: 0,short,long
count,187,187
unique,187,181
top,8MO,Apple
freq,1,2


In [27]:
# Distinct values per column; fewer uniques than rows means some values repeat.
brands_df.nunique()

short    187
long     181
dtype: int64

In [28]:
# Flag every row whose `long` name occurs more than once; keep=False marks
# all members of a duplicate group rather than only the later repeats.
mask = brands_df.duplicated(subset='long', keep=False)
brands_df[mask]

Unnamed: 0,short,long
6,AP2,Apple
7,APP,Apple
17,BOS,Bose
19,CAD,Bose
37,ENV,Unknown
67,JYB,Jaybird
70,KEN,Jaybird
80,LIB,Unknown
94,MOP,Mophie
98,MUJ,Mophie


In [29]:
# Same check for the `short` codes: an empty result means every code is unique.
mask = brands_df.duplicated(subset='short', keep=False)
brands_df[mask]

Unnamed: 0,short,long


In [30]:
# Count missing values per column.
brands_df.isna().sum()

short    0
long     0
dtype: int64

In [31]:
# Strings that stand in for a missing value without being real NaNs,
# which is why isna() cannot detect them.
list_of_values = ["unknown", "null", "error", "miss", "na", "nan"]

# Normalise before matching: strip surrounding whitespace and lower-case, so
# padded or differently-cased entries such as " Unknown" are caught as well.
mask = (
    brands_df['long'].str.strip().str.lower().isin(list_of_values)
    | brands_df['short'].str.strip().str.lower().isin(list_of_values)
)
brands_df.loc[mask, :]


Unnamed: 0,short,long
37,ENV,Unknown
80,LIB,Unknown


## Documentation

### Dataset Overview
- **2 columns:** `short` (brand abbreviation) and `long` (full brand name)
- Short codes and long names can contain letters, digits, and special characters (e.g., `8MO`, `Band&Strap`, `X-Rite`)
- **187 rows** with 187 unique shorts and 181 unique longs

### Data Quality Notes
- No null values in the dataframe
- The placeholder name "Unknown" appears as the `long` value for the short codes `ENV` and `LIB`; no suitable equivalents were found for them
- Some long names are shared by several short codes (e.g., Apple: `AP2`, `APP`), which is why there are only 181 unique longs for 187 unique shorts