In [29]:
import pandas as pd

# Build a small sample orders table that deliberately contains missing
# entries (None) in every column except order_id.
order_ids = list(range(1001, 1008))
customer_names = ["Alice", "Bob", None, "David", "Eva", "Frank", None]
product_categories = [
    "Electronics", "Clothing", "Electronics", "Books", None, "Clothing", "Books",
]
quantities = [2, 1, None, 1, 3, None, 4]
unit_prices = [299.99, 49.99, 199.99, None, 15.99, 79.99, 12.99]
order_dates = [
    "2024-08-01", "2024-08-02", "2024-08-03",
    "2024-08-04", "2024-08-05", "2024-08-06", None,
]

# Column name -> list of values; insertion order sets the column order.
data = dict(
    order_id=order_ids,
    customer_name=customer_names,
    product_category=product_categories,
    quantity=quantities,
    unit_price=unit_prices,
    order_date=order_dates,
)

df = pd.DataFrame(data)
print(df)

   order_id customer_name product_category  quantity  unit_price  order_date
0      1001         Alice      Electronics       2.0      299.99  2024-08-01
1      1002           Bob         Clothing       1.0       49.99  2024-08-02
2      1003          None      Electronics       NaN      199.99  2024-08-03
3      1004         David            Books       1.0         NaN  2024-08-04
4      1005           Eva             None       3.0       15.99  2024-08-05
5      1006         Frank         Clothing       NaN       79.99  2024-08-06
6      1007          None            Books       4.0       12.99        None


### Amount of missing values

#### Amount of missing values in each column

In [30]:
# Count the missing entries per column: build the boolean missing-value mask,
# then sum it down the rows (axis 0).
missing_mask = df.isna()
columns_missing_values = missing_mask.sum(axis=0)
columns_missing_values

order_id            0
customer_name       2
product_category    1
quantity            2
unit_price          1
order_date          1
dtype: int64

#### Amount of missing values in each record

In [31]:
# Count the missing entries per record: sum the boolean missing-value mask
# across the columns of each row.
rows_missing_values = df.isna().sum(axis="columns")
rows_missing_values

0    0
1    0
2    2
3    1
4    1
5    1
6    2
dtype: int64

### Percentage of missing values

#### Percentage of missing values in each column

In [32]:
# Share of missing entries per column, as a percentage of the row count.
n_rows = df.shape[0]
percentage_columns_missing_values = columns_missing_values.div(n_rows).mul(100)
percentage_columns_missing_values

order_id             0.000000
customer_name       28.571429
product_category    14.285714
quantity            28.571429
unit_price          14.285714
order_date          14.285714
dtype: float64

#### Percentage of missing values in each record

In [35]:
# Share of missing entries per record, as a percentage of the column count.
# Stored under its own name so the per-column percentages computed earlier
# are not overwritten by per-row data.
percentage_rows_missing_values = rows_missing_values / df.shape[1] * 100
percentage_rows_missing_values

0     0.000000
1     0.000000
2    33.333333
3    16.666667
4    16.666667
5    16.666667
6    33.333333
dtype: float64

#### Deleting rows and columns with more than 50% missing values

No row or column has more than 50% of its values missing, so we don't need to delete any rows or columns.

In [44]:
# Kept for reference only (not executed): no row or column in this dataset
# has more than 50% of its values missing.
# `thresh` is the minimum number of non-missing values a row/column must have
# to be kept; note that int() rounds that threshold down.
#
# Drop rows that have too few non-missing values:
# df.dropna(thresh=int(df.shape[1] * 0.5))
#
# Drop columns that have too few non-missing values (axis=1 is required to
# target columns; without it dropna filters rows):
# df.dropna(axis=1, thresh=int(df.shape[0] * 0.5))

### Imputing missing values

In [51]:
# Imputation strategy per column:
#   - text columns get a placeholder label,
#   - numeric columns get the column mean,
#   - order_date is parsed to datetime, the frame is sorted by it, and the
#     remaining gaps are forward-filled from the preceding row in that order.
df = (
    df.assign(
        customer_name=lambda frame: frame['customer_name'].fillna('Unknown'),
        product_category=lambda frame: frame['product_category'].fillna('Miscellaneous'),
        quantity=lambda frame: frame['quantity'].fillna(frame['quantity'].mean()),
        unit_price=lambda frame: frame['unit_price'].fillna(frame['unit_price'].mean()),
        order_date=lambda frame: pd.to_datetime(frame['order_date']),
    )
    .sort_values(by='order_date')
    .assign(order_date=lambda frame: frame['order_date'].ffill())
)

df

Unnamed: 0,order_id,customer_name,product_category,quantity,unit_price,order_date
0,1001,Alice,Electronics,2.0,299.99,2024-08-01
1,1002,Bob,Clothing,1.0,49.99,2024-08-02
2,1003,Unknown,Electronics,2.2,199.99,2024-08-03
3,1004,David,Books,1.0,109.823333,2024-08-04
4,1005,Eva,Miscellaneous,3.0,15.99,2024-08-05
5,1006,Frank,Clothing,2.2,79.99,2024-08-06
6,1007,Unknown,Books,4.0,12.99,2024-08-06
