# 1. Functions, libraries and packages

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
def dataframe_info(df: pd.DataFrame) -> pd.DataFrame:
    """
    Build a per-column metadata summary of the input DataFrame.

    Columns are read by position rather than by label, so a frame with
    duplicated column names is summarised one row per physical column
    instead of raising.

    Args:
        df (pd.DataFrame): The input DataFrame.

    Returns:
        pd.DataFrame: One row per column of ``df``, in the original column
        order, with the following columns:
            - Column_name: Name of each column.
            - Total records: Number of entries in each column (missing values included).
            - Missing Values: Number of missing (NaN) values in each column.
            - Data type: Data type of each column.
            - Unique values: Number of distinct values in each column
              (missing values are not counted).
    """
    # df[label] returns a DataFrame when the label is duplicated, and a
    # DataFrame has no .dtype; positional access always yields one Series.
    columns = [df.iloc[:, pos] for pos in range(df.shape[1])]

    df_summary = pd.DataFrame({
        'Column_name': df.columns,
        'Total records': [col.size for col in columns],
        'Missing Values': [col.isna().sum() for col in columns],
        'Data type': [col.dtype for col in columns],
        'Unique values': [col.nunique() for col in columns],
    })

    return df_summary

# 2. Data import

In [7]:
# Forward slashes resolve on Windows and POSIX alike; the backslash form
# only works on Windows. The path is relative to the working directory.
df = pd.read_csv('../Data/products.csv')

In [8]:
# Preview the first five rows (head() defaults to n=5).
df.head()

Unnamed: 0,Product_ID,Product_desc,Product_type,Product_category,Product_subcategory,Width,Height,Product_color,Product_handle,Product_hinge,Product_packing,Product_batch,Product_costs
0,P0001,Door_Industrial_Thermal-insulated_RAL7035_PL_Left,Door,Industrial,Thermal-insulated,697.0,396.0,RAL7035,PL,Left,Wooden box,8,237.42
1,P0002,Door_Residential_Balcony_RAL9005_PL_Right,Door,Residential,Balcony,675.0,1139.0,RAL9005,PL,Right,Wooden box,2,241.72
2,P0003,Door_Design_Minimalist_RAL9005_AL_Right,Door,Design,Minimalist,738.0,1111.0,RAL9005,AL,Right,Wooden box,59,168.29
3,P0004,Door_Design_Glass-panel_RAL7024_PL_Right,Door,Design,Glass-panel,745.0,1006.0,RAL7024,PL,Right,Wooden box,28,210.2
4,P0005,Frame_Residential_Balcony_RAL7035_ST_Left,Frame,Residential,Balcony,653.0,1838.0,RAL7035,ST,Left,Wooden box,63,154.64


# 3. Exploratory data analysis (EDA)

In [10]:
# Rank the columns by number of missing values, most affected first.
dataframe_info(df).sort_values('Missing Values', ascending=False)

Unnamed: 0,Column_name,Total records,Missing Values,Data type,Unique values
7,Product_color,300000,324,object,3
10,Product_packing,300000,313,object,2
2,Product_type,300000,313,object,2
6,Height,300000,311,float64,1771
3,Product_category,300000,300,object,5
12,Product_costs,300000,300,float64,19514
5,Width,300000,294,float64,137
4,Product_subcategory,300000,284,object,15
9,Product_hinge,300000,282,object,2
8,Product_handle,300000,279,object,3
