In [1]:
import pandas as pd

In [2]:
# Load the house-prices dataset from a CSV located in the working directory.
raw_data = pd.read_csv('house_prices.csv')
# NOTE(review): the preview shows an 'Unnamed: 0' column, which looks like a saved
# index read back as data — confirm whether it should be dropped or used as the index.
# Preview the first three rows as the cell output.
raw_data.head(3)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500


### DataFrame and column manipulations

In [3]:
def get_null_df(df: pd.DataFrame) -> pd.DataFrame:
    """
    Summarizes how much of each column of a DataFrame is missing.

    Args:
        df (pd.DataFrame): The DataFrame whose columns are inspected for null values.

    Returns:
        pd.DataFrame: One row per column of `df` (indexed by column name) with a single
                      '% null' column holding the percentage of null entries, ordered
                      from the most to the least incomplete column.
    """
    # The mean of a boolean null-mask is the fraction of nulls per column.
    null_share = df.isnull().mean()
    null_pct = (null_share * 100).to_frame(name='% null')
    return null_pct.sort_values(by='% null', ascending=False)

def agg_small_cat(
    df: pd.DataFrame,
    col_name: str,
    out_col_name: str,
    th: float = 0.05,
    other_label: str = 'Other',
) -> pd.DataFrame:
    """
    Collapses infrequent categories of a column into a single catch-all label.

    A category is kept when its share of the column's non-null values is strictly
    greater than `th`; every other value is replaced with `other_label`. Missing
    values are not counted when the shares are computed, so they are never among
    the kept categories and are also replaced with `other_label`.

    Args:
        df (pd.DataFrame): The DataFrame containing the column to aggregate.
        col_name (str): The name of the column to process.
        out_col_name (str): The name of the column that receives the result. An
            existing column with this name is overwritten in the returned frame.
        th (float, optional): The proportion a category must exceed to be kept.
            Default is 0.05.
        other_label (str, optional): The label assigned to every value that is not
            a kept category. Default is 'Other'.

    Returns:
        pd.DataFrame: A new DataFrame equal to `df` plus `out_col_name` (object
            dtype); `df` itself is left unmodified.

    Raises:
        KeyError: If `col_name` is not a column of `df`.
    """
    values = df[col_name]
    shares = values.value_counts(normalize=True)
    frequent = shares[shares > th].index

    # Cast to object before replacing so the label can be written whatever the source
    # dtype is (a categorical column rejects values outside its categories).
    collapsed = values.astype(object).where(values.isin(frequent), other_label)
    return df.assign(**{out_col_name: collapsed})


In [4]:
# get_null_df(raw_data)

In [5]:
# Relative frequency of each LotShape category, used to judge which ones are rare.
raw_data['LotShape'].value_counts(normalize=True)

LotShape
Reg    0.633562
IR1    0.331507
IR2    0.028082
IR3    0.006849
Name: proportion, dtype: float64

In [6]:
# Fold LotShape categories that do not exceed a 5% share into 'Other', then count
# how many rows land in each resulting category.
(
    raw_data
    .pipe(
        agg_small_cat,
        col_name='LotShape',
        out_col_name='LotShape_agg',
        th=0.05,
    )
    ['LotShape_agg']
    .value_counts()
)

LotShape_agg
Reg      925
IR1      484
Other     51
Name: count, dtype: int64

### ML Evaluation