In [1]:
#Importing Packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Suppress every Python warning for the remainder of the kernel session.
# NOTE(review): this blanket 'ignore' filter also hides library deprecation
# and data-quality warnings emitted by later cells; consider narrowing it to
# specific warning categories once the notebook is stable.
import warnings
warnings.filterwarnings('ignore')

# define the function for reducing memory usage when importing data
def reduce_memory_usage(df):
    """
    Downcast numeric columns to the smallest dtype that can hold their values.

    Only plain NumPy signed-integer, unsigned-integer and floating-point
    columns are touched. Every other column (object, bool, datetime,
    timedelta, categorical, pandas extension dtypes, ...) is left unchanged,
    because casting those either fails or increases memory usage.

    A column is never converted to a wider dtype than it already has.

    Note: float columns may be narrowed to float16/float32, which keeps the
    value range but reduces precision.

    Parameters:
    - df (DataFrame): The dataset. It is modified in place.

    Returns:
    - DataFrame: The same ``df`` object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate dtypes per NumPy dtype "kind", ordered from narrowest to widest.
    candidates_by_kind = {
        'i': ((np.int8, np.int16, np.int32, np.int64), np.iinfo),
        'u': ((np.uint8, np.uint16, np.uint32, np.uint64), np.iinfo),
        'f': ((np.float16, np.float32, np.float64), np.finfo),
    }

    for col in df.columns:
        col_type = df[col].dtype

        # Extension dtypes (category, nullable Int64, string, ...) are skipped.
        if not isinstance(col_type, np.dtype):
            continue
        # dtype.kind is used instead of np.issubdtype because timedelta64
        # counts as a signed-integer subtype in NumPy's scalar hierarchy.
        if col_type.kind not in candidates_by_kind:
            continue

        candidates, limits_of = candidates_by_kind[col_type.kind]
        c_min = df[col].min()
        c_max = df[col].max()

        for candidate in candidates:
            # Stop once the candidate is no smaller than the current dtype:
            # converting would not save memory.
            if np.dtype(candidate).itemsize >= col_type.itemsize:
                break
            limits = limits_of(candidate)
            # Inclusive bounds: a value equal to the dtype's min/max still fits.
            # NaN min/max (empty or all-NaN column) fails both comparisons,
            # so such columns are left as they are.
            if c_min >= limits.min and c_max <= limits.max:
                df[col] = df[col].astype(candidate)
                break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    # Guard against a zero-byte frame, which would otherwise print 'nan%'.
    decrease = 100 * (start_mem - end_mem) / start_mem if start_mem > 0 else 0.0
    print('Decreased by {:.1f}%'.format(decrease))

    return df

## Plotting

In [None]:
def countplot_function(df, column, hue="TARGET", figsize=(10, 5), rotation=90):
    """
    Plots a countplot with labeled values on each bar.

    Parameters:
    - df (DataFrame): The dataset.
    - column (str): Categorical column to plot.
    - hue (str): Column for grouping (default is "TARGET").
    - figsize (tuple): Figure size (default is (10, 5)).
    - rotation (int): Rotation angle for x-axis labels (default is 90).
    """
    fig, ax = plt.subplots(figsize=figsize)  # Explicit figure/axes pair
    # Plot the frame passed in as `df` (the previous body referenced an
    # undefined name `data`).
    sns.countplot(data=df, x=column, hue=hue, ax=ax)

    # Rotate x-axis labels
    ax.tick_params(axis='x', labelrotation=rotation)

    # Add count labels on top of bars
    for p in ax.patches:
        height = p.get_height()
        # Skip patches that do not represent a drawn bar: a NaN height would
        # make int() raise, and zero-width patches are not visible bars
        # (presumably legend placeholders in some seaborn versions).
        if np.isnan(height) or p.get_width() == 0:
            continue
        ax.annotate(f'{int(height)}',
                    (p.get_x() + p.get_width() / 2., height),
                    ha='center', va='bottom', fontsize=10, color='black')

    if hue is None:
        ax.set_title(f"Countplot of {column}")
    else:
        ax.set_title(f"Countplot of {column} by {hue}")
    plt.show()

## Data Cleaning

In [None]:
# Drop the columns in which more than 50% of the values are missing
def dropna_over50(df, threshold=50):
    """
    Drop every column whose percentage of missing values exceeds ``threshold``.

    Parameters:
    - df (DataFrame): The dataset. Columns are dropped in place.
    - threshold (float): Missing-value percentage (0-100) above which a
      column is dropped (default is 50). The comparison is strict, so a
      column with exactly ``threshold`` percent missing is kept.

    Returns:
    - DataFrame: The same ``df`` object, without the dropped columns.
    """
    # Percentage of missing values per column, largest first (for the report).
    missing_values = df.isnull().sum().sort_values(ascending=False)
    missing_percent = (missing_values / len(df) * 100)

    missing_data_over_threshold = missing_percent[missing_percent > threshold]
    print(f'There are: {len(missing_data_over_threshold)} columns missing data over {threshold}%')
    print(missing_data_over_threshold)

    # Drop the columns whose missing percentage exceeds the threshold
    df.drop(columns=missing_data_over_threshold.index, inplace=True)
    print('\n')
    print(f'Shape of the df after removing missing data over {threshold}% : {df.shape}')

    return df

In [None]:
def factorize_cat_cols(df):
    """
    Replace each object-dtype column with integer codes from ``pd.factorize``.

    Codes are assigned in order of first appearance; missing values receive
    the code -1. The frame is modified in place and also returned.
    """
    object_column_names = df.select_dtypes(include='object').columns
    for name in object_column_names:
        codes, _uniques = pd.factorize(df[name])
        df[name] = codes
    return df

In [None]:
# Drop highly-correlated numerical variables
# Identify pairs of features with correlation above a threshold
def redundant_data(df, correlation_matrix=None, threshold=0.8):
    """
    Drop numeric columns that are highly correlated with an earlier column.

    The correlation matrix of the numeric columns is computed from ``df``.
    A column is dropped when its absolute correlation with any column that
    appears before it (in column order) is strictly greater than
    ``threshold``.

    Parameters:
    - df (DataFrame): The dataset. Columns are dropped in place.
    - correlation_matrix: Accepted for backward compatibility only. It is
      ignored; the matrix is always recomputed from ``df``.
    - threshold (float): Absolute-correlation cut-off (default is 0.8).

    Returns:
    - DataFrame: The same ``df`` object, without the dropped columns.
    """
    numeric_columns = df.select_dtypes(include=['number']).columns
    corr_abs = df[numeric_columns].corr().abs()

    # Consider only entries strictly below the diagonal, so each pair is
    # examined once and a column is never compared with itself.
    size = len(corr_abs.columns)
    below_diagonal = np.tril(np.ones((size, size), dtype=bool), k=-1)
    # NaN correlations (e.g. constant columns) compare False and are ignored.
    exceeds = (corr_abs.to_numpy() > threshold) & below_diagonal
    flagged_rows = exceeds.any(axis=1)

    # Keep the original column order (and de-duplicate repeated labels) so
    # the reported list is deterministic from run to run.
    to_drop = list(dict.fromkeys(
        name for name, flagged in zip(corr_abs.columns, flagged_rows) if flagged
    ))

    # Drop columns
    df.drop(columns=to_drop, inplace=True)
    print(f"Dropped {len(to_drop)} redundant columns")
    print(f"The columns names that were dropped are :{to_drop}")
    print('\n')
    print(f"Shape of the dataset after removing multicolinearitly: {df.shape}")

    return df

In [None]:
def imputing_na(df):
    """
    Fill missing values column by column.

    Numeric (non-boolean) columns are filled with the column median; every
    other column is filled with its most frequent value (the first mode).
    Columns without missing values are left untouched, and a column with no
    usable fill value (for example, entirely missing) is left as it is
    instead of raising.

    Parameters:
    - df (DataFrame): The dataset. Columns are reassigned in place.

    Returns:
    - DataFrame: The same ``df`` object with missing values imputed.
    """
    for col in df.columns:
        values = df[col]
        if not values.isna().any():
            continue  # Nothing to impute in this column

        # A dtype never compares equal to the string 'number', so the numeric
        # check has to go through the pandas dtype helpers.
        is_numeric = (pd.api.types.is_numeric_dtype(values)
                      and not pd.api.types.is_bool_dtype(values))
        if is_numeric:
            fill_value = values.median()
        else:
            modes = values.mode()
            if modes.empty:
                continue  # Entirely missing: there is no mode to use
            fill_value = modes.iloc[0]

        if pd.isna(fill_value):
            continue  # e.g. median of an all-NaN numeric column

        df[col] = values.fillna(fill_value)
    print("Imputed all na values")
    return df


In [None]:
def cap_outliers_sd(df, threshold=3, exclude_columns=None):
    """
    Caps outliers at mean +/- ``threshold`` standard deviations for all
    numeric columns, except those in exclude_columns.

    The input DataFrame is not modified; a capped copy is returned.

    Parameters:
    df (pd.DataFrame): The input DataFrame.
    threshold (float): Standard deviation threshold (default is 3).
    exclude_columns (list): List of column names to exclude from capping.

    Returns:
    pd.DataFrame: Copy of ``df`` with outliers capped for numeric columns only.
    """
    if exclude_columns is None:
        exclude_columns = []

    # Work on a copy so the caller's frame keeps its original values.
    df_capped = df.copy()
    # Select only numeric columns
    numeric_columns = df.select_dtypes(include=[np.number]).columns

    for column in numeric_columns:
        if column in exclude_columns:
            continue  # Skip excluded column

        mean = df[column].mean()
        std_dev = df[column].std()
        # The spread is undefined for an all-NaN or single-row column, so
        # there is nothing to cap against.
        if pd.isna(mean) or pd.isna(std_dev):
            continue

        lower_bound = mean - threshold * std_dev
        upper_bound = mean + threshold * std_dev

        # Bounds come from the original column; the result goes to the copy.
        df_capped[column] = df[column].clip(lower=lower_bound, upper=upper_bound)

    return df_capped

