### NaN Ratio Hist
Plots a histogram of the fraction of NaN values per feature

In [2]:
def plot_nan_ration(df, figsize=(10,5), title='Numerical Features'):
    """
    Draw a histogram of the per-feature fraction of NaN values.

    The last column of ``df`` is left out of the histogram, i.e. the function
    relies on the target feature being the final column of the DataFrame.

    df: DataFrame whose columns (all but the last) are inspected
    figsize: size handed to ``plt.figure``
    title: title shown above the plot
    """
    plt.figure(figsize=figsize)

    # NaN count of every non-target column, normalised by the number of rows
    feature_columns = df.iloc[:, :-1]
    nan_fraction = feature_columns.isna().sum() / df.shape[0]

    plt.hist(nan_fraction)
    plt.xlabel('NaN Fraction')
    plt.ylabel('Number of Features')
    plt.title(title)
    plt.grid()

### NaN Columns Dropping
Drops columns that consist of only NaN values

In [1]:
def drop_nan_columns(df):
    """
    Remove, in place, every column of ``df`` whose values are all NaN.

    The number of removed columns is printed. Nothing is returned; the
    DataFrame passed in is modified directly.
    """
    empty_columns = []
    for name in df.columns:
        column_is_empty = df[name].isna().all()
        if column_is_empty:
            empty_columns.append(name)

    print(f'Dropped {len(empty_columns)} NaN Columns')
    df.drop(columns=empty_columns, inplace=True)

### Unique Categories
Returns the number of unique categories per feature

In [None]:
def show_unique_cat(df):
    """
    Count the distinct values found in every column of ``df``.

    Returns a DataFrame indexed by column name with a single column,
    'Unique Categories', holding the length of ``Series.unique()`` for
    that column.
    """
    column_names = list(df.columns)
    unique_counts = [len(df[name].unique()) for name in column_names]

    return pd.DataFrame({'Unique Categories': unique_counts}, index=column_names)

### Outliers Detection
Excludes observations that are outliers

In [None]:
def remove_outliers(df_in, col = ''):
    """
    Return the rows of ``df_in`` whose ``col`` value lies inside the IQR fences.

    The fences are Q1 - 1.5 * IQR and Q3 + 1.5 * IQR, where Q1 and Q3 are the
    0.25 and 0.75 quantiles of ``df_in[col]``. Values equal to a fence are kept.
    The input DataFrame itself is not modified.
    """
    values = df_in[col]
    first_quartile = values.quantile(0.25)
    third_quartile = values.quantile(0.75)
    spread = third_quartile - first_quartile

    upper_fence = third_quartile + spread*1.5
    lower_fence = first_quartile - spread*1.5

    # between() is inclusive on both ends, matching a >= / <= pair
    inside_fences = df_in[col].between(lower_fence, upper_fence)
    return df_in[inside_fences]

### Multicollinearity Problems
Several implementations of dropping highly correlated features:
- Based on correlation (usually faster)
- Based on VIF (always slower)

In [3]:
# Based on Pearson Correlation
def drop_cor_features(df, thresh = 0.8):
    """
    Drop features that are highly correlated with an earlier column.

    Pairwise correlations (``DataFrame.corr`` with its default method) are
    computed over the numeric and boolean columns only. A column is dropped
    when the absolute correlation between it and any column positioned before
    it is greater than or equal to ``thresh``. Non-numeric columns take no part
    in the correlation and are always kept.

    df: DataFrame to clean; it is modified in place
    thresh: absolute correlation at or above which a column is dropped

    Returns the same (mutated) DataFrame object that was passed in. A short
    summary of the feature counts is printed.
    """
    # pandas >= 2.0 no longer skips non-numeric columns in DataFrame.corr and
    # raises instead, so the numeric subset is selected explicitly.
    numeric_df = df.select_dtypes(include=['number', 'bool'], exclude=['timedelta'])
    abs_corr = numeric_df.corr().abs()

    # Row i, columns 0..i-1 is the strictly-lower triangle: each pair is checked
    # once and the later column of a correlated pair is the one marked.
    corr_features = [
        col_name
        for i, col_name in enumerate(abs_corr.columns)
        if (abs_corr.iloc[i, :i] >= thresh).any()
    ]

    print('Original Number of Features: ', df.shape[1])
    print('Number of Higly Correlated Features: ', len(corr_features))
    df.drop(columns=corr_features, inplace=True)
    print('New Number of Features: ', df.shape[1])

    return df

# Based on VIF 
def caclculate_vif(df, target=None, thresh=5):
    """
    Iteratively remove the feature with the largest VIF until none exceeds ``thresh``.

    Preparation (done on a copy, the caller's DataFrame is not modified):
    the columns of ``target`` are removed when ``target`` is given, columns of
    dtype 'object' or 'datetime64[ns]' are removed, and rows containing NaN
    are dropped.

    df: DataFrame with the candidate features
    target: optional object exposing ``.columns`` naming the columns to exclude
    thresh: VIF value above which the worst feature is removed

    Returns the prepared DataFrame restricted to the surviving features and
    prints a short summary of the feature counts.
    """
    # Input preparation
    if target is not None:
        df = df.drop(columns = target.columns)

    non_numeric = df.select_dtypes(['object','datetime64[ns]']).columns
    df = df.drop(columns = non_numeric).dropna(axis=0)
    from statsmodels.stats.outliers_influence import variance_inflation_factor

    all_features = df.columns
    kept_idx = np.arange(len(all_features))
    n_before = df.shape[1]
    print('Original Number of Features: ', n_before)

    # Remove one feature per pass: the one with the highest VIF, as long as
    # that VIF is above the threshold.
    while True:
        matrix = df[all_features[kept_idx]].values
        vifs = [variance_inflation_factor(matrix, pos) for pos in np.arange(matrix.shape[1])]

        worst = max(vifs)
        if not worst > thresh:
            break
        kept_idx = np.delete(kept_idx, vifs.index(worst))

    selected = df[all_features[kept_idx]]
    n_after = selected.shape[1]
    print('Number of Features Having Large VIF: ', n_before - n_after)
    print('New Number of Features:', n_after)

    return selected

### Data Normalization and Standardization

In [4]:
def scale_data(x_in, scaling_type='N',col_to_scale=[]):
    """
    Scale the selected columns of ``x_in`` in place and return it.

    x_in: DataFrame holding the data; the columns in ``col_to_scale`` are overwritten
    scaling_type: 'N' for min-max normalization (MinMaxScaler),
                  'S' for standardization (StandardScaler)
    col_to_scale: labels of the columns to transform

    Returns ``x_in`` itself after scaling. For any other ``scaling_type``
    nothing is scaled and None is returned.
    """
    if scaling_type == 'N':
        from sklearn.preprocessing import MinMaxScaler as chosen_scaler
    elif scaling_type == 'S':
        from sklearn.preprocessing import StandardScaler as chosen_scaler
    else:
        # Unrecognised scaling type: leave the data untouched
        return None

    x_in[col_to_scale] = chosen_scaler().fit_transform(x_in[col_to_scale])
    return x_in