<a href="https://colab.research.google.com/github/RifatMuhtasim/Data_Science_Workflow/blob/main/3.1.Create_New_Features.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Polynomial Feature

In [None]:
def Polynomial_features(df, target):
    """Return a copy of ``df`` extended with power, root and log features.

    For every column except ``target`` four columns are appended, named
    ``<col>_square``, ``<col>_cube``, ``<col>_sqrt`` and ``<col>_log``.
    All transforms read from the untouched input frame, and ``np.sqrt`` /
    ``np.log`` are applied to the raw values without clipping or shifting.

    Args:
        df: Input DataFrame; it is not modified.
        target: Name of the column to leave untransformed.

    Returns:
        A new DataFrame holding the original columns followed by the
        generated ones.

    Raises:
        ValueError: If ``target`` is not one of the columns of ``df``.
    """
    augmented = df.copy()
    feature_names = list(df.columns)
    feature_names.remove(target)

    # (suffix, transform) pairs, listed in the order the columns are appended.
    transforms = (
        ('_square', lambda values: values ** 2),
        ('_cube', lambda values: values ** 3),
        ('_sqrt', np.sqrt),
        ('_log', np.log),
    )

    for name in feature_names:
        for suffix, transform in transforms:
            augmented[name + suffix] = transform(df[name])
    return augmented


# Build the polynomial-augmented frame. Both `df` and `target` are names that
# must already exist in the notebook; note that `target` is passed as a
# variable here, not as a string literal.
pdf = Polynomial_features(df=df, target=target)

# Interaction Features

In [None]:
def Interaction_features_for_multiple_columns(df, target):
    """Return a copy of ``df`` with pairwise arithmetic interaction columns.

    For every ordered pair of distinct feature columns ``(a, b)`` (all
    columns except ``target``) four columns are appended: ``a+b``, ``a-b``,
    ``a*b`` and ``a/b``. Both orders of each pair are generated, so the
    commutative results (``a+b`` / ``b+a``) appear twice under different
    names. Division is not guarded against zero denominators.

    Args:
        df: Input DataFrame; it is not modified.
        target: Name of the column to exclude from the interactions.

    Returns:
        A new DataFrame holding the original columns followed by the
        generated interaction columns.

    Raises:
        ValueError: If ``target`` is not one of the columns of ``df``.
    """
    result_df = df.copy()
    column_list = list(df.columns)
    column_list.remove(target)

    for left in column_list:
        for right in column_list:
            # Compare the names for equality. A substring test would also
            # skip legitimate pairs such as 'x' with 'x_square'.
            if left == right:
                continue
            result_df[f"{left}+{right}"] = df[left] + df[right]
            result_df[f"{left}-{right}"] = df[left] - df[right]
            result_df[f"{left}*{right}"] = df[left] * df[right]
            result_df[f"{left}/{right}"] = df[left] / df[right]
    return result_df


# Add pairwise interaction columns on top of the polynomial features in `pdf`.
# The target column name is passed here as the string literal 'target'.
idf = Interaction_features_for_multiple_columns(df=pdf, target= 'target')

# Binning or Discretization

In [None]:
from sklearn.preprocessing import KBinsDiscretizer

# Discretize the sepal-length column into equal-width, ordinal-coded bins.
n_bins = 6
kbins = KBinsDiscretizer(strategy='uniform', encode='ordinal', n_bins=n_bins)
# The discretizer is fed a single-column 2-D array, hence the added axis.
binned_data = kbins.fit_transform(np.asarray(mdf['sepal length (cm)'])[:, np.newaxis])
# NOTE(review): the column is called 'sepal area bins' although it is derived
# from 'sepal length (cm)' - confirm the intended name.
mdf['sepal area bins'] = binned_data.ravel().astype(int)

# Remove Features With Correlation Value

- Pearson Correlation: when the target and the features are both numerical.
- Point-Biserial Correlation: when the target is binary and the features are numerical.
- Correlation Ratio: when the target is categorical and the features are numerical.
- Cramér's V: when the target and the features are both categorical.

## # Pearson Correlation

In [None]:
def Pearson_correlation_list(df, target):
    """Return the absolute Pearson correlation of every column with ``target``.

    Args:
        df: Input DataFrame; it is not modified. ``DataFrame.corr`` is
            called with its defaults, so which columns are accepted depends
            on the installed pandas version.
        target: Name of the column the other columns are correlated against.

    Returns:
        A DataFrame with two columns: ``'index'`` (the feature name) and
        ``target`` (the absolute correlation). The row for ``target`` itself
        is removed; the remaining rows keep their position in the
        correlation matrix as index labels.

    Raises:
        KeyError: If ``target`` is not a column of the correlation matrix.
    """
    # Clear any name on the row axis first, so reset_index always produces a
    # column literally called 'index' even when df.columns carries a name.
    correlation_matrix = df.corr().rename_axis(None, axis="index").reset_index()

    is_feature = correlation_matrix['index'] != target
    # Explicit copy: the assignment below must not write into a slice of the
    # full correlation matrix.
    correlation_df = correlation_matrix.loc[is_feature, ['index', target]].copy()
    correlation_df[target] = correlation_df[target].abs()
    return correlation_df

In [None]:
# Absolute Pearson correlation of every column of `df` with the 'target' column.
correlation_df = Pearson_correlation_list(df= df, target= 'target')
# Weakest correlations first; the sorted frame is not assigned, so it is only
# shown as the cell output.
correlation_df.sort_values("target", ascending=True)

In [None]:
# Drop every feature whose absolute correlation with the target is NaN or
# falls outside the closed interval [0.1, 0.9].
removed_columns = correlation_df.loc[
    ~correlation_df['target'].between(0.1, 0.9), 'index'
].tolist()
df = df.drop(columns=removed_columns)

## # Point-Biserial Correlation

In [None]:
from scipy.stats import pointbiserialr

def Pointbiserial(df, target):
    """Return the point-biserial correlation of every feature with ``target``.

    The correlations are signed (negative for inverse relationships), so
    callers that threshold on strength should use the absolute value.

    Args:
        df: Input DataFrame; it is not modified.
        target: Name of the target column, passed as the second argument to
            ``scipy.stats.pointbiserialr``.

    Returns:
        A DataFrame with columns ``'column'`` (feature name) and ``'value'``
        (float correlation), one row per column other than ``target``. Each
        row is labelled with the position of its column in ``df``.

    Raises:
        KeyError: If ``df`` has feature columns but no ``target`` column.
    """
    positions, names, values = [], [], []
    for position, column in enumerate(df.columns):
        # The target is not a feature; correlating it with itself is wasted work.
        if column == target:
            continue
        correlation, _p_value = pointbiserialr(df[column], df[target])
        positions.append(position)
        names.append(column)
        values.append(correlation)

    # Build the frame once instead of concatenating one row at a time onto an
    # empty frame, which copies the data repeatedly and relies on deprecated
    # empty-frame concatenation behaviour.
    correlation_df = pd.DataFrame({'column': names, 'value': values}, index=positions)
    correlation_df['value'] = correlation_df['value'].astype(float)
    return correlation_df

In [None]:
# Point-biserial correlation of every feature in `df` with the 'target' column.
correlation_df = Pointbiserial(df= df, target= 'target')
# Ascending by value; the sorted frame is not assigned, so it is only shown as
# the cell output.
correlation_df.sort_values("value", ascending=True)

In [None]:
# Point-biserial correlations are signed, so threshold on the magnitude.
# Comparing the raw value against 0.05 would discard every negatively
# correlated feature, however strong. NaN correlations are dropped as well.
removed_columns = correlation_df.loc[
    ~correlation_df['value'].astype(float).abs().between(0.05, 0.95), 'column'
].tolist()
df = df.drop(removed_columns, axis="columns")

## # Correlation Ratio

In [None]:
def Classification_correlation_ratio(df, target):
    """Score every feature by its correlation ratio (eta) with ``target``.

    The correlation ratio is ``sqrt(SS_between / SS_total)``: the share of a
    numeric feature's spread that is explained by the target categories.
    It is 0 when all category means coincide and 1 when the feature is
    constant inside every category.

    Args:
        df: Input DataFrame; it is not modified. Every column other than
            ``target`` must be convertible to float.
        target: Name of the column holding the category labels. The labels
            themselves may be of any type ``np.unique`` can handle.

    Returns:
        A DataFrame with columns ``'column'`` (feature name) and ``'value'``
        (eta as float), one row per column other than ``target``. Each row
        is labelled with the position of its column in ``df``. Features with
        zero total variance get NaN.

    Raises:
        KeyError: If ``target`` is not a column of ``df``.
        ValueError: If a feature column cannot be converted to float.
    """

    def correlation_ratio(categories, measurements):
        """Return eta for one feature, or NaN when it has no variance."""
        categories = np.asarray(categories)
        measurements = np.asarray(measurements, dtype=float)
        if measurements.size == 0:
            return np.nan

        grand_mean = measurements.mean()
        # Total sum of squared deviations from the overall mean.
        ss_total = np.sum((measurements - grand_mean) ** 2)
        if ss_total == 0:
            # A constant feature carries no information; eta is undefined.
            return np.nan

        # Between-group sum of squares: group size times the squared distance
        # of the group mean from the overall mean.
        ss_between = 0.0
        for category in np.unique(categories):
            group = measurements[categories == category]
            ss_between += group.size * (group.mean() - grand_mean) ** 2

        return float(np.sqrt(ss_between / ss_total))

    target_values = df[target]
    positions, names, values = [], [], []
    for position, column in enumerate(df.columns):
        # Skip the target itself: its labels need not be numeric.
        if column == target:
            continue
        positions.append(position)
        names.append(column)
        values.append(correlation_ratio(target_values, df[column]))

    # Build the frame once instead of concatenating one row at a time.
    correlation_df = pd.DataFrame({'column': names, 'value': values}, index=positions)
    correlation_df['value'] = correlation_df['value'].astype(float)
    return correlation_df

In [None]:
# Score every feature in `df` against the 'target' column with
# Classification_correlation_ratio.
correlation_df = Classification_correlation_ratio(df= df, target= 'target')
# Ascending by value; the sorted frame is not assigned, so it is only shown as
# the cell output.
correlation_df.sort_values("value", ascending=True)

In [None]:
# Drop every feature whose score is NaN or falls outside the closed
# interval [0.05, 0.95].
removed_columns = correlation_df.loc[
    ~correlation_df['value'].between(0.05, 0.95), 'column'
].tolist()
df = df.drop(columns=removed_columns)

## # Cramer's V

In [None]:
from scipy.stats import chi2_contingency


def CramersV(df, target):
    """Score every feature by its bias-corrected Cramér's V with ``target``.

    Each feature is cross-tabulated against ``target`` and the chi-squared
    statistic of that contingency table is turned into Cramér's V, using the
    small-sample bias correction on phi² and on the table dimensions.

    Args:
        df: Input DataFrame; it is not modified. Columns are treated as
            categorical by ``pd.crosstab``.
        target: Name of the categorical target column.

    Returns:
        A DataFrame with columns ``'column'`` (feature name) and ``'value'``
        (Cramér's V as float), one row per column other than ``target``.
        Each row is labelled with the position of its column in ``df``.
        Degenerate tables (fewer than two levels on either side, at most one
        observation, or a zero corrected dimension) get NaN.

    Raises:
        KeyError: If ``df`` has feature columns but no ``target`` column.
    """

    def cramers_v(confusion_matrix):
        """Return the bias-corrected Cramér's V of a contingency table."""
        n = confusion_matrix.sum()
        r, k = confusion_matrix.shape
        # With a single level on either side, or a single observation,
        # association is undefined; return NaN instead of dividing by zero.
        if n <= 1 or min(r, k) < 2:
            return np.nan

        # NOTE: chi2_contingency applies Yates' continuity correction to
        # 2x2 tables by default, which lowers V for binary/binary pairs.
        chi2 = chi2_contingency(confusion_matrix)[0]
        phi2 = chi2 / n
        phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
        rcorr = r - ((r - 1) ** 2) / (n - 1)
        kcorr = k - ((k - 1) ** 2) / (n - 1)

        denominator = min(kcorr - 1, rcorr - 1)
        # The corrected dimension collapses to zero when every observation
        # has its own level (e.g. an ID column).
        if denominator <= 0:
            return np.nan
        return float(np.sqrt(phi2corr / denominator))

    positions, names, values = [], [], []
    for position, column in enumerate(df.columns):
        # The target is not a feature; skip it rather than scoring it against itself.
        if column == target:
            continue
        conf_matrix = pd.crosstab(df[column], df[target])
        positions.append(position)
        names.append(column)
        values.append(cramers_v(conf_matrix.values))

    # Build the frame once instead of concatenating one row at a time.
    correlation_df = pd.DataFrame({'column': names, 'value': values}, index=positions)
    correlation_df['value'] = correlation_df['value'].astype(float)
    return correlation_df

In [None]:
# Cramér's V of every feature in `df` against the 'target' column.
correlation_df = CramersV(df= df, target='target')
# Ascending by value; the sorted frame is not assigned, so it is only shown as
# the cell output.
correlation_df.sort_values("value", ascending=True)

In [None]:
# Drop every feature whose Cramér's V is NaN or falls outside the closed
# interval [0.05, 0.95].
removed_columns = correlation_df.loc[
    ~correlation_df['value'].between(0.05, 0.95), 'column'
].tolist()
df = df.drop(columns=removed_columns)