<a href="https://colab.research.google.com/github/RifatMuhtasim/Data_Science_Workflow/blob/main/2.1.EDA_Target_Correlation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# 1. Explore Distribution of Target Variable

In [None]:
# Visualize the distribution of the target variable
# Draw a histogram with a KDE curve for the target column

import plotly.express as px
import plotly.graph_objects as go
import plotly.figure_factory as ff
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from scipy.stats import skew, kurtosis


def Histogram_plot_kde(df, x):
    """Show a Plotly distribution plot (histogram plus KDE curve) for one column.

    The figure title reports the column's skewness and kurtosis, each rounded
    to two decimals, as returned by ``scipy.stats.skew`` and
    ``scipy.stats.kurtosis``.

    Parameters
    ----------
    df : DataFrame-like
        Table that is indexed with ``df[x]`` to obtain the values to plot.
    x : str
        Column name; also used as the trace label and inside the title.

    Returns
    -------
    None
        The figure is displayed via ``fig.show()`` and not returned.
    """
    column_values = df[x]
    skew_stat = skew(column_values)
    kurt_stat = kurtosis(column_values)

    figure = ff.create_distplot([column_values], [x], curve_type='kde')
    title_text = f"{x} Distribution. Skewness: {round(skew_stat, 2)} and Kurtosis: {round(kurt_stat, 2)}"
    figure.update_layout(title=title_text)
    figure.show()


# Plot the distribution of the 'regression' target column.
Histogram_plot_kde(df=df, x='regression')

In [None]:
# Visualize the target variable skewness and histogram

import scipy.stats as stat
from scipy.stats import skew, kurtosis
import statsmodels.api as sm
import pylab


def Diagnostic_plot(df, x):
    """Draw a histogram (with KDE) and a normal Q-Q plot for one column.

    The two panels are drawn side by side on explicit Matplotlib axes rather
    than through the implicit "current axes" of pyplot/pylab, so the function
    does not depend on (or disturb) whatever figure was active before the call.
    Skewness and kurtosis of the column are printed before the figure is shown.

    Parameters
    ----------
    df : DataFrame-like
        Table that is indexed with ``df[x]`` to obtain the values.
    x : str
        Name of the column to diagnose; also used in the panel titles.

    Returns
    -------
    None
    """
    values = df[x]

    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(12, 6))

    # Left panel: histogram with a kernel density estimate overlaid.
    sns.histplot(x=values, kde=True, ax=hist_ax)
    hist_ax.set_title(f"{x}: histogram with KDE")

    # Right panel: probability plot against the normal distribution.
    # probplot accepts a Matplotlib Axes as its `plot` target.
    stat.probplot(values, dist="norm", plot=qq_ax)
    qq_ax.set_title(f"{x}: normal Q-Q plot")

    fig.tight_layout()

    print("Skew:", skew(values))
    print("Kurtosis:", kurtosis(values))
    plt.show()

# Histogram and Q-Q diagnostics for the 'regression' target column.
Diagnostic_plot(df=df, x='regression')

In [None]:
# Log-transform the target to obtain a more symmetrical distribution

# Work on a copy so the original training frame is left untouched.
train_copy = train.copy()

# NOTE: np.log returns -inf for 0 and NaN for negative inputs; such rows would
# distort the skewness/kurtosis shown in the plot title.
train_copy['Log_CO2_Emissions'] = np.log(train_copy['CO2 emissions (metric tons per capita)'])

# Reuse the helper defined earlier in this notebook instead of repeating its
# body; the trace label and title are derived from the column name, so the
# plot is labelled as the log-transformed series.
Histogram_plot_kde(train_copy, 'Log_CO2_Emissions')

# 2. Visualize Relationship between features and Target

# 3. Explore Correlations between Features

## Compute pairwise correlation

In [None]:
# 'number' selects every numeric dtype. The previous ['int', 'float'] filter
# only matched the platform-default integer width and float64, so columns
# stored as e.g. int32, float32 or uint8 were silently left out.
numerical_df = df.select_dtypes(include='number')

# Trailing semicolon suppresses the PairGrid repr; only the figure is shown.
sns.pairplot(numerical_df);

## Visualize Correlation using Heatmap

In [None]:
# When we have a regression problem

# 'number' selects every numeric dtype. The previous ['int', 'float'] filter
# only matched the platform-default integer width and float64, so columns
# stored as e.g. int32, float32 or uint8 were silently left out.
numerical_df = train_df.select_dtypes(include='number')

# Draw on an explicit Axes so the title is attached to this heatmap
# regardless of which figure happens to be "current".
fig, ax = plt.subplots(figsize=(8, 4))
sns.heatmap(numerical_df.corr(), annot=True, fmt=".2f", linewidths=0.5, ax=ax)
ax.set_title("Correlation Matrix")
plt.show()

## Identify Highly Correlated Columns

### Regression

In [None]:
# When both are Continuous

def Pearson_correlation_list(df, target):
    """Rank features by their Pearson correlation with ``target``.

    Only numeric (and boolean) columns take part, because ``DataFrame.corr``
    raises on non-numeric columns in pandas >= 2.0.

    Parameters
    ----------
    df : pandas.DataFrame
        Input table; it is not modified.
    target : str
        Name of the numeric target column.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``column`` (feature name) and ``value`` (Pearson
        correlation with the target), sorted by ``value`` in descending order.
        The target itself is excluded. The column names match the other
        correlation helpers in this notebook.

    Raises
    ------
    KeyError
        If ``target`` is not a numeric column of ``df``.
    """
    numeric_df = df.select_dtypes(include=["number", "bool"])
    if target not in numeric_df.columns:
        raise KeyError(f"target column {target!r} is missing or not numeric")

    # Take the target's column of the correlation matrix and drop the trivial
    # self-correlation entry.
    correlations = numeric_df.corr()[target].drop(labels=target)

    correlation_df = correlations.rename_axis("column").reset_index(name="value")
    return correlation_df.sort_values("value", ascending=False)

# Correlation of every feature with the 'regression' target.
correlation_df = Pearson_correlation_list(df, 'regression')
correlation_df

In [None]:
def High_correlated(dataset, threshold, absolute=False):
    """List columns that are highly correlated with an earlier column.

    The pairwise Pearson correlation matrix of the numeric (and boolean)
    columns is scanned below its diagonal: a column is reported when its
    correlation with at least one column that precedes it exceeds
    ``threshold``. Each column is reported at most once, in column order.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Input table; non-numeric columns are ignored.
    threshold : float
        Correlation above which a pair counts as highly correlated.
    absolute : bool, default False
        When True, compare ``abs(correlation)`` so that strong negative
        correlations are reported too. The default keeps the original
        behavior of flagging only positive correlations.

    Returns
    -------
    list
        Names of the flagged columns, without duplicates.
    """
    correlation_matrix = dataset.select_dtypes(include=["number", "bool"]).corr()
    strengths = correlation_matrix.abs() if absolute else correlation_matrix
    column_names = correlation_matrix.columns

    high_correlated = []
    for i in range(len(column_names)):
        # Row i restricted to columns 0..i-1 is the strict lower triangle.
        # NaN correlations compare as False and are therefore never flagged.
        if (strengths.iloc[i, :i] > threshold).any():
            high_correlated.append(column_names[i])
    return high_correlated


# Flag columns whose correlation with an earlier column exceeds 0.8.
high_correlated_features = High_correlated(dataset=df, threshold=0.8)
high_correlated_features

In [None]:
# Columns that must be kept even when they were flagged as highly correlated.
not_remove = ['classification', 'classification',  'regression', 'regression', 'regression' ]
print("Not Remove: ", not_remove)

high_correlated_features = [item for item in high_correlated_features if item not in not_remove]
# A bare expression in the middle of a cell is not displayed, so print the
# filtered list explicitly.
print("Dropping: ", high_correlated_features)

# Drop High Correlated Columns
# errors="ignore" keeps the cell re-runnable: on a second run the columns are
# already gone and a plain drop would raise KeyError.
df = df.drop(columns=high_correlated_features, errors="ignore")
df.head()

### Classification

#### Binary Classification

In [None]:
# When the target is binary and the features are continuous

from scipy.stats import pointbiserialr

def Pointbiserial(df, target):
    """Rank features by their point-biserial correlation with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose columns are all passed to ``scipy.stats.pointbiserialr``
        (so they are expected to be numeric). It is not modified.
    target : str
        Name of the target column.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``column`` (feature name) and ``value`` (correlation
        coefficient), sorted by ``value`` in descending order. The target
        itself is excluded.
    """
    target_values = df[target]

    # Collect one record per feature and build the frame once; growing a
    # DataFrame with pd.concat inside the loop is quadratic and yields an
    # object-dtype 'value' column.
    records = []
    for column in df.columns:
        if column == target:
            continue  # the self-correlation is not informative
        correlation, _p_value = pointbiserialr(df[column], target_values)
        records.append({'column': column, 'value': correlation})

    correlation_df = pd.DataFrame(records, columns=['column', 'value'])
    return correlation_df.sort_values("value", ascending=False)


# Point-biserial correlation of each numeric feature with the binary target.
correlation_df = Pointbiserial(df=numerical_df, target='classification')
correlation_df

#### Categorical Classification

In [None]:
# When the target is categorical and the features are continuous

def Classification_correlation_ratio(df, target):
    """Rank continuous features by their correlation ratio with a categorical target.

    For each feature the correlation ratio is

        eta = sqrt(SS_between / SS_total)

    where ``SS_between = sum_g n_g * (mean_g - grand_mean)**2`` over the target
    categories ``g`` and ``SS_total = sum_i (x_i - grand_mean)**2``. ``eta`` is
    0 when all category means coincide and 1 when the category fully determines
    the feature value.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the target and the numeric feature columns. It is not
        modified.
    target : str
        Name of the categorical target column (any hashable labels).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``column`` (feature name) and ``value`` (eta), sorted by
        ``value`` in descending order. The target itself is excluded. ``value``
        is NaN for features with no usable rows or zero variance.
    """
    def correlation_ratio(categories, measurements):
        """Return eta for one feature; NaN when it is undefined."""
        # Pair labels with values and discard rows where either is missing.
        frame = pd.DataFrame({
            'category': np.asarray(categories),
            'measurement': np.asarray(measurements, dtype=float),
        }).dropna()
        if frame.empty:
            return np.nan

        grand_mean = frame['measurement'].mean()
        total_ss = ((frame['measurement'] - grand_mean) ** 2).sum()
        if total_ss == 0:
            # A constant feature has no variance to explain.
            return np.nan

        # Between-group sum of squares is built from the group *means*;
        # summing within-group variances would instead measure the
        # unexplained part.
        group_stats = frame.groupby('category', sort=False)['measurement'].agg(['count', 'mean'])
        between_ss = (group_stats['count'] * (group_stats['mean'] - grand_mean) ** 2).sum()

        # Clamp to 1 to absorb floating-point round-off before the square root.
        return float(np.sqrt(min(between_ss / total_ss, 1.0)))

    # The target is skipped up front: it may hold non-numeric labels, and its
    # relation to itself is not informative.
    records = [
        {'column': column, 'value': correlation_ratio(df[target], df[column])}
        for column in df.columns
        if column != target
    ]
    correlation_df = pd.DataFrame(records, columns=['column', 'value'])
    return correlation_df.sort_values("value", ascending=False)

In [None]:
# Correlation ratio of each numeric feature with the categorical target.
correlation_df = Classification_correlation_ratio(numerical_df, 'classification')
correlation_df

##### When Both Features and Target are categorical

In [None]:
# When both are Categorical

from scipy.stats import chi2_contingency


def CramersV(df, target):
    """Rank categorical features by bias-corrected Cramér's V with ``target``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table containing the target and the categorical feature columns. It is
        not modified.
    target : str
        Name of the categorical target column.

    Returns
    -------
    pandas.DataFrame
        Two columns, ``column`` (feature name) and ``value`` (Cramér's V),
        sorted by ``value`` in descending order. The target itself is
        excluded. ``value`` is NaN when the statistic is undefined, e.g. for a
        feature with a single level.
    """
    def cramers_v(confusion_matrix):
        """Return bias-corrected Cramér's V for a contingency table (ndarray)."""
        if confusion_matrix.size == 0:
            return np.nan
        n = confusion_matrix.sum()
        if n <= 1:
            # The bias correction divides by (n - 1).
            return np.nan

        chi2 = chi2_contingency(confusion_matrix)[0]
        r, k = confusion_matrix.shape
        phi2 = chi2 / n
        phi2corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
        rcorr = r - ((r - 1) ** 2) / (n - 1)
        kcorr = k - ((k - 1) ** 2) / (n - 1)

        denominator = min(kcorr - 1, rcorr - 1)
        if denominator <= 0:
            # Happens when either variable has only one level.
            return np.nan
        return float(np.sqrt(phi2corr / denominator))

    # Collect one record per feature and build the frame once instead of
    # concatenating inside the loop; the target is skipped up front.
    records = []
    for column in df.columns:
        if column == target:
            continue
        conf_matrix = pd.crosstab(df[column], df[target])
        records.append({'column': column, 'value': cramers_v(conf_matrix.values)})

    correlation_df = pd.DataFrame(records, columns=['column', 'value'])
    return correlation_df.sort_values("value", ascending=False)

In [None]:
# Cramér's V of each categorical feature with the 'target' column.
correlation_df = CramersV(df, 'target')
correlation_df

#### Drop Columns Too Strongly or Too Weakly Correlated with the Target

In [None]:
# Features whose association with the target is above max_threshold or below
# min_threshold (or undefined) are marked for removal.
max_threshold = 0.90
min_threshold = 0.10

# Compare on absolute strength: a coefficient of -0.05 is as weak as +0.05
# and -0.95 is as strong as +0.95. astype(float) also normalizes an
# object-dtype 'value' column.
correlation_strength = correlation_df['value'].astype(float).abs()

removed_columns = correlation_df.loc[
    (correlation_strength > max_threshold)
    | (correlation_strength < min_threshold)
    | correlation_strength.isna(),
    'column',
].tolist()

removed_columns

In [None]:
# Remove the columns selected in the previous cell.
df = df.drop(columns=removed_columns)