<div style="background-color:#ddecfc; color:#100; padding:30px; border-radius:50px; max-width:1200px; margin:left;">
<font color='Navy'> <b><u><h1> EDA - Exploratory Data Analysis</h1></u></b> </font>

- Libraries
- Functions
- 

</font>


| Step | Check Type                                | Purpose                                             |
| ---- | ----------------------------------------- | --------------------------------------------------- |
| 1️⃣  | Continuous → Target                       | Find numeric predictors                             |
| 2️⃣  | Continuous ↔ Continuous                   | Drop redundant numerics                             |
| 3️⃣  | Categorical → Target                      | Find categorical predictors                         |
| 4️⃣  | Categorical ↔ Categorical                 | Drop redundant categoricals                         |
| 5️⃣  | **Continuous ↔ Categorical (non-target)** | Explore structure, feature interactions, redundancy |


In [None]:
import sys
print(sys.executable)
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import seaborn as sns
import warnings
from autoviz.AutoViz_Class import AutoViz_Class
%matplotlib inline
from scipy.stats import chi2_contingency
import scipy.stats as stats
from itertools import combinations
%matplotlib inline
warnings.filterwarnings("ignore")
from scipy.stats import kruskal



## <font color='Navy'> <h3>🔷 <u> Functions </u></h3>
- Useful helper functions called throughout the notebook.

In [None]:
def plot_fraud_rate_by_bin(df, feature, target="is_fraud", q=10):
    """
    Plot the fraud rate and the transaction volume per quantile bin of a feature.

    The feature is cut into up to ``q`` quantile bins (duplicate bin edges are
    dropped, so heavily tied features may produce fewer bins). The mean of
    ``target`` per bin is drawn as a line on the left axis and the number of
    rows per bin as bars on a secondary right axis.

    Parameters:
        df (pd.DataFrame): data containing ``feature`` and ``target``.
            The frame is NOT modified.
        feature (str): name of the numeric column to bin.
        target (str): label column whose per-bin mean is plotted as the
            fraud rate (default "is_fraud").
        q (int): number of quantile bins requested (default 10).

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    bin_col = f"{feature}_bin"

    # Bin inside a small scratch frame instead of writing `<feature>_bin` back
    # into the caller's DataFrame. The previous in-place assignment left extra
    # category-typed columns on `df`, which any later dtype-based column
    # selection would pick up as if they were real features.
    binned = pd.DataFrame({
        bin_col: pd.qcut(df[feature], q=q, duplicates="drop"),
        target: df[target],
    })

    fraud_rate_by_bin = (
        binned.groupby(bin_col)
        .agg(fraud_rate=(target, "mean"),
             count=(target, "size"))
        .reset_index()
    )

    fig, ax1 = plt.subplots(figsize=(10, 4))

    # Fraud rate per bin, plotted against the bin's ordinal position (0..n-1)
    sns.lineplot(
        data=fraud_rate_by_bin,
        x=fraud_rate_by_bin.index,
        y="fraud_rate",
        marker="o",
        color="blue",
        ax=ax1
    )
    ax1.set_ylabel("Fraud Rate", color="blue")
    ax1.set_xlabel(f"{feature.title()} (Quantile Bin)")
    ax1.tick_params(axis="y", labelcolor="blue")

    # Add volume as bars on a second y-axis sharing the same x positions
    ax2 = ax1.twinx()
    sns.barplot(
        data=fraud_rate_by_bin,
        x=fraud_rate_by_bin.index,
        y="count",
        alpha=0.3,
        color="gray",
        ax=ax2
    )
    ax2.set_ylabel("Transaction Count", color="gray")

    plt.title(f"Fraud Rate by {feature.title()} (Quantile Bins)")
    plt.tight_layout()
    plt.show()

In [None]:
def check_skewness(df, features, threshold=1):
    """
    Compute the skewness of each listed feature and label its direction.

    Parameters:
        df (pd.DataFrame): data containing the columns named in ``features``.
        features (list): names of numeric columns to evaluate. May be empty.
        threshold (float): absolute skewness above which a feature is
            labelled as right-/left-skewed (default=1).

    Returns:
        pd.DataFrame with columns ``Feature``, ``Skewness`` (rounded to three
        decimals) and ``Type``, sorted by ``Skewness`` in descending order.
        An empty ``features`` list yields an empty frame with those columns.
    """
    columns = ["Feature", "Skewness", "Type"]
    results = []
    for col in features:
        skew = df[col].skew()
        if skew > threshold:
            skew_type = "Right-skewed"
        elif skew < -threshold:
            skew_type = "Left-skewed"
        else:
            # Label kept for compatibility; it only means |skew| <= threshold
            # (roughly symmetric), not that normality was tested.
            skew_type = "Approximately normal"
        results.append({"Feature": col, "Skewness": round(skew, 3), "Type": skew_type})

    # Passing `columns` explicitly keeps the schema stable when `results` is
    # empty; without it sort_values would fail on the missing "Skewness" column.
    skew_df = pd.DataFrame(results, columns=columns)
    return skew_df.sort_values(by="Skewness", ascending=False)


In [None]:
def plot_distributions_by_skewness(df, skew_table):
    """
    Draw one histogram (with KDE overlay) per feature listed in a skewness table.

    Features whose absolute skewness exceeds 1 are plotted on a log1p scale;
    all others are plotted on their raw scale.

    Parameters:
        df (pd.DataFrame): dataset holding the feature columns.
        skew_table (pd.DataFrame): table with ``Feature``, ``Skewness`` and
            ``Type`` columns, e.g. the output of check_skewness().

    Returns:
        None. Each figure is rendered with ``plt.show()``.
    """
    for _, record in skew_table.iterrows():
        name = record["Feature"]
        skew_value = record["Skewness"]
        label = record["Type"]

        plt.figure(figsize=(7,4))

        # Decide once what to draw and how to caption it, then plot in one place.
        if abs(skew_value) > 1:
            values = np.log1p(df[name])
            bar_color = "orange"
            heading = f"{name} (log-transformed for heavy skew, skew={skew_value:.2f})"
            x_caption = f"log({name} + 1)"
        else:
            values = df[name]
            bar_color = "skyblue"
            heading = f"{name} Distribution (skew={skew_value:.2f}, {label})"
            x_caption = name

        sns.histplot(values, bins=40, kde=True, color=bar_color, alpha=0.6)
        plt.title(heading)
        plt.xlabel(x_caption)
        plt.ylabel("Count")
        plt.tight_layout()
        plt.show()


In [None]:
def plot_feature_distribution(df, feature, target="is_fraud"):
    """
    Overlay the kernel-density curves of ``feature`` for each value of ``target``.

    Each class is normalised on its own (``common_norm=False``), so the
    curve shapes stay comparable even when one class is far rarer.

    Parameters:
        df (pd.DataFrame): data containing ``feature`` and ``target``.
        feature (str): numeric column to plot on the x-axis.
        target (str): class column used for the hue; the palette maps
            0 -> skyblue and 1 -> salmon.

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    class_colors = {0: "skyblue", 1: "salmon"}
    kde_options = dict(
        data=df,
        x=feature,
        hue=target,
        fill=True,
        common_norm=False,
        alpha=0.5,
        palette=class_colors,
    )

    plt.figure(figsize=(7, 4))
    sns.kdeplot(**kde_options)

    # The KDE smooths past the data minimum; clip the axis at zero so the
    # plot does not show a misleading negative tail.
    plt.xlim(left=0)

    plt.title(f"{feature} Distribution by Fraud Status", fontsize=12, fontweight="bold")
    plt.xlabel(feature)
    plt.ylabel("Density")
    plt.tight_layout()
    plt.show()

In [None]:
# Numeric summary by class:

numeric_focus = ["amt", "distance_km", "age", "city_pop"]

def summary_by_class(df, cols, target="is_fraud"):
    """
    Summarise numeric columns separately for each value of ``target``.

    Parameters:
        df (pd.DataFrame): data containing ``cols`` and ``target``.
        cols (list): numeric column names to summarise.
        target (str): class column; values 0/1 are relabelled to
            "Non-Fraud"/"Fraud" in the output.

    Returns:
        pd.DataFrame with one row per (feature, class) pair and the columns
        ``target``, ``feature``, count, mean, median, std, min, max,
        rounded to three decimals.
    """
    statistics = ["count", "mean", "median", "std", "min", "max"]
    class_labels = {0: "Non-Fraud", 1: "Fraud"}

    def _describe(column):
        # Per-class statistics for one column, tagged with the column name
        table = df.groupby(target)[column].agg(statistics).reset_index()
        table.insert(1, "feature", column)
        return table

    combined = pd.concat([_describe(column) for column in cols], ignore_index=True)
    combined[target] = combined[target].map(class_labels)
    return combined.round(3)


In [None]:
def plot_categorical(df, feature, target="is_fraud"):
    """
    Plot fraud count and fraud percentage per level of a categorical feature.

    Two side-by-side bar charts are drawn: the number of fraud rows per level
    (left) and the share of fraud rows per level in percent (right). In both
    panels the levels are ordered by fraud count, ascending.

    Parameters:
        df (pd.DataFrame): data containing ``feature`` and ``target``.
        feature (str): column whose levels are compared.
        target (str): label column; it is summed per level, so fraud rows
            are expected to be coded as 1 (default "is_fraud").

    Returns:
        None. The figure is rendered with ``plt.show()``.
    """
    # Number of fraud rows per level of the feature
    fraud_by_feature = df.groupby(feature)[target].sum()

    # Total number of transactions per level
    total_transactions_by_feature = df.groupby(feature).size()

    # Percentage of fraud transactions per level
    fraud_percentage_by_feature = (fraud_by_feature / total_transactions_by_feature) * 100

    # Sort by the number of fraud cases in ascending order and apply the same
    # ordering to the percentages so both panels line up.
    fraud_by_feature = fraud_by_feature.sort_values(ascending=True)
    fraud_percentage_by_feature = fraud_percentage_by_feature[fraud_by_feature.index]

    # seaborn infers the bar order from the data itself (category order for
    # categorical dtypes, sorted values for numeric ones), which silently
    # discards the sort above. Passing the order explicitly makes it stick.
    level_order = fraud_by_feature.index.tolist()

    # Set up the subplots (1 row, 2 columns)
    fig, axes = plt.subplots(1, 2, figsize=(14, 6))
    plt.suptitle(f'Fraud Analysis by {feature}', fontsize=16, fontweight='bold')

    # Plot 1: fraud count per level
    sns.barplot(x=fraud_by_feature.index, y=fraud_by_feature.values,
                order=level_order, palette='viridis', ax=axes[0])
    axes[0].set_xlabel(f'{feature}', fontsize=12)
    axes[0].set_ylabel(f'Number of Fraud Cases', fontsize=12)
    axes[0].set_title(f'Fraud Count by {feature}', fontsize=14)
    axes[0].tick_params(axis='x', rotation=90)
    axes[0].grid(axis='y', linestyle='-', alpha=0.7)

    # Plot 2: fraud percentage per level
    sns.barplot(x=fraud_percentage_by_feature.index, y=fraud_percentage_by_feature.values,
                order=level_order, palette='coolwarm', ax=axes[1])
    axes[1].set_xlabel(f'{feature}', fontsize=12)
    axes[1].set_ylabel('Fraud Percentage (%)', fontsize=12)
    axes[1].set_title(f'Fraud Percentage by {feature}', fontsize=14)
    axes[1].tick_params(axis='x', rotation=90)
    axes[1].grid(axis='y', linestyle='-', alpha=0.7)

    # Adjust layout for better spacing
    plt.tight_layout()

    # Show the plot
    plt.show()


## <font color='Navy'> <h3>🔷 <u> Data Load & Prep </u></h3>

### <font color='Navy'> 1🔹<u><b>Flat File Load: </b></u> </h3>
- Since our data set is still huge (17M rows), I've decided to take a sample using the stratified random sampling method.

In [None]:
df = pd.read_pickle("df_clean3.pkl")

In [None]:
df.info()

In [None]:
df.shape

### <font color='Navy'> 2🔹<u><b>Stratified Random Sample Application:</b></u>
- Since our data set is still huge (17M rows!), I've decided to take a sample using the stratified random sampling method.
</font>


In [None]:
# Stratified sampling to maintain the fraud ratio:
# keep 3% of the rows (test_size=0.97 sends the other 97% to the discarded split),
# stratified on `is_fraud`, with a fixed random_state so the sample is reproducible.
df_sample, _ = train_test_split(
    df, 
    test_size=0.97, 
    stratify=df['is_fraud'], 
    random_state=42
)
print(df_sample.shape)


<font color='Navy'>
- Our data set is now reduced to (518772 - rows).
</font>

In [None]:
df_sample.head(5)

### <font color='Navy'> 3🔹<u><b>Save to .pkl file:</b></u>
- Saves the sample so that future runs can load it quickly.
</font>

In [None]:
# df_sample.to_pickle("df_sample.pkl")

### <font color='Navy'> 3🔹<u><b>Read .pkl file:</b></u>
- Loads the saved sample (quick load); from here on `df` refers to the sample.
</font>

In [None]:
# Load the cached stratified sample. The cell that writes the cache is commented
# out, so on a fresh checkout the file may not exist yet: in that case create it
# from `df_sample` so that Restart & Run All works without manual steps.
try:
    df = pd.read_pickle("df_sample.pkl")
except FileNotFoundError:
    df_sample.to_pickle("df_sample.pkl")
    df = df_sample.copy()

### <font color='Navy'> 4🔹<u><b>Data Protocol :</b></u>
- Reports data types, missing values, min, max and unique counts per column. </font>

In [None]:
# One sheet per protocol view: sheet name -> per-column summary
protocol_sheets = {
    "data_type": df.dtypes.astype(str).rename("dtype"),
    "max_numeric": df.max(numeric_only=True),
    "min_numeric": df.min(numeric_only=True),
    "missing": df.isnull().sum().rename("missing"),
    "unique": df.nunique().rename("unique"),
}

with pd.ExcelWriter("EDA_Data_Protocol.xlsx") as xw:
    for sheet_name, sheet_content in protocol_sheets.items():
        sheet_content.to_excel(xw, sheet_name=sheet_name)

#### <font color='Navy'> 4.1 Notes
The dataset contains **25 columns** and **~518k transactions**.  
- <u> Looking at our df Columns/features , we have :</u>


    -  **Target Variable** - `is_fraud`. The variable we want to predict.

    - **Customer Metadata** - `cc_num`, `acct_num`.

    - **Customer Demographics** - `gender`, `age`, `age_group`, `job_category`, `location_profile`.

    - **Transaction Information** - `amt`, `category`, `merchant`, `trans_month`, `trans_day`, `trans_hour`, `trans_dayofweek`, `trans_quarter`, `trans_time_group`.

    - **Geographic Information** - `city`, `state`, `zip`, `city_pop`, `distance_km`.

    - **Encodings / Engineered Features** - `gender_encoded`, `unix_time`, `trans_time`.


### <font color='Navy'> 5🔹<u><b> df Describe Summary :</b></u>
</font>

In [None]:
df.describe().T

#### <font color='Navy'> 5.1 Notes :
- `cc_num` & `acct_num` - very large numbers; these are identifiers, so their statistics are not really interesting.
- `city_pop` - hints at being skewed towards the larger city populations (median < mean); the range is huge, from 44K to about 29M.
- `trans time` - 0-23 hrs; the mean is close to the median, around the afternoon (4 PM to 5 PM).
- `amt` - wide range, with an abnormal max value indicating outliers, very far from the mean and median. The standard deviation is also high, indicating a large spread of transaction values.
- `is_fraud` - mean of 0.0054, indicating an imbalanced class (outcome) set.
- `age` - range 11 to 93, mean 41, median 39 (pretty close), standard deviation ±17.
</font>

## <font color='Navy'><h3>🔷<u>  AutoViz - Automatic EDA Reports </u></h3>
- For a quick visual overview only.

In [None]:
AV = AutoViz_Class()
AV.AutoViz(df_sample, depVar="is_fraud",max_rows_analyzed=300000)

## <font color='Navy'><h3>🔷<u>  EDA </u></h3>
- Target Analysis
- Class Sensitive Analysis
    - Univarite Analysis
    - Bivariate Analysis
    - Multivariate Analysis

### <font color='Navy'> 1🔹<u><b> Target Distribution Analysis:</b></u>
- Global fraud/non-fraud ratio check
- Evaluation for imbalanced dataset.

In [None]:
# Class counts and their share of all rows, labelled for display
fraud_counts = df["is_fraud"].value_counts().rename({0: "Non-Fraud", 1: "Fraud"})
fraud_percentage = (fraud_counts / len(df) * 100).round(3)

fraud_summary = pd.DataFrame({
    "Transaction Count": fraud_counts,
    "Percentage (%)": fraud_percentage
})

fig, axes = plt.subplots(1, 2, figsize=(10, 4))
plt.suptitle("Fraud vs Non-Fraud Transactions", fontsize=12)

# One panel per summary column; the column name doubles as the y-axis label.
for panel, measure in zip(axes, ["Transaction Count", "Percentage (%)"]):
    panel.bar(fraud_summary.index, fraud_summary[measure], color=['green', 'red'])
    panel.set_ylabel(measure)
    panel.bar_label(panel.containers[0])

plt.tight_layout()
plt.show()

fraud_summary

##### <font color='Navy'><b><u><h4> 1.1 Findings: </h4></u></b>
- Fraud cases are rare: only 0.54% of the transactions are fraud.
- We can conclude that our data set is <b> imbalanced</b>.
- For imbalanced datasets, <b><u> we must:</u>
    - Use better metrics (like Precision, Recall, F1, ROC-AUC — not accuracy).
    - Use balanced sampling or class weighting when training models.
    - During EDA, analyze each class separately (fraud vs non-fraud distributions).</b>
</font>

### <font color='Navy'> 2🔹<u><b> Class Sensitive Analysis - Variable Types:</b></u>
- Dummy Variables
- Continuous Variables
- Categorical Variables

In [None]:
# Dummy features:
dummy_features = ['is_fraud', 'gender_encoded']

In [None]:
# Continuous features:

numeric_features = df.select_dtypes(include=['int64', 'float64']).columns
print(f"Numeric features : {numeric_features}")
# Drop identifiers, the raw zip / unix timestamp and the target;
# errors='ignore' tolerates columns that are not present.
cont_features = numeric_features.drop(['cc_num', 'acct_num','is_fraud', 'zip', 'unix_time'], errors='ignore').tolist()

# Report the filtered list (previously this printed `numeric_features` again).
print(f"Numeric continues features {cont_features}")



In [None]:
# Categorical features :
categorical_features = df.select_dtypes(include=['object', 'category']).columns.tolist()
categorical_features

### <font color='Navy'> 3🔹<u><b> Continuous Variables :</b></u>
- Distribution
- Correlation to Target
- Redundancy

#### <font color='Navy'> 3.1 🔹<u> Univariate Analysis: </u>

##### <font color='Navy'> 3.1.1 🔹Skewness :

In [None]:
# Check skewness of numeric features:
skewness_df = check_skewness(df, cont_features)
skewness_df

In [None]:
for feature in cont_features:
    plot_fraud_rate_by_bin(df, feature)

#### <font color='Navy'> 3.2 🔹<u> Bivariate Analysis: </u>
1. Continuous to Target:
    - Find numeric predictors
2. Continuous to Continuous:
    - Drop redundant numerics

##### <font color='Navy'> 3.2.1 🔹Continuous Feature To Target :

🔹Continuous Feature = 'amt' Evaluation :

In [None]:
feature = "amt"
plot_feature_distribution(df, feature)

🔹 Initial Observation:
- Strong right skew + very few extreme values (density close to 0):
    - Possible outliers
    - Higher transaction amounts are rare.
- Non-fraud cases tend to be for smaller transaction amounts (high, narrow peak at lower values).
- Fraud cases: flatter and shifted to the right (2 peaks) - tend to be for larger transaction amounts.
- *** Distinctive Pattern *** :
    - While both patterns overlap at lower amounts, there is a distinction at higher amounts.

🔹Conclusion:
1. Predictive: Strong fraud indicator — higher amounts linked to fraud.
2. Skewness: Heavily right-skewed → apply log/robust scaling.
3. Outliers: Likely present — review in data-cleaning phase.
4. Distribution: Non-fraud = low amounts; fraud = higher, spread out.
5. Action: Keep feature, scale down, and monitor outliers.


<b> 3.2.2 🔹Continuous Feature = 'city_pop' Distribution :</b>

In [None]:
plot_feature_distribution(df, "city_pop")

🔹Initial Observation:
- Both `fraud` and `non-fraud` cases are strongly skewed to the right
    - Most transactions are from smaller populations.
    - Possible outliers, need to check in data cleaning.
    
- *** Distinctive Pattern *** :
    - Both curves seem to mostly overlap in pattern (higher density in lower population) with no clear separator at first glance.

🔹Conclusion:
1. Predictivity: Possibly limited standalone predictive power.
2. Skewness: Right-skewed → apply log/robust scaling.
3. Outliers: Possibly present — review in data-cleaning phase.
4. Distribution: Fraud and non-fraud are both concentrated at low populations and largely overlap.
5. Action: Keep feature, scale down, and monitor outliers.

<b>🔹Continuous Feature = 'age' Distribution :</b>

In [None]:
plot_feature_distribution(df, "age")

 🔹Initial Observation:
- Non-fraud transactions:
    - mostly dominated by younger customers
    - slightly skewed to the right, older customers included
- Fraud cases:
    - while also spread out, they are more centered towards middle-aged to older people (40-60)
- *** Distinctive Pattern *** :
    - No clear distinction, we will have to evaluate it more closely.
    - For now it appears to be a weak to moderate predictor.
    
🔹Conclusion:
1. Predictivity: Limited to weak standalone prediction, will check in correlation.
2. Skewness: very mild, 0.691 - non-fraud cases appear more skewed.
3. Outliers: Possibly present above age 95.
4. Action: Perform a test to evaluate whether it is worth keeping; evaluate the categorical age bin.

<b> 3.2.4 🔹Continuous Feature = 'distance_km' Distribution : </b>

In [None]:
plot_feature_distribution(df, "distance_km")

🔹Initial Observation:
- Mild skewness to the left, but fairly normal.

- *** Distinctive Pattern *** :
    - Almost total overlap between the 2 curves, no distinctive pattern.
    - For now it appears to be a weak to moderate predictor.
    
🔹Conclusion:
1. Predictivity: No strong standalone prediction, will check in correlation.
2. Skewness: mild, mostly normal.
3. Outliers: not visible, will check later.
4. Action: Perform a test to evaluate whether it is worth keeping; evaluate the categorical age bin.

In [None]:
summary_table = summary_by_class(df, numeric_focus, target="is_fraud")
summary_table

##### <font color='Navy'> 3.2.2 🔹Continuous To Continuous :
- Correlation matrix, using Spearman (as most features are skewed)
- Redundancy check

In [None]:
# continuous numerical features only
corr_features = ['amt', 'city_pop', 'age', 'distance_km']

# compute spearman correlation
corr_matrix = df[corr_features].corr(method='spearman')
corr_matrix


In [None]:
plt.figure(figsize=(6, 4))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', center=0, fmt=".2f")
plt.title("Spearman Correlation Heatmap (Continuous Features)", fontsize=12, fontweight='bold')
plt.tight_layout()
plt.show()


<font color='Navy'> Multicollinearity among numeric variables check:

Multicollinearity happens when two or more independent (predictor) variables in a dataset are highly correlated with each other — typically |correlation| > 0.8 or 0.9.
That means they carry almost the same information: one can be predicted from another, and this can cause problems for certain models (like logistic regression or linear regression).

<u> Spearman shows : </u>
- No notable correlation between the variables we checked; `age` to `amt` has 0.12 and the rest are even lower.
- We are safe to keep all of them for now.
- None of the numeric predictors are strongly related to each other.
- <b> No redundancy </b>

</font></div>

================================================================================================================

In [None]:
corr_matrix_target = df[cont_features + ['is_fraud']].corr(method='spearman')
corr_with_target = corr_matrix_target['is_fraud'].drop('is_fraud').sort_values(key=abs, ascending=False)

plt.figure(figsize=(6, 4))
sns.barplot(x=corr_with_target.index, y=corr_with_target.values, palette='viridis')
plt.title("Spearman Correlation with Target (is_fraud)", fontsize=12, fontweight='bold')
plt.ylabel("Spearman Correlation")
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()


<font color='Navy'> Single correlation to Target:
- It seems that the absolute correlations of individual features to `is_fraud` are weak.
- Transaction `amt` is the most predictive numeric feature.
</font></div>

### <font color='Navy'> 4🔹<u><b> Categorical Variables :</b></u>
- Distribution
- Correlation to Target
- Redundancy

In [None]:
display(categorical_features)

In [None]:
demografic_cat = ['gender','age_group', 'job_category', 'location_profile'] 
tran_info_cat = ['category', 'merchant', 'trans_month', 'trans_day', 'trans_hour', 'trans_dayofweek', 'trans_quarter', 'trans_time_group']
geo_cat =['city','state', 'zip3']

#### <font color='Navy'> 4.1 🔹<u> Univariate Analysis: </u>

In [None]:
# demografic_cat = ['gender','age_group', 'job_category', 'location_profile'] 
plot_categorical(df, 'gender')

- No evidence of imbalance between the groups (counts are pretty much equal - no bias expected).
- Predictive power as an individual feature may be weak - fraud rates seem pretty equal.

In [None]:
# demografic_cat = ['gender','age_group', 'job_category', 'location_profile'] 
plot_categorical(df, 'age_group')

- Higher age group seem to be more prone to fraud.
- Possible stand alone predictor.

In [None]:
# demografic_cat = ['gender','age_group', 'job_category', 'location_profile'] 
plot_categorical(df, 'job_category')

- Fraud rates are slightly higher for technological field

In [None]:
# demografic_cat = ['gender','age_group', 'job_category', 'location_profile'] 
plot_categorical(df, 'location_profile')

- The majority of the fraud cases are from 'urban' locations.
- However, we see the same fraud risk for both locations, no preference.

==================================================================

<u>Overall Takeaways </u>

- Most categorical variables show low to moderate predictive separation — useful but not dominant.
- `Age group` and `job category` show both meaningful count and rate differences → **likely to add value to models**.
- `Gender` and `location` are **largely neutral**.

In [None]:
# tran_info_cat = ['category', 'merchant', 'trans_month', 'trans_day', 'trans_hour', 'trans_dayofweek', 'trans_quarter', 'trans_time_group']
plot_categorical(df, 'category')

In [None]:
# Strip the "_net" / "_pos" channel suffix from the category labels.
# "_net|_pos" is a regex alternation, so regex=True must be explicit: the default
# differs between pandas versions, and as a literal string the pattern never matches.
df["category_clean"] = df["category"].str.replace("_net|_pos", "", regex=True)

In [None]:
plot_categorical(df, 'category_clean')

In [None]:
# Flag online transactions from the ORIGINAL category labels: the "_net" marker
# has to be read before any suffix stripping, otherwise the flag is always 0.
# na=False treats a missing category as "not online" instead of raising.
df["is_online"] = (
    df["category"].str.contains("_net", regex=False, na=False).astype(int)
)
# Store the flag as a categorical (the conversion result was previously discarded).
df["is_online"] = df["is_online"].astype('category')
# Category label without the "_net" / "_pos" channel suffix.
df["category_type"] = df["category"].str.replace("_net|_pos", "", regex=True)

In [None]:
plot_categorical(df, 'is_online')
plot_categorical(df, 'category_clean')

- 100% of the online transactions are prone to fraud
- The `is_online` flag comes directly from transaction metadata (the _net/_pos suffix in the category names), so it’s safe to use.

In [None]:
# tran_info_cat = ['category', 'merchant', 'trans_month', 'trans_day', 'trans_hour', 'trans_dayofweek', 'trans_quarter', 'trans_time_group']
plot_categorical(df, 'trans_month')

- specific months show higher fraud rate - this can be an identifier.

In [None]:
plot_categorical (df, 'trans_quarter')

In [None]:
plot_categorical(df, 'trans_day')
# make trans_day categorical:
df['trans_day'] = df['trans_day'].astype('category')

In [None]:
plot_categorical(df, 'trans_dayofweek')

In [None]:
# tran_info_cat = ['category', 'merchant', 'trans_month', 'trans_day', 'trans_hour', 'trans_dayofweek', 'trans_quarter', 'trans_time_group']
plot_categorical(df, 'trans_time_group')

In [None]:
display(geo_cat)

In [None]:
df.info()

In [None]:
plot_categorical(df,'state')

In [None]:
df['zip_region'] = df['zip3'].str[0]  # 0-9 represent major USPS areas

In [None]:
df['zip_region'] = df['zip_region'].astype('category')

In [None]:
plot_categorical(df, 'zip_region')

In [None]:
df['merchant'].nunique()

In [None]:
# merchant - high cardinality, so it is not plotted per level like the other
# categoricals; instead we look at the distribution of per-merchant fraud rates.

fraud_by_merchant = df.groupby('merchant')['is_fraud'].mean()
merchant_counts = df['merchant'].value_counts()
# Keep only merchants with more than 50 transactions
# (presumably because rates from tiny samples are too noisy — confirm the cutoff).
fraud_by_merchant = fraud_by_merchant[merchant_counts[fraud_by_merchant.index] > 50]
# fraud_by_merchant.hist(bins=30)
sns.histplot(fraud_by_merchant, bins= 30)




- Long right tail → a small group of merchants have unusually high fraud rates.
- At this stage we will keep merchant as it is and handle it in feature engineering.


In [None]:
categorical_features_sub = df.select_dtypes(include=['category']).columns.tolist()
categorical_features_sub

In [None]:
# Remove high-cardinality features, plus zip3 as we simplified it to zip_region.
# A filter instead of list.remove() keeps this cell idempotent: re-running it
# (or a column already being absent) no longer raises ValueError.
high_cardinality = {'merchant', 'zip3', 'city'}
categorical_features_sub = [
    col for col in categorical_features_sub if col not in high_cardinality
]
display(categorical_features_sub)

#### <font color='Navy'> 4.2 🔹<u> Bivariate Analysis: </u>
- Categorical to Target (is_fraud)
- Categorical to Categorical

##### <font color='Navy'> 4.2.1 🔹<b><u> Categorical to Target (is_fraud):</u></b></font>
 
- To assess which categories are linked to our target feature `is_fraud`, we will use the following tests:

1. <b><u> Chi-Square - is there a relationship at all?</u></b>
- Helps us check whether the evaluated categorical feature is independent of, or dependent on, the target value.
    - Does it affect how the target behaves or not?

- **χ²** - Measures how far the observed data is from the “no-relationship” expectation
- **dof** - Chooses *which reference curve* to use (based on how many categories)
- **p-value** - Tells how extreme your χ² is on that curve (small p → far out in the tail)

<br></br>
2. <b><u>Cramér's V - how strong is that relationship?</u></b>
- If both variables are related, how strong is the association?

In [None]:
# Chi-Square Test (Categorical Feature Vs Categorical Target = `is_fraud`)
# For every categorical feature: build the feature-by-target contingency table,
# run the chi-square test of independence, and derive Cramér's V as an effect size.

results = []

for cat in categorical_features_sub:
    contingency_table = pd.crosstab(df[cat], df['is_fraud'])
    # correction=False: no Yates continuity correction is applied
    chi2, p, dof, ex = chi2_contingency(contingency_table, correction=False)
    # n = total number of observations in the table
    n = contingency_table.sum().sum()
    # Cramér's V = sqrt(chi2 / (n * (min(rows, cols) - 1)))
    # NOTE(review): a feature with a single level makes the denominator 0 — confirm none exist.
    cramer_v = np.sqrt(chi2 / (n * (min(contingency_table.shape) - 1)))
    results.append({
        'Variable': cat,
        'Chi2 Statistic': chi2,
        'p-value': p,
        'Degrees of Freedom': dof,
        'Cramers V': cramer_v
    })

# Most significant (smallest p-value) first
chi2_results = pd.DataFrame(results).sort_values(by='p-value', ascending=True)
chi2_results


In [None]:
# Filler the most significat p<0.05:

chi2_siginficat = chi2_results[chi2_results['p-value']<0.05]
display(chi2_siginficat)
cat_dep_features = (chi2_siginficat ['Variable']).tolist()
display(cat_dep_features)

- Most significant categorical features:
    - `category`: strongly dependent on `is_fraud`: high Chi-square and high Cramér's V value (0.85)
    - `trans_hour`, `trans_time_group` - correlated with / dependent on the target, moderate impact
    - `age_group`, `trans_month`, `trans_dayofweek`, `trans_quarter` - dependent, slight impact.

In [None]:
# Independent variables: every feature that is NOT significant at p < 0.05.
# Using the exact complement of the significance filter means a p-value of
# exactly 0.05 (or NaN) is not silently dropped from both lists.

chi2_insiginficat = chi2_results[~(chi2_results['p-value'] < 0.05)]
display(chi2_insiginficat)
cat_indep_features = (chi2_insiginficat ['Variable']).tolist()
display(cat_indep_features)

`gender`, `state`, `job_category`, `location_profile` - show no statistically significant dependence on the `is_fraud` target (p > 0.05), so we treat them as independent of it.

##### <font color='Navy'> 4.2.2 🔹<b><u> Categorical to Categorical:</u></b></font>

In [None]:
# cat_dep_features = ['category', 'trans_hour', 'trans_time_group',
#                   'age_group', 'trans_month', 'trans_dayofweek', 'trans_quarter']
# Pairwise association between the target-dependent categorical features:
# one chi-square test and one Cramér's V per unordered pair.

results = []

for v1, v2 in combinations(cat_dep_features, 2):
    # Build the contingency table
    table = pd.crosstab(df[v1], df[v2])
    
    # Run the chi-square test of independence (no Yates correction)
    chi2, p, dof, ex = chi2_contingency(table, correction=False)
    
    # Compute Cramér's V from the Chi² statistic
    n = table.sum().sum()
    cramer_v = np.sqrt(chi2 / (n * (min(table.shape) - 1)))
    
    results.append({
        'Var1': v1,
        'Var2': v2,
        'Chi2': chi2,
        'Degrees_of_Freedom': dof,
        'p-value': p,
        'CramerV': cramer_v
    })

# Strongest associations first
cat_to_cat = pd.DataFrame(results).sort_values('CramerV', ascending=False)
cat_to_cat


In [None]:
# Convert the long pairwise table into a Var1 x Var2 matrix of Cramér's V values
heatmap_data = cat_to_cat.pivot(index='Var1', columns='Var2', values='CramerV')

# Make it symmetric: each pair was computed once, so fill the missing half
# from the transpose
heatmap_data = heatmap_data.combine_first(heatmap_data.T)

# Create a mask for the upper triangle (np.triu keeps the diagonal, so the
# diagonal is hidden as well)
mask = np.triu(np.ones_like(heatmap_data, dtype=bool))

# Plot the half heatmap
# NOTE(review): the title literal below looks mis-encoded (probably meant
# "Cramér's V ...") — confirm the notebook's file encoding.
plt.figure(figsize=(8, 6))
sns.heatmap(
    heatmap_data,
    mask=mask,          # hide the upper half
    annot=True,
    fmt=".2f",
    cmap="RdYlGn_r",
    vmin=0, vmax=1,
    square=True,
    linewidths=0.5
)
plt.title("Cram√©r‚Äôs V Heatmap (Lower Triangle Only)")
plt.tight_layout()
plt.show()


<b><u>Duplicate features (perfect correlation)</u></b>
1. `trans_quarter` to `trans_month` - quarter was created from month.
    - Since month is more informative, we should keep it.
2. `trans_time_group` is just a binned version of `trans_hour` (e.g. morning/afternoon/night).
    - `trans_hour` carries more precise information.
3. `category` to `trans_quarter` - totally unexpected.
    - This suggests that the location profile of the user (rural vs urban) is seasonal.
    - This is not a bias we added by feature engineering.
    - Since the correlation is so strong, we should drop the quarter.

Probably at this stage we should keep:<br>
['category', 'trans_hour', 'age_group', 'trans_month', 'trans_dayofweek']</br>

In [None]:
cats_2drop= ['trans_quarter', 'trans_time_group'] #for feauture drop.


Next we compare the two groups of features kept so far:

Continuous variables with strong correlation (|Spearman| > 0.1 or 0.2)
and
Categorical variables with strong Chi-square (p < 0.05) and moderate/strong Cramér’s V (> 0.1)

We check whether a numeric variable is influenced by or structured within a categorical one.

This can reveal hidden redundancy or interactions between feature types.

In [None]:
continuous_vars = ['amt', 'city_pop', 'age', 'distance_km']
categorical_vars = ['age_group', 'category', 'trans_dayofweek', 'trans_hour', 'trans_month']

results = []

for cont in continuous_vars:
    for cat in categorical_vars:
        # Skip columns with too few / too many unique categories (optional safeguard)
        n_levels = df[cat].nunique()
        if n_levels < 2 or n_levels > 50:
            continue

        # One sample of the continuous variable per category level.
        # A single groupby pass replaces one full boolean scan per level;
        # groupby also ignores missing category values, and observed=True
        # skips unused categorical levels, so no empty group is produced.
        groups = [
            values.dropna()
            for _, values in df.groupby(cat, observed=True)[cont]
        ]
        # Drop levels whose values are all missing; the test needs >= 2 samples
        groups = [sample for sample in groups if len(sample) > 0]
        if len(groups) < 2:
            continue

        # Apply Kruskal-Wallis test
        stat, p = kruskal(*groups)

        results.append({
            'Continuous': cont,
            'Categorical': cat,
            'Kruskal H-stat': stat,
            'p-value': p
        })

# Explicit columns keep sort_values working even if every pair was skipped
kruskal_results = pd.DataFrame(
    results, columns=['Continuous', 'Categorical', 'Kruskal H-stat', 'p-value']
).sort_values(by='p-value')
kruskal_results.head(10)



In [None]:
# Pivot the long Kruskal-Wallis results into a Continuous x Categorical matrix of p-values
heatmap_data = kruskal_results.pivot(index='Continuous', columns='Categorical', values='p-value')

# Convert to 1 - p for better visual contrast (so low p => dark)
# NOTE(review): with a sample this large, p-values may all be ~0, in which case
# every cell rounds to 1.000 — consider plotting the H statistic instead; confirm.
heatmap_data_visual = 1 - heatmap_data

# NOTE(review): the title literal below looks mis-encoded (probably meant
# "Continuous ↔ Categorical ... (Kruskal–Wallis)") — confirm the file encoding.
plt.figure(figsize=(8, 5))
sns.heatmap(
    heatmap_data_visual, 
    annot=True, 
    fmt=".3f", 
    cmap="RdYlGn_r",  # red = strong relation (low p)
    cbar_kws={'label': '1 - p-value'}
)
plt.title("Continuous ‚Üî Categorical Relationships (Kruskal‚ÄìWallis)")
plt.tight_layout()
plt.show()


Continuous–Categorical Interaction Summary:

- `amt` shows significant variation across all categorical groupings — suggesting transaction patterns differ strongly by category and time.
- `age` and `city_pop` are also related to categorical variables, likely reflecting demographic and regional structure.
- `distance_km` shows weaker dependence, indicating it may capture independent spatial effects.
- `amt` to `age`:
    - `age` to `amt` showed a low value in the Spearman correlation, while the association with the categorical age group is very high.
    - This means the relationship is non-monotonic and group-driven.

In [None]:
df.to_pickle('EDA_final.pkl')