In [None]:
import pandas as pd
import numpy as np

In [None]:
# Relative weight of each country among the fraudulent transactions.
# The hand-entered weights below add up to 1.20 rather than 1.00, so they are
# rescaled into a proper distribution before use. Without the rescaling, quotas
# computed as (total fraud count x share) would add up to more rows than exist,
# and the resulting per-country shares could not match these proportions.
_COUNTRY_FRAUD_WEIGHTS = {
    "USA": 0.30,
    "UK": 0.18,
    "France": 0.13,
    "Germany": 0.10,
    "Canada": 0.10,
    "Brazil": 0.10,
    "Australia": 0.08,
    "Russia": 0.07,
    "Mexico": 0.06,
    "Japan": 0.05,
    "Nigeria": 0.02,
    "Singapore": 0.01,
}

# Same keys, same order, values scaled so that they sum to 1.
_weights_total = sum(_COUNTRY_FRAUD_WEIGHTS.values())
COUNTRY_FRAUD_DIST = {
    country: weight / _weights_total
    for country, weight in _COUNTRY_FRAUD_WEIGHTS.items()
}

In [None]:
# Source CSV to read and destination CSV to write.
input_file, output_file = (
    "transactions_rebalanced_2.csv",
    "transactions_rebalanced_pays.csv",
)
# Number of rows pandas loads per chunk when the input is read in pieces.
CHUNKSIZE = 500 * 1_000

In [None]:
# First pass over the input: count the fraudulent rows so that per-country
# quotas can be derived from the total afterwards.
total_fraud = 0

# Only the label column is needed for counting, so the other columns are not
# parsed at all (usecols), which keeps this pass cheap on a large file.
for chunk in pd.read_csv(input_file, usecols=["is_fraud"], chunksize=CHUNKSIZE):
    # Summing a boolean Series counts its True values; int() keeps the running
    # total a plain Python integer instead of a NumPy scalar.
    total_fraud += int(chunk["is_fraud"].astype(bool).sum())

print("Total fraudes :", total_fraud)


In [None]:
# Per-country fraud quota: the country's share of the overall fraud count,
# truncated to a whole number of rows by int().
fraud_targets = dict(
    (name, int(total_fraud * share))
    for name, share in COUNTRY_FRAUD_DIST.items()
)

print(fraud_targets)


In [None]:
# Second pass over the input: stream it chunk by chunk, keep every legitimate
# row, and keep fraud rows only until each country's quota in `fraud_targets`
# is filled. Each processed chunk is written to `output_file` immediately.
fraud_collected = {country: 0 for country in COUNTRY_FRAUD_DIST}
first_chunk = True

for chunk in pd.read_csv(input_file, chunksize=CHUNKSIZE):
    # The label is stored back as bool, so the output file carries a bool column.
    chunk["is_fraud"] = chunk["is_fraud"].astype(bool)
    is_fraud = chunk["is_fraud"]

    # 1) Keep ALL legitimate transactions.
    df_legit = chunk[~is_fraud]

    # 2) Process the fraud rows country by country.
    # Fraud rows whose country has no entry in `fraud_targets` are not kept.
    df_fraud = chunk[is_fraud]
    fraud_selected = []

    for country, target in fraud_targets.items():
        remaining = target - fraud_collected[country]
        if remaining <= 0:
            continue  # quota already filled by earlier chunks

        # Take the first `remaining` fraud rows of this country in file order:
        # no random sampling and no replacement, so no row is ever duplicated.
        sampled = df_fraud[df_fraud["country"] == country].head(remaining)
        if sampled.empty:
            continue

        fraud_collected[country] += len(sampled)
        fraud_selected.append(sampled)

    # Concatenate only real slices of `chunk`. Using an untyped empty
    # placeholder frame when nothing was selected makes pandas warn about
    # empty entries in concat and can alter the result dtypes.
    output_chunk = pd.concat([df_legit, *fraud_selected], axis=0)

    # 3) Progressive write: the first chunk creates the file with its header,
    # every following chunk is appended without repeating the header.
    output_chunk.to_csv(
        output_file,
        index=False,
        mode="w" if first_chunk else "a",
        header=first_chunk,
    )
    first_chunk = False

    print("Chunk écrit")

print("✅ Fichier final créé")


In [None]:
# Reload the written file and show each country's share of the fraud rows.
df = pd.read_csv(output_file)
fraud_mask = df["is_fraud"].eq(True)
df.loc[fraud_mask, "country"].value_counts(normalize=True)

In [None]:
# Turn `counter` into a flat table with one row per key and its count.
# NOTE(review): `counter` is not defined in the cells visible here — confirm it
# is created earlier in the notebook. Presumably it maps
# (country, is_fraud, merchant_category) tuples to counts; verify.
counts_by_key = pd.DataFrame.from_dict(counter, orient="index", columns=["count"])
df_counts = counts_by_key.reset_index()

# Expand each key stored in the "index" column into three separate columns.
key_parts = pd.DataFrame(df_counts["index"].tolist(), index=df_counts.index)
df_counts[["country", "is_fraud", "merchant_category"]] = key_parts

# The original key column is no longer needed once it has been expanded.
df_counts = df_counts.drop(columns="index")

In [None]:
# Preview the first rows of the count table.
df_counts.head()

In [None]:
# Express each row's count as a percentage of the total count of its
# (country, is_fraud) group.
group_totals = df_counts.groupby(["country", "is_fraud"])["count"].transform("sum")
df_counts["distribution"] = (df_counts["count"] / group_totals) * 100
df_counts

In [None]:
# Country x is_fraud table of the mean "distribution" value.
# NOTE(review): when "distribution" is a percentage within each
# (country, is_fraud) group, its mean over that group is simply
# 100 / (number of rows in the group) — confirm this is the intended summary.
pivot_pays = df_counts.pivot_table(
    values="distribution",
    index="country",
    columns="is_fraud",
    aggfunc="mean",  # pivot_table's default aggregation, spelled out
)

pivot_pays