In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import dask.dataframe as dd
from datetime import datetime, timedelta
import warnings
warnings.filterwarnings('ignore')

# Configure the plotting style used by every figure below
plt.style.use('default')
sns.set_palette("husl")

In [None]:
def load_and_explore_data():
    """Read both parquet inputs and return them as materialised frames.

    Each file is opened through dask with the pyarrow engine and immediately
    materialised with ``.compute()``. A confirmation line and the shape of
    each table are printed.

    Returns:
        tuple: ``(transactions, exchange_rates)``, in that order.
    """
    sources = (
        'transaction_fraud_data.parquet',
        'historical_currency_exchange.parquet',
    )
    # Transactions are read first, exchange rates second (same order as the
    # returned tuple).
    transactions, exchange_rates = (
        dd.read_parquet(path, engine='pyarrow').compute() for path in sources
    )

    print("‚úÖ –î–∞–Ω–Ω—ã–µ —É—Å–ø–µ—à–Ω–æ –∑–∞–≥—Ä—É–∂–µ–Ω—ã!")
    print(f"–¢—Ä–∞–Ω–∑–∞–∫—Ü–∏–∏: {transactions.shape}")
    print(f"–ö—É—Ä—Å—ã –≤–∞–ª—é—Ç: {exchange_rates.shape}")

    return transactions, exchange_rates
    
# Load both tables once. `currency_df` is read as a module-level global by
# amount_usd() and georgraphy() further down.
df, currency_df = load_and_explore_data()
# Convert the timestamp column to datetime
df['timestamp'] = pd.to_datetime(df['timestamp'])

In [None]:
def amount_usd(df, rates_table=None):
    """Return a copy of ``df`` with ``rate`` and ``amount_usd`` columns appended.

    The numeric columns of the exchange-rate table are averaged over all of
    its rows, giving one mean rate per column name (treated as a currency
    code). Each mean is inverted (``1 / rate``) and multiplied with ``amount``,
    i.e. the table is treated as "units of currency per 1 USD".

    The lookup is done with a plain pandas ``map`` instead of a dask merge, so
    the caller's index and row order are preserved and the input frame is not
    modified.

    Args:
        df: Transactions with at least ``currency`` and ``amount`` columns.
        rates_table: Exchange-rate table. Defaults to the module-level
            ``currency_df`` when omitted.

    Returns:
        A new DataFrame: ``df`` plus ``rate`` (multiplier applied to
        ``amount``) and ``amount_usd``.
    """
    if rates_table is None:
        # Backward-compatible default: the table loaded at module level.
        rates_table = currency_df

    mean_rates = rates_table.mean(numeric_only=True)
    to_usd = {code: 1 / rate for code, rate in mean_rates.items()}

    # Cast to object first so a categorical `currency` column cannot reject
    # the numeric fill value. Currencies missing from the table get a
    # multiplier of 1 (amount passed through unchanged), as before.
    rate = df["currency"].astype(object).map(to_usd).astype(float).fillna(1)
    return df.assign(rate=rate, amount_usd=df["amount"] * rate)

# Attach the `rate` and `amount_usd` columns that the later sections read.
df = amount_usd(df)

### 1. Разведочный анализ данных (EDA)

#### Общая статистика

In [None]:
def _print_value_shares(counts, total):
    """Print one ``label: count (share%)`` line per entry of a value-counts Series."""
    for label, count in counts.items():
        pct = count / total * 100
        print(f"  {label}: {count:,} ({pct:.1f}%)")


def gen_stat(df):
    """Print headline statistics for the transactions frame.

    Sections printed, in order: dataset size and unique customers / vendors /
    countries; class balance of ``is_fraud``; value shares of ``channel``,
    ``device`` (five most frequent only) and ``card_type``; transaction count
    and total amount per ``vendor_category``.

    Args:
        df: Transactions with the columns ``customer_id``, ``vendor``,
            ``country``, ``is_fraud``, ``channel``, ``device``, ``card_type``,
            ``vendor_category``, ``transaction_id`` and ``amount`` (or
            ``amount_usd``).

    Returns:
        None. Output goes to stdout.
    """
    # Dataset size
    total_transactions = len(df)
    unique_customers = df['customer_id'].nunique()
    unique_vendors = df['vendor'].nunique()
    unique_countries = df['country'].nunique()

    print(f"–†–∞–∑–º–µ—Ä –¥–∞—Ç–∞—Å–µ—Ç–∞: {total_transactions:,} —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π")
    print(f"–£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –∫–ª–∏–µ–Ω—Ç–æ–≤: {unique_customers:,}")
    print(f"–£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö –≤–µ–Ω–¥–æ—Ä–æ–≤: {unique_vendors:,}")
    print(f"–£–Ω–∏–∫–∞–ª—å–Ω—ã—Ö —Å—Ç—Ä–∞–Ω: {unique_countries}")

    # Class imbalance
    fraud_count = df['is_fraud'].sum()
    fraud_rate = df['is_fraud'].mean()
    legitimate_count = total_transactions - fraud_count
    # Guard the ratio so a fraud-free frame reports "inf:1" instead of
    # dividing by zero.
    legit_ratio = legitimate_count / fraud_count if fraud_count else float("inf")

    print(f"\nüö® –î–ò–°–ë–ê–õ–ê–ù–° –ö–õ–ê–°–°–û–í:")
    print(f"–õ–µ–≥–∏—Ç–∏–º–Ω—ã–µ –æ–ø–µ—Ä–∞—Ü–∏–∏: {legitimate_count:,} ({1-fraud_rate:.1%})")
    print(f"–ú–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–∏–µ –æ–ø–µ—Ä–∞—Ü–∏–∏: {fraud_count:,} ({fraud_rate:.1%})")
    print(f"–°–æ–æ—Ç–Ω–æ—à–µ–Ω–∏–µ –ª–µ–≥–∏—Ç:–º–æ—à–µ–Ω–Ω = {legit_ratio:.1f}:1")

    # Distribution by channel
    print(f"\nüì° –†–ê–°–ü–†–ï–î–ï–õ–ï–ù–ò–ï –ü–û –ö–ê–ù–ê–õ–ê–ú:")
    _print_value_shares(df['channel'].value_counts(), total_transactions)

    # Distribution by device (five most frequent values only)
    print(f"\nüì± –†–ê–°–ü–†–ï–î–ï–õ–ï–ù–ò–ï –ü–û –£–°–¢–†–û–ô–°–¢–í–ê–ú:")
    _print_value_shares(df['device'].value_counts().head(), total_transactions)

    # Distribution by card type
    print(f"\nüí≥ –†–ê–°–ü–†–ï–î–ï–õ–ï–ù–ò–ï –ü–û –¢–ò–ü–ê–ú –ö–ê–†–¢:")
    _print_value_shares(df['card_type'].value_counts(), total_transactions)

    # Transaction count and total amount per vendor category.
    # The total is printed with a "$" prefix, so sum the USD-normalised column
    # when it exists rather than the raw mixed-currency `amount`.
    print(f"\nüè™ –¢–†–ê–ù–ó–ê–ö–¶–ò–ò –ü–û –ö–ê–¢–ï–ì–û–†–ò–Ø–ú –í–ï–ù–î–û–†–û–í:")
    amount_col = 'amount_usd' if 'amount_usd' in df.columns else 'amount'
    vendor_summary = (
        df.groupby('vendor_category')
        .agg(n_tx=('transaction_id', 'count'), volume=(amount_col, 'sum'))
        .round(2)
        .sort_values('n_tx', ascending=False)
    )

    # itertuples keeps each column's dtype; iterrows would upcast the integer
    # count to float and print it as e.g. "1,234.0".
    for category, n_tx, volume in vendor_summary.itertuples(name=None):
        pct = n_tx / total_transactions * 100
        print(f"  {category}: {int(n_tx):,} —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π ({pct:.1f}%), ${volume:,.0f}")

# Print the headline dataset statistics.
gen_stat(df)

#### Временные паттерны

In [None]:
def _fraud_summary(df, key):
    """Return per-``key`` transaction count, fraud count and fraud share (columns in that order)."""
    return df.groupby(key)['is_fraud'].agg(['count', 'sum', 'mean'])


def time_pattern(df):
    """Print fraud statistics by weekend flag, night flag, hour and calendar day.

    Side effect: adds the columns ``hour``, ``day_of_week``, ``date`` and
    ``is_night`` to ``df`` in place.

    Args:
        df: Transactions with a datetime ``timestamp`` column plus
            ``is_weekend``, ``is_fraud`` and ``transaction_id``.

    Returns:
        None. Output goes to stdout.
    """
    # Derive the time features
    df['hour'] = df['timestamp'].dt.hour
    df['day_of_week'] = df['timestamp'].dt.day_name()
    df['date'] = df['timestamp'].dt.date

    # Fraud share on weekends vs weekdays
    weekend_fraud = _fraud_summary(df, 'is_weekend')

    print(f"\nüìÖ –ú–û–®–ï–ù–ù–ò–ß–ï–°–¢–í–û –í –í–´–•–û–î–ù–´–ï:")
    # itertuples keeps the integer dtype of the counts; iterrows would upcast
    # them to float and print e.g. "12.0/345.0".
    for is_weekend, n_total, n_fraud, share in weekend_fraud.itertuples(name=None):
        weekend_label = "–í—ã—Ö–æ–¥–Ω—ã–µ" if is_weekend else "–ë—É–¥–Ω–∏"
        print(f"  {weekend_label}: {int(n_fraud)}/{int(n_total)} = {share:.3f}")

    # Fraud share at night. `between` is inclusive on both ends, so hours
    # 0 through 6 are flagged.
    # NOTE(review): this includes 06:00-06:59; confirm that is the intended window.
    df['is_night'] = df['hour'].between(0, 6)
    night_fraud = _fraud_summary(df, 'is_night')

    print(f"\nüåô –ú–û–®–ï–ù–ù–ò–ß–ï–°–¢–í–û –ù–û–ß–¨–Æ (0-6 —á–∞—Å–æ–≤):")
    for is_night, n_total, n_fraud, share in night_fraud.itertuples(name=None):
        time_label = "–ù–æ—á—å" if is_night else "–î–µ–Ω—å"
        print(f"  {time_label}: {int(n_fraud)}/{int(n_total)} = {share:.3f}")

    # Activity per hour, ranked by fraud share
    hourly_activity = df.groupby('hour').agg(
        total=('transaction_id', 'count'),
        fraud=('is_fraud', 'sum'),
        share=('is_fraud', 'mean'),
    ).round(4)

    print(f"\n‚è∞ –ê–ö–¢–ò–í–ù–û–°–¢–¨ –ü–û –ß–ê–°–ê–ú (—Ç–æ–ø-5 –ø–æ –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤—É):")
    top_fraud_hours = hourly_activity.sort_values('share', ascending=False).head()
    for hour, n_total, n_fraud, share in top_fraud_hours.itertuples(name=None):
        print(f"  {int(hour):02d}:00 - {int(n_fraud)}/{int(n_total)} = {share:.3f}")

    # Per-day transaction and fraud counts
    daily_stats = df.groupby('date').agg(
        total=('transaction_id', 'count'),
        fraud=('is_fraud', 'sum'),
    )

    print(f"\nüìä –°–¢–ê–¢–ò–°–¢–ò–ö–ê –ü–û –î–ù–Ø–ú (–ø–µ—Ä–≤—ã–µ 5 –¥–Ω–µ–π):")
    for day, n_total, n_fraud in daily_stats.head().itertuples(name=None):
        fraud_rate_day = n_fraud / n_total if n_total > 0 else 0
        print(f"  {day}: {int(n_total)} —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π, {int(n_fraud)} –º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–∏—Ö ({fraud_rate_day:.3f})")

    # First vs last day in the data; skipped when there are no rows at all.
    if not daily_stats.empty:
        first_day = daily_stats['total'].iloc[0]
        last_day = daily_stats['total'].iloc[-1]
        print(f"\n–¢—Ä–µ–Ω–¥ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π: –æ—Ç {first_day} –¥–æ {last_day} –≤ –¥–µ–Ω—å")

# Print the time-based statistics. This also adds the hour / day_of_week /
# date / is_night columns to `df` in place.
time_pattern(df)

#### География

In [None]:
def georgraphy(df, rates_table=None):
    """Print geography- and amount-related fraud statistics.

    Sections printed, in order: fraud share inside vs outside the home
    country; the five countries with the highest fraud share among those with
    at least 100 transactions; mean exchange rates; total and fraudulent USD
    volume; mean/median USD amount for the five countries with the highest
    mean; mean/median/coefficient of variation per card type; fraud share
    above vs at-or-below the 95th percentile of ``amount_usd``.

    Args:
        df: Transactions with ``is_outside_home_country``, a boolean
            ``is_fraud`` (it is used as a row mask), ``country``,
            ``amount_usd`` and ``card_type``.
        rates_table: Exchange-rate table whose numeric columns are averaged.
            Defaults to the module-level ``currency_df`` when omitted.

    Returns:
        None. Output goes to stdout.
    """
    min_transactions = 100  # countries below this volume are not ranked
    if rates_table is None:
        # Backward-compatible default: the table loaded at module level.
        rates_table = currency_df

    # Transactions inside the home country vs outside it
    outside_country_stats = df.groupby('is_outside_home_country')['is_fraud'].agg(['count', 'sum', 'mean'])

    print(f"üó∫Ô∏è –¢–†–ê–ù–ó–ê–ö–¶–ò–ò –ü–û –õ–û–ö–ê–¶–ò–ò:")
    # itertuples keeps the integer dtype of the counts; iterrows would upcast
    # them to float and print e.g. "12.0/345.0".
    for is_outside, n_total, n_fraud, share in outside_country_stats.itertuples(name=None):
        location = "–ó–∞ –ø—Ä–µ–¥–µ–ª–∞–º–∏ —Ä–æ–¥–Ω–æ–π —Å—Ç—Ä–∞–Ω—ã" if is_outside else "–í —Ä–æ–¥–Ω–æ–π —Å—Ç—Ä–∞–Ω–µ"
        pct_total = n_total / len(df) * 100
        print(f"  {location}: {int(n_fraud)}/{int(n_total)} = {share:.3f} ({pct_total:.1f}% –≤—Å–µ—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π)")

    # Countries with the highest fraud share.
    # The volume filter is applied BEFORE taking the top five; filtering after
    # head() let low-volume countries consume slots and shrink the list.
    fraud_by_country = df.groupby('country')['is_fraud'].agg(['count', 'sum', 'mean']).round(4)
    eligible = fraud_by_country[fraud_by_country['count'] >= min_transactions]
    top_fraud_countries = eligible.sort_values('mean', ascending=False).head()

    print(f"\nüö© –¢–û–ü-5 –°–¢–†–ê–ù –ü–û –î–û–õ–ï –ú–û–®–ï–ù–ù–ò–ß–ï–°–¢–í–ê:")
    for country, n_total, n_fraud, share in top_fraud_countries.itertuples(name=None):
        print(f"  {country}: {int(n_fraud)}/{int(n_total)} = {share:.3f}")

    # Conversion of amounts to USD
    print(f"\nüí± –ö–û–ù–í–ï–†–¢–ê–¶–ò–Ø –í –ï–î–ò–ù–£–Æ –í–ê–õ–Æ–¢–£ (USD):")

    # Mean rate per currency column (non-numeric columns such as the date are skipped)
    currency_rates = rates_table.mean(numeric_only=True)
    print("–°—Ä–µ–¥–Ω–∏–µ –∫—É—Ä—Å—ã –≤–∞–ª—é—Ç –æ—Ç–Ω–æ—Å–∏—Ç–µ–ª—å–Ω–æ USD:")
    for currency, rate in currency_rates.items():
        if currency != 'USD':
            print(f"  1 USD = {rate:.3f} {currency}")

    # USD volume statistics
    total_volume_usd = df['amount_usd'].sum()
    fraud_volume_usd = df[df['is_fraud']]['amount_usd'].sum()

    print(f"\nüí∞ –û–ë–™–ï–ú–´ –í USD:")
    print(f"–û–±—â–∏–π –æ–±–æ—Ä–æ—Ç: ${total_volume_usd:,.0f}")
    print(f"–û–±–æ—Ä–æ—Ç –º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–∏—Ö –æ–ø–µ—Ä–∞—Ü–∏–π: ${fraud_volume_usd:,.0f}")
    print(f"–î–æ–ª—è –º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–æ–≥–æ –æ–±–æ—Ä–æ—Ç–∞: {fraud_volume_usd/total_volume_usd:.2%}")

    # Mean and median amount per country (top five by mean)
    country_amounts = df.groupby('country')['amount_usd'].agg(['count', 'mean', 'median']).round(2)
    country_amounts = country_amounts.sort_values('mean', ascending=False)

    print(f"\nüíµ –°–†–ï–î–ù–ò–ï –°–£–ú–ú–´ –¢–†–ê–ù–ó–ê–ö–¶–ò–ô –ü–û –°–¢–†–ê–ù–ê–ú:")
    for country, n_total, mean_usd, median_usd in country_amounts.head().itertuples(name=None):
        print(f"  {country}: {int(n_total)} —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π, —Å—Ä–µ–¥–Ω—è—è ${mean_usd:.0f}, –º–µ–¥–∏–∞–Ω–Ω–∞—è ${median_usd:.0f}")

    # Mean and median amount per card type
    card_amounts = df.groupby('card_type')['amount_usd'].agg(['count', 'mean', 'median', 'std']).round(2)

    print(f"üí≥ –°–£–ú–ú–´ –¢–†–ê–ù–ó–ê–ö–¶–ò–ô –ü–û –¢–ò–ü–ê–ú –ö–ê–†–¢:")
    for card_type, _, mean_usd, median_usd, std_usd in card_amounts.itertuples(name=None):
        cv = std_usd / mean_usd if mean_usd > 0 else 0  # coefficient of variation
        print(f"  {card_type}: —Å—Ä–µ–¥–Ω—è—è ${mean_usd:.0f}, –º–µ–¥–∏–∞–Ω–Ω–∞—è ${median_usd:.0f}, –≤–∞—Ä–∏–∞—Ü–∏—è {cv:.2f}")

    # Large-amount analysis: fraud share above vs at-or-below the 95th percentile
    print(f"\nüìä –ê–ù–ê–õ–ò–ó –ë–û–õ–¨–®–ò–• –û–¢–ö–õ–û–ù–ï–ù–ò–ô:")
    q95 = df['amount_usd'].quantile(0.95)
    q99 = df['amount_usd'].quantile(0.99)
    max_amount = df['amount_usd'].max()

    large_transactions = df[df['amount_usd'] > q95]
    large_fraud_rate = large_transactions['is_fraud'].mean()
    normal_fraud_rate = df[df['amount_usd'] <= q95]['is_fraud'].mean()

    print(f"95-–π –ø—Ä–æ—Ü–µ–Ω—Ç–∏–ª—å: ${q95:.0f}")
    print(f"99-–π –ø—Ä–æ—Ü–µ–Ω—Ç–∏–ª—å: ${q99:.0f}")
    print(f"–ú–∞–∫—Å–∏–º–∞–ª—å–Ω–∞—è —Å—É–º–º–∞: ${max_amount:.0f}")
    print(f"–î–æ–ª—è –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–∞ –≤ –∫—Ä—É–ø–Ω—ã—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è—Ö (>95%): {large_fraud_rate:.3f}")
    print(f"–î–æ–ª—è –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–∞ –≤ –æ–±—ã—á–Ω—ã—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏—è—Ö (‚â§95%): {normal_fraud_rate:.3f}")
    print(f"Lift –∫—Ä—É–ø–Ω—ã—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π: {large_fraud_rate/normal_fraud_rate:.2f}x")

# Print the geography and amount statistics (reads `amount_usd` and the
# module-level `currency_df`).
georgraphy(df)

#### Итоги

In [None]:
def results(df):
    """Print the closing summary of the exploratory analysis.

    The class ratio, the outside-home-country lift, the three countries with
    the highest fraud share, the large-amount lift and the fraudulent share of
    USD volume are computed from ``df``. The bullets under section 3 and the
    last bullet of section 4 are fixed text and are NOT derived from ``df``.

    Args:
        df: Transactions with a boolean ``is_fraud`` (it is used as a row
            mask), ``is_outside_home_country`` (both True and False must be
            present), ``country`` and ``amount_usd``.

    Returns:
        None. Output goes to stdout.
    """
    fraud_rate = df['is_fraud'].mean()
    # Legitimate-to-fraud ratio derived from the data; this used to be the
    # hard-coded literal "14.3".
    legit_to_fraud = (1 - fraud_rate) / fraud_rate if fraud_rate else float("inf")

    outside_country_stats = df.groupby('is_outside_home_country')['is_fraud'].agg(['count', 'sum', 'mean'])
    fraud_by_country = df.groupby('country')['is_fraud'].agg(['count', 'sum', 'mean']).round(4)
    fraud_by_country = fraud_by_country.sort_values('mean', ascending=False)

    q95 = df['amount_usd'].quantile(0.95)
    large_fraud_rate = df[df['amount_usd'] > q95]['is_fraud'].mean()
    normal_fraud_rate = df[df['amount_usd'] <= q95]['is_fraud'].mean()
    total_volume_usd = df['amount_usd'].sum()
    fraud_volume_usd = df[df['is_fraud']]['amount_usd'].sum()

    print("\n" + "="*60)
    print("üìã –ö–õ–Æ–ß–ï–í–´–ï –ò–ù–°–ê–ô–¢–´ –ò –í–´–í–û–î–´")
    print("="*60)

    print("\nüéØ –û–°–ù–û–í–ù–´–ï –ù–ê–•–û–î–ö–ò:")

    print("\n1. –î–ò–°–ë–ê–õ–ê–ù–° –ö–õ–ê–°–°–û–í:")
    print(f"   ‚Ä¢ {fraud_rate:.1%} –º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–∏—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π (—Å–æ–æ—Ç–Ω–æ—à–µ–Ω–∏–µ {legit_to_fraud:.1f}:1)")
    print(f"   ‚Ä¢ –≠—Ç–æ —É–º–µ—Ä–µ–Ω–Ω—ã–π –¥–∏—Å–±–∞–ª–∞–Ω—Å, –ø–æ–¥—Ö–æ–¥—è—â–∏–π –¥–ª—è ML-–º–æ–¥–µ–ª–µ–π")

    print("\n2. –ì–ï–û–ì–†–ê–§–ò–ß–ï–°–ö–ò–ï –ü–ê–¢–¢–ï–†–ù–´:")
    outside_lift = outside_country_stats.loc[True, 'mean'] / outside_country_stats.loc[False, 'mean']
    print(f"   ‚Ä¢ –¢—Ä–∞–Ω–∑–∞–∫—Ü–∏–∏ –∑–∞ –ø—Ä–µ–¥–µ–ª–∞–º–∏ —Ä–æ–¥–Ω–æ–π —Å—Ç—Ä–∞–Ω—ã –∏–º–µ—é—Ç {outside_lift:.2f}x –±–æ–ª—å—à–∏–π —Ä–∏—Å–∫")
    print(f"   ‚Ä¢ –¢–æ–ø —Ä–∏—Å–∫–æ–≤–∞–Ω–Ω—ã–µ —Å—Ç—Ä–∞–Ω—ã: {', '.join(fraud_by_country.head(3).index.tolist())}")

    # Section 3 is fixed narrative text; none of it is computed here.
    print("\n3. –í–†–ï–ú–ï–ù–ù–´–ï –ê–ù–û–ú–ê–õ–ò–ò:")
    print(f"   ‚Ä¢ –ú–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–æ –ø—Ä–∞–∫—Ç–∏—á–µ—Å–∫–∏ –æ–¥–∏–Ω–∞–∫–æ–≤–æ –≤ –≤—ã—Ö–æ–¥–Ω—ã–µ –∏ –±—É–¥–Ω–∏")
    print(f"   ‚Ä¢ –ù–æ—á–Ω—ã–µ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–∏ –º–µ–Ω–µ–µ —Ä–∏—Å–∫–æ–≤–∞–Ω–Ω—ã (–≤–æ–∑–º–æ–∂–Ω–æ, –º–µ–Ω—å—à–µ –∞–∫—Ç–∏–≤–Ω–æ—Å—Ç–∏)")
    print(f"   ‚Ä¢ –ü–∏–∫–æ–≤—ã–µ —á–∞—Å—ã –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–∞: 17:00, 22:00-23:00")

    print("\n4. –§–ò–ù–ê–ù–°–û–í–´–ï –ü–ê–¢–¢–ï–†–ù–´:")
    print(f"   ‚Ä¢ –ö—Ä—É–ø–Ω—ã–µ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–∏ (>95% –ø–µ—Ä—Ü–µ–Ω—Ç–∏–ª—å) –∏–º–µ—é—Ç {large_fraud_rate/normal_fraud_rate:.2f}x –±–æ–ª—å—à–∏–π —Ä–∏—Å–∫")
    print(f"   ‚Ä¢ –î–æ–ª—è –º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–æ–≥–æ –æ–±–æ—Ä–æ—Ç–∞ ({fraud_volume_usd/total_volume_usd:.1%}) –≤—ã—à–µ –¥–æ–ª–∏ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π")
    print(f"   ‚Ä¢ –¢–∏–ø—ã –∫–∞—Ä—Ç –ø–æ–∫–∞–∑—ã–≤–∞—é—Ç —Å—Ö–æ–∂–∏–µ –ø–∞—Ç—Ç–µ—Ä–Ω—ã –º–æ—à–µ–Ω–Ω–∏—á–µ—Å—Ç–≤–∞")

    print(f"\n‚úÖ EDA –ó–ê–í–ï–†–®–ï–ù")
    print(f"–ü—Ä–æ–∞–Ω–∞–ª–∏–∑–∏—Ä–æ–≤–∞–Ω–æ {len(df):,} —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π —Å {df.columns.nunique()} –ø—Ä–∏–∑–Ω–∞–∫–∞–º–∏")
    # Counts every column whose name contains "is_" (this includes `is_fraud` itself).
    print(f"–í—ã—è–≤–ª–µ–Ω–æ {len([col for col in df.columns if 'is_' in col])} –±–∏–Ω–∞—Ä–Ω—ã—Ö –∏–Ω–¥–∏–∫–∞—Ç–æ—Ä–æ–≤ —Ä–∏—Å–∫–∞")

# Print the closing summary.
results(df)

#### Иллюстрации

In [None]:
# Figure 1: number of rows per `is_fraud` value.
plt.figure(figsize=(5, 4))
sns.countplot(x="is_fraud", data=df, palette="Set2")
plt.title("–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ –º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–∏—Ö –∏ –Ω–µ–º–æ—à–µ–Ω–Ω–∏—á–µ—Å–∫–∏—Ö —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π")
plt.xlabel("is_fraud")
plt.ylabel("–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ")
plt.show()

# Figure 2: histogram (50 bins, with KDE) of the raw `amount` column.
# This plots `amount`, not the USD-normalised `amount_usd`.
plt.figure(figsize=(8, 5))
sns.histplot(df["amount"], bins=50, kde=True)
plt.title("–†–∞—Å–ø—Ä–µ–¥–µ–ª–µ–Ω–∏–µ —Å—É–º–º—ã —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π")
plt.xlabel("–°—É–º–º–∞")
plt.show()

# Figure 3: correlation heatmap. Only float64 / int64 / bool columns are
# selected, so columns of any other dtype are left out of the matrix.
num_df = df.select_dtypes(include=["float64", "int64", "bool"])
plt.figure(figsize=(10, 8))
sns.heatmap(num_df.corr(), annot=True, fmt=".2f", cmap="coolwarm")
plt.title("–ö–æ—Ä—Ä–µ–ª—è—Ü–∏—è —á–∏—Å–ª–æ–≤—ã—Ö –ø—Ä–∏–∑–Ω–∞–∫–æ–≤")
plt.show()

# Figure 4: transactions per calendar day. The `date` column is (re)computed
# from `timestamp` here, so this cell does not rely on time_pattern() having run.
if "timestamp" in df.columns:
    df["date"] = pd.to_datetime(df["timestamp"]).dt.date
    daily_counts = df.groupby("date")["transaction_id"].count()
    plt.figure(figsize=(12, 5))
    daily_counts.plot()
    plt.title("–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π –ø–æ –¥–Ω—è–º")
    plt.ylabel("–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ")
    plt.xlabel("–î–∞—Ç–∞")
    plt.show()

# Figure 5: the ten countries with the most transactions.
top_countries = df["country"].value_counts().head(10)
plt.figure(figsize=(10, 5))
sns.barplot(x=top_countries.index, y=top_countries.values, palette="viridis")
plt.title("–¢–æ–ø-10 —Å—Ç—Ä–∞–Ω –ø–æ –∫–æ–ª–∏—á–µ—Å—Ç–≤—É —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π")
plt.ylabel("–ö–æ–ª–∏—á–µ—Å—Ç–≤–æ —Ç—Ä–∞–Ω–∑–∞–∫—Ü–∏–π")
plt.xticks(rotation=45)
plt.show()

print("\nEDA –∑–∞–≤–µ—Ä—à—ë–Ω.")