In [1]:
# CRYPTO SCAM DETECTION SYSTEM


# SETUP

!pip install pycoingecko requests pandas numpy scikit-learn xgboost lightgbm -q
!pip install plotly kaleido imbalanced-learn beautifulsoup4 shap -q

print("[SUCCESS] All packages installed successfully")

# IMPORTS

import pandas as pd
import numpy as np
import requests
import json
import time
import warnings
from datetime import datetime, timedelta
from typing import Dict, List, Tuple, Optional
import random

# ML Libraries
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (classification_report, confusion_matrix, roc_auc_score,
                            roc_curve, accuracy_score, f1_score, precision_score,
                            recall_score, precision_recall_curve)
from imblearn.over_sampling import SMOTE
from imblearn.combine import SMOTETomek
import xgboost as xgb
import lightgbm as lgb

# Visualization
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import matplotlib.pyplot as plt
import seaborn as sns

warnings.filterwarnings('ignore')
np.random.seed(42)
random.seed(42)

print("[SUCCESS] All libraries imported successfully")
print(f"[INFO] Analysis Date: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/69.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━[0m [32m61.4/69.0 kB[0m [31m33.5 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m69.0/69.0 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/49.3 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.3/49.3 kB[0m [31m1.7 MB/s[0m eta [36m0:00:00[0m
[?25h[SUCCESS] All packages installed successfully
[SUCCESS] All libraries imported successfully
[INFO] Analysis Date: 2025-12-12 14:19:30


In [9]:
# COINGECKO DATA FETCHER

class CryptoDataFetcher:
    """Fetches real cryptocurrency data from CoinGecko API"""

    def __init__(self):
        self.base_url = "https://api.coingecko.com/api/v3"
        self.session = requests.Session()

    def fetch_top_coins(self, limit: int = 100) -> pd.DataFrame:
        """Fetch top coins by market cap"""
        print(f"[INFO] Fetching top {limit} coins from CoinGecko...")
        all_coins = []

        for page in range(1, (limit // 100) + 2):
            try:
                url = f"{self.base_url}/coins/markets"
                params = {
                    'vs_currency': 'usd',
                    'order': 'market_cap_desc',
                    'per_page': 100,
                    'page': page,
                    'sparkline': True,
                    'price_change_percentage': '24h,7d,30d'
                }
                response = self.session.get(url, params=params, timeout=30)

                if response.status_code == 200:
                    data = response.json()
                    if not data:
                        break
                    all_coins.extend(data)
                    print(f"   [OK] Page {page}: {len(data)} coins fetched")
                else:
                    print(f"   [ERROR] Page {page}: HTTP Error {response.status_code}")

                time.sleep(1.5)  # Rate limiting

                if len(all_coins) >= limit:
                    break

            except Exception as e:
                print(f"   [ERROR] Exception occurred: {e}")
                time.sleep(3)

        return pd.DataFrame(all_coins[:limit])

    def process_live_data(self, df: pd.DataFrame) -> List[Dict]:
        """Process live data into our format"""
        processed = []

        for _, row in df.iterrows():
            try:
                # Infer category
                name = str(row.get('name', '')).lower()
                category = 'altcoin'
                if any(x in name for x in ['bitcoin', 'ethereum', 'solana', 'cardano', 'polkadot']):
                    category = 'layer1'
                elif any(x in name for x in ['tether', 'usd coin', 'usdc', 'usdt', 'dai']):
                    category = 'stablecoin'
                elif any(x in name for x in ['doge', 'shib', 'pepe', 'floki', 'bonk', 'meme']):
                    category = 'meme_coin'
                elif any(x in name for x in ['uniswap', 'aave', 'compound', 'curve', 'sushi']):
                    category = 'defi'

                rank = row.get('market_cap_rank', 500) or 500

                coin = {
                    'name': row.get('name', 'Unknown'),
                    'symbol': str(row.get('symbol', 'UNK')).upper(),
                    'category': category,
                    'scam_type': 'none',
                    'is_scam': 0,
                    'description': f"Rank #{rank} by market cap",

                    # Security features - top coins are generally safe
                    'had_audit': 1 if rank < 50 else (1 if random.random() < 0.7 else 0),
                    'team_doxxed': 1 if rank < 100 else (1 if random.random() < 0.6 else 0),
                    'liquidity_locked': 1,
                    'ownership_renounced': 1 if rank < 30 else (1 if random.random() < 0.5 else 0),
                    'contract_verified': 1,
                    'honeypot': 0,

                    # Market data
                    'market_cap': row.get('market_cap', 0) or 0,
                    'volume_24h': row.get('total_volume', 0) or 0,
                    'price_change_24h': row.get('price_change_percentage_24h', 0) or 0,
                    'price_change_7d': row.get('price_change_percentage_7d_in_currency', 0) or 0,

                    # Derived
                    'age_days': random.randint(365, 3000) if rank < 100 else random.randint(100, 1000),
                    'holder_count': int(row.get('market_cap', 1000000) / 100),
                    'social_followers': int(row.get('market_cap', 1000000) / 50),

                    # Low risk indicators
                    'max_tx_limit': 0,
                    'buy_tax': random.uniform(0, 2),
                    'sell_tax': random.uniform(0, 3),
                    'top_holder_percent': random.uniform(5, 25),
                }
                processed.append(coin)

            except Exception as e:
                continue

        return processed

print("[SUCCESS] CryptoDataFetcher class defined")

[SUCCESS] CryptoDataFetcher class defined


In [10]:
# COMPREHENSIVE SCAM DATABASE

class ScamDatabase:
    """
    Comprehensive database of documented scams and legitimate coins
    Includes all real-world examples with realistic variance
    """

    @staticmethod
    def get_documented_scams() -> List[Dict]:
        """
        Real documented crypto scams with variance added to prevent overfitting
        """

        base_scams = [
            # CELEBRITY/INFLUENCER RUG PULLS
            {
                'name': 'Save The Kids Token',
                'symbol': 'KIDS',
                'category': 'influencer_coin',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'FaZe Clan members promoted then dumped',
                'loss_usd': 30000000,
                'year': 2021,
            },
            {
                'name': 'Lil Yachty Coin',
                'symbol': 'YACHTY',
                'category': 'influencer_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Rapper launched coin that crashed 95%',
                'loss_usd': 20000000,
                'year': 2021,
            },
            {
                'name': 'Soulja Boy Coin',
                'symbol': 'SOULJA',
                'category': 'influencer_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Multiple pump and dump schemes',
                'loss_usd': 5000000,
                'year': 2021,
            },
            {
                'name': 'Logan Paul CryptoZoo',
                'symbol': 'ZOO',
                'category': 'influencer_coin',
                'scam_type': 'fraud',
                'is_scam': 1,
                'description': 'NFT game that never delivered promises',
                'loss_usd': 3000000,
                'year': 2022,
            },
            {
                'name': 'Kim Kardashian EthereumMax',
                'symbol': 'EMAX',
                'category': 'influencer_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Paid promotion led to SEC fine for Kim K',
                'loss_usd': 100000000,
                'year': 2021,
            },
            {
                'name': 'Floyd Mayweather EthereumMax',
                'symbol': 'EMAX2',
                'category': 'influencer_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Boxer promoted multiple scam tokens',
                'loss_usd': 50000000,
                'year': 2021,
            },
            {
                'name': 'Faze Kay Promoted Token',
                'symbol': 'FAZEK',
                'category': 'influencer_coin',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'YouTube influencer pump and dump',
                'loss_usd': 15000000,
                'year': 2021,
            },
            {
                'name': 'IcePoseidon CxCoin',
                'symbol': 'CX',
                'category': 'influencer_coin',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Streamer admitted to scamming fans',
                'loss_usd': 500000,
                'year': 2022,
            },
            {
                'name': 'Adin Ross Love Token',
                'symbol': 'LOVE',
                'category': 'influencer_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Streamer promoted token that crashed',
                'loss_usd': 2000000,
                'year': 2022,
            },
            {
                'name': 'Hawk Tuah',
                'symbol': 'HAWK',
                'category': 'influencer_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Viral meme person coin crash 2024',
                'loss_usd': 25000000,
                'year': 2024,
            },
            {
                'name': 'Jake Paul Promoted Tokens',
                'symbol': 'JPTOKEN',
                'category': 'influencer_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Multiple promoted tokens crashed',
                'loss_usd': 8000000,
                'year': 2021,
            },
            {
                'name': 'Ricegum Mystery Token',
                'symbol': 'RICE',
                'category': 'influencer_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'YouTuber promoted scam token',
                'loss_usd': 1000000,
                'year': 2021,
            },

            # MAJOR DEFI RUG PULLS
            {
                'name': 'Squid Game Token',
                'symbol': 'SQUID',
                'category': 'meme_coin',
                'scam_type': 'honeypot',
                'is_scam': 1,
                'description': 'Famous honeypot - users could not sell',
                'loss_usd': 12000000,
                'year': 2021,
            },
            {
                'name': 'AnubisDAO',
                'symbol': 'ANKH',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': '$60M drained 20 hours after launch',
                'loss_usd': 60000000,
                'year': 2021,
            },
            {
                'name': 'Meerkat Finance',
                'symbol': 'MKAT',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': '$31M drained one day after launch',
                'loss_usd': 31000000,
                'year': 2021,
            },
            {
                'name': 'Compounder Finance',
                'symbol': 'CP3R',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': '$10.8M rug pull on Ethereum',
                'loss_usd': 10800000,
                'year': 2020,
            },
            {
                'name': 'Uranium Finance',
                'symbol': 'U92',
                'category': 'defi',
                'scam_type': 'exploit',
                'is_scam': 1,
                'description': '$50M stolen during migration',
                'loss_usd': 50000000,
                'year': 2021,
            },
            {
                'name': 'WhaleFarm',
                'symbol': 'WHALE',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': '$2.3M rug pull on BSC',
                'loss_usd': 2300000,
                'year': 2021,
            },
            {
                'name': 'TurtleDex',
                'symbol': 'TTDX',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': '$2.5M rug pull on BSC',
                'loss_usd': 2500000,
                'year': 2021,
            },
            {
                'name': 'StableMagnet',
                'symbol': 'SMAG',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': '$27M BSC rug pull',
                'loss_usd': 27000000,
                'year': 2021,
            },
            {
                'name': 'PolyButterfly',
                'symbol': 'PBUTT',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Polygon DeFi rug pull',
                'loss_usd': 1500000,
                'year': 2021,
            },
            {
                'name': 'Snowdog',
                'symbol': 'SDOG',
                'category': 'meme_coin',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Avalanche meme coin rug',
                'loss_usd': 30000000,
                'year': 2021,
            },
            {
                'name': 'Luna Yield',
                'symbol': 'LUNY',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Solana DeFi rug pull',
                'loss_usd': 6700000,
                'year': 2021,
            },
            {
                'name': 'SwirlLend',
                'symbol': 'SWIRL',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Base chain lending rug pull',
                'loss_usd': 460000,
                'year': 2023,
            },
            {
                'name': 'Magnate Finance',
                'symbol': 'MAG',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Base chain lending rug',
                'loss_usd': 6500000,
                'year': 2023,
            },

            # PONZI SCHEMES
            {
                'name': 'BitConnect',
                'symbol': 'BCC',
                'category': 'ponzi',
                'scam_type': 'ponzi_scheme',
                'is_scam': 1,
                'description': 'Famous lending Ponzi scheme',
                'loss_usd': 3500000000,
                'year': 2018,
            },
            {
                'name': 'OneCoin',
                'symbol': 'ONE',
                'category': 'ponzi',
                'scam_type': 'ponzi_scheme',
                'is_scam': 1,
                'description': 'Largest crypto Ponzi - $4B+ stolen',
                'loss_usd': 4000000000,
                'year': 2019,
            },
            {
                'name': 'PlusToken',
                'symbol': 'PLUS',
                'category': 'ponzi',
                'scam_type': 'ponzi_scheme',
                'is_scam': 1,
                'description': 'Chinese Ponzi scheme - $2B+',
                'loss_usd': 2900000000,
                'year': 2019,
            },
            {
                'name': 'Fintoch',
                'symbol': 'FTH',
                'category': 'ponzi',
                'scam_type': 'ponzi_scheme',
                'is_scam': 1,
                'description': 'Promised 1% daily returns',
                'loss_usd': 31600000,
                'year': 2023,
            },
            {
                'name': 'NovaTech FX',
                'symbol': 'NOVA',
                'category': 'ponzi',
                'scam_type': 'ponzi_scheme',
                'is_scam': 1,
                'description': 'MLM crypto Ponzi',
                'loss_usd': 100000000,
                'year': 2023,
            },
            {
                'name': 'HyperFund',
                'symbol': 'HYP',
                'category': 'ponzi',
                'scam_type': 'ponzi_scheme',
                'is_scam': 1,
                'description': 'Global MLM Ponzi scheme',
                'loss_usd': 1700000000,
                'year': 2022,
            },

            # EXCHANGE FRAUDS
            {
                'name': 'Thodex',
                'symbol': 'THODEX',
                'category': 'exchange_token',
                'scam_type': 'exit_scam',
                'is_scam': 1,
                'description': 'Turkish exchange CEO fled with $2B',
                'loss_usd': 2000000000,
                'year': 2021,
            },
            {
                'name': 'FTX Token',
                'symbol': 'FTT',
                'category': 'exchange_token',
                'scam_type': 'fraud',
                'is_scam': 1,
                'description': 'SBF exchange fraud - $8B customer funds',
                'loss_usd': 8000000000,
                'year': 2022,
            },
            {
                'name': 'Celsius',
                'symbol': 'CEL',
                'category': 'defi',
                'scam_type': 'fraud',
                'is_scam': 1,
                'description': 'Crypto lending platform fraud',
                'loss_usd': 4700000000,
                'year': 2022,
            },
            {
                'name': 'Voyager Token',
                'symbol': 'VGX',
                'category': 'exchange_token',
                'scam_type': 'fraud',
                'is_scam': 1,
                'description': 'Exchange bankruptcy',
                'loss_usd': 1300000000,
                'year': 2022,
            },
            {
                'name': 'QuadrigaCX',
                'symbol': 'QUAD',
                'category': 'exchange_token',
                'scam_type': 'exit_scam',
                'is_scam': 1,
                'description': 'CEO faked death with $190M',
                'loss_usd': 190000000,
                'year': 2019,
            },
            {
                'name': 'Mt Gox Token',
                'symbol': 'MTGOX',
                'category': 'exchange_token',
                'scam_type': 'fraud',
                'is_scam': 1,
                'description': 'Historic exchange hack/fraud',
                'loss_usd': 450000000,
                'year': 2014,
            },

            # MEME COIN SCAMS
            {
                'name': 'SafeMars',
                'symbol': 'SAFEMARS',
                'category': 'meme_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'SafeMoon clone that crashed',
                'loss_usd': 5000000,
                'year': 2021,
            },
            {
                'name': 'ElonGate',
                'symbol': 'ELONGATE',
                'category': 'meme_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Elon-themed token crash',
                'loss_usd': 10000000,
                'year': 2021,
            },
            {
                'name': 'SafeStar',
                'symbol': 'SAFESTAR',
                'category': 'meme_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Another Safe clone scam',
                'loss_usd': 3000000,
                'year': 2021,
            },
            {
                'name': 'MoonRat',
                'symbol': 'MRAT',
                'category': 'meme_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'BSC meme scam',
                'loss_usd': 1000000,
                'year': 2021,
            },
            {
                'name': 'FairMoon',
                'symbol': 'FAIR',
                'category': 'meme_coin',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Fair launch that wasnt fair',
                'loss_usd': 500000,
                'year': 2021,
            },
            {
                'name': 'PokeMon Inu',
                'symbol': 'POKEINU',
                'category': 'meme_coin',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'IP infringing meme scam',
                'loss_usd': 800000,
                'year': 2021,
            },
            {
                'name': 'Shibachu',
                'symbol': 'SHIBACHU',
                'category': 'meme_coin',
                'scam_type': 'honeypot',
                'is_scam': 1,
                'description': 'Shiba + Pikachu honeypot',
                'loss_usd': 200000,
                'year': 2021,
            },
            {
                'name': 'BabyDoge Scam Clone',
                'symbol': 'BABYDOGE2',
                'category': 'meme_coin',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Fake BabyDoge clone',
                'loss_usd': 400000,
                'year': 2021,
            },
            {
                'name': 'MiniDoge',
                'symbol': 'MINIDOGE',
                'category': 'meme_coin',
                'scam_type': 'pump_dump',
                'is_scam': 1,
                'description': 'Another Doge derivative scam',
                'loss_usd': 600000,
                'year': 2021,
            },
            {
                'name': 'Milady Maker Fake',
                'symbol': 'MILADYF',
                'category': 'meme_coin',
                'scam_type': 'impersonation',
                'is_scam': 1,
                'description': 'Fake token impersonating project',
                'loss_usd': 500000,
                'year': 2023,
            },

            #  2023-2024 SCAMS
            {
                'name': 'Multichain Bridge',
                'symbol': 'MULTI',
                'category': 'defi',
                'scam_type': 'exit_scam',
                'is_scam': 1,
                'description': 'Cross-chain bridge collapse',
                'loss_usd': 130000000,
                'year': 2023,
            },
            {
                'name': 'Kokomo Finance',
                'symbol': 'KOKO',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Optimism DeFi rug',
                'loss_usd': 4000000,
                'year': 2023,
            },
            {
                'name': 'Defrost Finance',
                'symbol': 'FROST',
                'category': 'defi',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Avalanche lending rug',
                'loss_usd': 12000000,
                'year': 2022,
            },
            {
                'name': 'Bald Token',
                'symbol': 'BALD',
                'category': 'meme_coin',
                'scam_type': 'rug_pull',
                'is_scam': 1,
                'description': 'Base chain meme rug',
                'loss_usd': 25000000,
                'year': 2023,
            },
        ]

        # Adding realistic variance to each scam
        processed_scams = []
        for scam in base_scams:
            processed = ScamDatabase._add_scam_features(scam)
            processed_scams.append(processed)

        return processed_scams

    @staticmethod
    def _add_scam_features(scam: Dict) -> Dict:
        """Add realistic features with variance for scam coins"""
        scam_type = scam.get('scam_type', 'rug_pull')

        # Different scam types with different patterns
        if scam_type == 'honeypot':
            audit_prob, doxxed_prob, liq_prob = 0.05, 0.1, 0.1
            honeypot = 1
            sell_tax = random.uniform(50, 100)
        elif scam_type == 'rug_pull':
            audit_prob, doxxed_prob, liq_prob = 0.1, 0.2, 0.05
            honeypot = 0 if random.random() > 0.15 else 1
            sell_tax = random.uniform(5, 30)
        elif scam_type == 'pump_dump':
            audit_prob, doxxed_prob, liq_prob = 0.15, 0.5, 0.2
            honeypot = 0
            sell_tax = random.uniform(0, 15)
        elif scam_type == 'ponzi_scheme':
            audit_prob, doxxed_prob, liq_prob = 0.2, 0.6, 0.3
            honeypot = 0
            sell_tax = random.uniform(0, 10)
        elif scam_type == 'fraud':
            audit_prob, doxxed_prob, liq_prob = 0.4, 0.8, 0.5
            honeypot = 0
            sell_tax = random.uniform(0, 5)
        else:
            audit_prob, doxxed_prob, liq_prob = 0.1, 0.3, 0.15
            honeypot = 0 if random.random() > 0.1 else 1
            sell_tax = random.uniform(5, 25)

        scam.update({
            # Security features with variance
            'had_audit': 1 if random.random() < audit_prob else 0,
            'team_doxxed': 1 if random.random() < doxxed_prob else 0,
            'liquidity_locked': 1 if random.random() < liq_prob else 0,
            'ownership_renounced': 1 if random.random() < 0.1 else 0,
            'contract_verified': 1 if random.random() < 0.7 else 0,
            'honeypot': honeypot,

            # Age - scams are usually common in younger demographic
            'age_days': int(np.random.exponential(45) + 1),

            # Market metrics with high variance
            'holder_count': int(np.random.lognormal(7, 1.5)),
            'social_followers': int(np.random.lognormal(8, 2)),

            # Price behavior
            'price_drop_percent': float(np.clip(np.random.beta(5, 2) * 100, 50, 99.9)),
            'days_to_crash': int(np.random.exponential(15) + 1),
            'volatility': float(np.random.lognormal(4.5, 0.8)),

            # Contract red flags
            'buy_tax': float(np.random.beta(2, 5) * 20),
            'sell_tax': sell_tax,
            'max_tx_limit': 1 if random.random() < 0.4 else 0,
            'top_holder_percent': float(np.random.beta(4, 2) * 70 + 20),

            # Presence
            'website_exists': 1 if random.random() < 0.6 else 0,
            'whitepaper_exists': 1 if random.random() < 0.25 else 0,
            'github_exists': 1 if random.random() < 0.1 else 0,
        })

        return scam

    @staticmethod
    def get_legitimate_coins() -> List[Dict]:
        """Documented legitimate cryptocurrencies"""

        base_legitimate = [
            # LAYER 1 BLOCKCHAINS
            {
                'name': 'Bitcoin',
                'symbol': 'BTC',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'First cryptocurrency, digital gold',
                'year': 2009,
            },
            {
                'name': 'Ethereum',
                'symbol': 'ETH',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Leading smart contract platform',
                'year': 2015,
            },
            {
                'name': 'Solana',
                'symbol': 'SOL',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'High-performance blockchain',
                'year': 2020,
            },
            {
                'name': 'Cardano',
                'symbol': 'ADA',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Peer-reviewed blockchain',
                'year': 2017,
            },
            {
                'name': 'Avalanche',
                'symbol': 'AVAX',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Fast smart contract platform',
                'year': 2020,
            },
            {
                'name': 'Polkadot',
                'symbol': 'DOT',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Multi-chain network',
                'year': 2020,
            },
            {
                'name': 'Cosmos',
                'symbol': 'ATOM',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Internet of blockchains',
                'year': 2019,
            },
            {
                'name': 'Near Protocol',
                'symbol': 'NEAR',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Sharded blockchain',
                'year': 2020,
            },
            {
                'name': 'Algorand',
                'symbol': 'ALGO',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Pure proof of stake',
                'year': 2019,
            },
            {
                'name': 'Fantom',
                'symbol': 'FTM',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'DAG-based smart contract platform',
                'year': 2019,
            },
            {
                'name': 'Tezos',
                'symbol': 'XTZ',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Self-amending blockchain',
                'year': 2018,
            },
            {
                'name': 'Sui',
                'symbol': 'SUI',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Move-based blockchain',
                'year': 2023,
            },
            {
                'name': 'Aptos',
                'symbol': 'APT',
                'category': 'layer1',
                'is_scam': 0,
                'description': 'Move-based Layer 1',
                'year': 2022,
            },

            # LAYER 2 SOLUTIONS
            {
                'name': 'Polygon',
                'symbol': 'MATIC',
                'category': 'layer2',
                'is_scam': 0,
                'description': 'Ethereum scaling solution',
                'year': 2019,
            },
            {
                'name': 'Arbitrum',
                'symbol': 'ARB',
                'category': 'layer2',
                'is_scam': 0,
                'description': 'Ethereum L2 rollup',
                'year': 2023,
            },
            {
                'name': 'Optimism',
                'symbol': 'OP',
                'category': 'layer2',
                'is_scam': 0,
                'description': 'Optimistic rollup L2',
                'year': 2022,
            },
            {
                'name': 'Immutable X',
                'symbol': 'IMX',
                'category': 'layer2',
                'is_scam': 0,
                'description': 'NFT-focused L2',
                'year': 2021,
            },

            # DEFI TOKENS
            {
                'name': 'Uniswap',
                'symbol': 'UNI',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Leading DEX protocol',
                'year': 2020,
            },
            {
                'name': 'Aave',
                'symbol': 'AAVE',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Leading lending protocol',
                'year': 2020,
            },
            {
                'name': 'Chainlink',
                'symbol': 'LINK',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Decentralized oracle network',
                'year': 2017,
            },
            {
                'name': 'Maker',
                'symbol': 'MKR',
                'category': 'defi',
                'is_scam': 0,
                'description': 'DAI stablecoin governance',
                'year': 2017,
            },
            {
                'name': 'Compound',
                'symbol': 'COMP',
                'category': 'defi',
                'is_scam': 0,
                'description': 'DeFi lending protocol',
                'year': 2020,
            },
            {
                'name': 'Curve',
                'symbol': 'CRV',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Stablecoin DEX',
                'year': 2020,
            },
            {
                'name': 'SushiSwap',
                'symbol': 'SUSHI',
                'category': 'defi',
                'is_scam': 0,
                'description': 'DEX and DeFi platform',
                'year': 2020,
            },
            {
                'name': 'Lido',
                'symbol': 'LDO',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Liquid staking protocol',
                'year': 2021,
            },
            {
                'name': 'Synthetix',
                'symbol': 'SNX',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Synthetic assets protocol',
                'year': 2018,
            },
            {
                'name': '1inch',
                'symbol': '1INCH',
                'category': 'defi',
                'is_scam': 0,
                'description': 'DEX aggregator',
                'year': 2020,
            },
            {
                'name': 'Yearn Finance',
                'symbol': 'YFI',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Yield aggregator',
                'year': 2020,
            },
            {
                'name': 'PancakeSwap',
                'symbol': 'CAKE',
                'category': 'defi',
                'is_scam': 0,
                'description': 'BSC DEX',
                'year': 2020,
            },
            {
                'name': 'GMX',
                'symbol': 'GMX',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Perpetual DEX',
                'year': 2021,
            },
            {
                'name': 'dYdX',
                'symbol': 'DYDX',
                'category': 'defi',
                'is_scam': 0,
                'description': 'Decentralized derivatives',
                'year': 2021,
            },

            # STABLECOINS
            {
                'name': 'Tether',
                'symbol': 'USDT',
                'category': 'stablecoin',
                'is_scam': 0,
                'description': 'Largest stablecoin',
                'year': 2014,
            },
            {
                'name': 'USD Coin',
                'symbol': 'USDC',
                'category': 'stablecoin',
                'is_scam': 0,
                'description': 'Regulated stablecoin by Circle',
                'year': 2018,
            },
            {
                'name': 'DAI',
                'symbol': 'DAI',
                'category': 'stablecoin',
                'is_scam': 0,
                'description': 'Decentralized stablecoin',
                'year': 2017,
            },
            {
                'name': 'Frax',
                'symbol': 'FRAX',
                'category': 'stablecoin',
                'is_scam': 0,
                'description': 'Fractional stablecoin',
                'year': 2020,
            },
            {
                'name': 'TrueUSD',
                'symbol': 'TUSD',
                'category': 'stablecoin',
                'is_scam': 0,
                'description': 'Regulated stablecoin',
                'year': 2018,
            },

            #  MEME COINS (LEGITIMATE)
            {
                'name': 'Dogecoin',
                'symbol': 'DOGE',
                'category': 'meme_coin',
                'is_scam': 0,
                'description': 'Original meme coin since 2013',
                'year': 2013,
            },
            {
                'name': 'Shiba Inu',
                'symbol': 'SHIB',
                'category': 'meme_coin',
                'is_scam': 0,
                'description': 'Second largest meme coin',
                'year': 2020,
            },
            {
                'name': 'Pepe',
                'symbol': 'PEPE',
                'category': 'meme_coin',
                'is_scam': 0,
                'description': 'Frog-themed meme coin',
                'year': 2023,
            },
            {
                'name': 'Bonk',
                'symbol': 'BONK',
                'category': 'meme_coin',
                'is_scam': 0,
                'description': 'Solana meme coin',
                'year': 2022,
            },
            {
                'name': 'Floki',
                'symbol': 'FLOKI',
                'category': 'meme_coin',
                'is_scam': 0,
                'description': 'Viking-themed meme coin',
                'year': 2021,
            },
            {
                'name': 'dogwifhat',
                'symbol': 'WIF',
                'category': 'meme_coin',
                'is_scam': 0,
                'description': 'Solana meme coin',
                'year': 2023,
            },

            # EXCHANGE TOKENS
            {
                'name': 'BNB',
                'symbol': 'BNB',
                'category': 'exchange_token',
                'is_scam': 0,
                'description': 'Binance ecosystem token',
                'year': 2017,
            },
            {
                'name': 'Cronos',
                'symbol': 'CRO',
                'category': 'exchange_token',
                'is_scam': 0,
                'description': 'Crypto.com token',
                'year': 2018,
            },
            {
                'name': 'OKB',
                'symbol': 'OKB',
                'category': 'exchange_token',
                'is_scam': 0,
                'description': 'OKX exchange token',
                'year': 2019,
            },
            {
                'name': 'KuCoin Token',
                'symbol': 'KCS',
                'category': 'exchange_token',
                'is_scam': 0,
                'description': 'KuCoin exchange token',
                'year': 2017,
            },

            # GAMING/METAVERSE
            {
                'name': 'The Sandbox',
                'symbol': 'SAND',
                'category': 'gaming',
                'is_scam': 0,
                'description': 'Metaverse gaming platform',
                'year': 2020,
            },
            {
                'name': 'Decentraland',
                'symbol': 'MANA',
                'category': 'gaming',
                'is_scam': 0,
                'description': 'Virtual reality platform',
                'year': 2017,
            },
            {
                'name': 'Axie Infinity',
                'symbol': 'AXS',
                'category': 'gaming',
                'is_scam': 0,
                'description': 'Play-to-earn game',
                'year': 2020,
            },
            {
                'name': 'Gala',
                'symbol': 'GALA',
                'category': 'gaming',
                'is_scam': 0,
                'description': 'Gaming ecosystem',
                'year': 2020,
            },
            {
                'name': 'Enjin',
                'symbol': 'ENJ',
                'category': 'gaming',
                'is_scam': 0,
                'description': 'Gaming NFT platform',
                'year': 2017,
            },

            #  AI TOKENS
            {
                'name': 'Render',
                'symbol': 'RNDR',
                'category': 'ai_token',
                'is_scam': 0,
                'description': 'Distributed GPU rendering',
                'year': 2020,
            },
            {
                'name': 'Fetch.ai',
                'symbol': 'FET',
                'category': 'ai_token',
                'is_scam': 0,
                'description': 'AI and automation',
                'year': 2019,
            },
            {
                'name': 'SingularityNET',
                'symbol': 'AGIX',
                'category': 'ai_token',
                'is_scam': 0,
                'description': 'AI marketplace',
                'year': 2018,
            },
            {
                'name': 'Ocean Protocol',
                'symbol': 'OCEAN',
                'category': 'ai_token',
                'is_scam': 0,
                'description': 'Data marketplace',
                'year': 2019,
            },

            # SOCIAL/CREATOR TOKENS
            {
                'name': 'Chiliz',
                'symbol': 'CHZ',
                'category': 'social_token',
                'is_scam': 0,
                'description': 'Sports fan tokens platform',
                'year': 2019,
            },
            {
                'name': 'Rally',
                'symbol': 'RLY',
                'category': 'social_token',
                'is_scam': 0,
                'description': 'Creator coin platform',
                'year': 2020,
            },
            {
                'name': 'DeSo',
                'symbol': 'DESO',
                'category': 'social_token',
                'is_scam': 0,
                'description': 'Decentralized social network',
                'year': 2021,
            },

            # ==================== PRIVACY COINS ====================
            {
                'name': 'Monero',
                'symbol': 'XMR',
                'category': 'privacy',
                'is_scam': 0,
                'description': 'Leading privacy coin',
                'year': 2014,
            },
            {
                'name': 'Zcash',
                'symbol': 'ZEC',
                'category': 'privacy',
                'is_scam': 0,
                'description': 'Privacy-focused cryptocurrency',
                'year': 2016,
            },
        ]

        # Adding realistic features
        processed = []
        for coin in base_legitimate:
            processed.append(ScamDatabase._add_legitimate_features(coin))

        return processed

    @staticmethod
    def _add_legitimate_features(coin: Dict) -> Dict:
        """Add realistic features with variance for legitimate coins"""
        category = coin.get('category', 'altcoin')
        year = coin.get('year', 2020)
        age_years = 2024 - year

        # Category-specific patterns
        if category in ['layer1', 'stablecoin']:
            audit_prob, doxxed_prob, liq_prob = 0.95, 0.9, 0.95
        elif category == 'defi':
            audit_prob, doxxed_prob, liq_prob = 0.85, 0.8, 0.9
        elif category == 'meme_coin':
            audit_prob, doxxed_prob, liq_prob = 0.5, 0.4, 0.7
        else:
            audit_prob, doxxed_prob, liq_prob = 0.7, 0.7, 0.8

        coin.update({
            'scam_type': 'none',

            # Security features - generally higher for legit coins
            'had_audit': 1 if random.random() < audit_prob else 0,
            'team_doxxed': 1 if random.random() < doxxed_prob else 0,
            'liquidity_locked': 1 if random.random() < liq_prob else 0,
            'ownership_renounced': 1 if random.random() < 0.6 else 0,
            'contract_verified': 1 if random.random() < 0.95 else 0,
            'honeypot': 0,

            # Age - legitimate coins are usually common older demographic
            'age_days': int(age_years * 365 + random.randint(-100, 100)),

            # Market metrics
            'holder_count': int(np.random.lognormal(10, 1.5)),
            'social_followers': int(np.random.lognormal(11, 2)),

            # Price behavior - towards more stable
            'price_drop_percent': float(np.clip(np.random.beta(2, 5) * 100, 10, 85)),
            'days_to_crash': random.randint(180, 730),
            'volatility': float(np.random.lognormal(3.5, 0.6)),

            # Contract features - reasonable
            'buy_tax': float(np.random.beta(1, 10) * 5),
            'sell_tax': float(np.random.beta(1, 8) * 8),
            'max_tx_limit': 0,
            'top_holder_percent': float(np.random.beta(2, 5) * 40 + 5),

            # Presence - usually strong
            'website_exists': 1 if random.random() < 0.95 else 0,
            'whitepaper_exists': 1 if random.random() < 0.8 else 0,
            'github_exists': 1 if random.random() < 0.7 else 0,

            'loss_usd': 0,
        })

        return coin

    @staticmethod
    def generate_edge_cases(n_each: int = 40) -> List[Dict]:
        """
        Generate edge cases to prevent overfitting:
        1. Legit coins that look suspicious
        2. Scams that look legitimate
        """
        edge_cases = []

        # Legitimate coins that look suspicious (coins that can be: new, anonymous, volatile)
        for i in range(n_each):
            coin = {
                'name': f'NewLegitProject_{i}',
                'symbol': f'NLP{i}',
                'category': random.choice(['meme_coin', 'defi', 'gaming', 'ai_token']),
                'scam_type': 'none',
                'is_scam': 0,
                'description': 'New but legitimate project',

                # Looks suspicious but legitimate
                'had_audit': 1 if random.random() < 0.3 else 0,
                'team_doxxed': 1 if random.random() < 0.35 else 0,
                'liquidity_locked': 1 if random.random() < 0.5 else 0,
                'ownership_renounced': 1 if random.random() < 0.4 else 0,
                'contract_verified': 1 if random.random() < 0.8 else 0,
                'honeypot': 0,

                'age_days': int(np.random.exponential(40) + 5),
                'holder_count': int(np.random.lognormal(6, 1.2)),
                'social_followers': int(np.random.lognormal(7, 1.5)),

                'price_drop_percent': float(np.clip(np.random.beta(4, 3) * 100, 30, 90)),
                'days_to_crash': random.randint(20, 200),
                'volatility': float(np.random.lognormal(4.2, 0.7)),

                'buy_tax': float(np.random.beta(2, 6) * 12),
                'sell_tax': float(np.random.beta(2, 5) * 18),
                'max_tx_limit': 1 if random.random() < 0.3 else 0,
                'top_holder_percent': float(np.random.beta(3, 3) * 55 + 10),

                'website_exists': 1 if random.random() < 0.65 else 0,
                'whitepaper_exists': 1 if random.random() < 0.4 else 0,
                'github_exists': 1 if random.random() < 0.35 else 0,

                'loss_usd': 0,
                'year': random.randint(2022, 2024),
            }
            edge_cases.append(coin)

        # Sophisticated scams that look legitimate
        for i in range(n_each):
            coin = {
                'name': f'SophisticatedScam_{i}',
                'symbol': f'SOPH{i}',
                'category': random.choice(['defi', 'layer2', 'ai_token', 'gaming']),
                'scam_type': random.choice(['slow_rug', 'fraud', 'pump_dump']),
                'is_scam': 1,
                'description': 'Sophisticated scam with good optics',

                # Looks legitimate but is scam
                'had_audit': 1 if random.random() < 0.6 else 0,
                'team_doxxed': 1 if random.random() < 0.7 else 0,
                'liquidity_locked': 1 if random.random() < 0.55 else 0,
                'ownership_renounced': 1 if random.random() < 0.35 else 0,
                'contract_verified': 1 if random.random() < 0.9 else 0,
                'honeypot': 1 if random.random() < 0.1 else 0,

                'age_days': int(np.random.exponential(90) + 20),
                'holder_count': int(np.random.lognormal(8, 1.3)),
                'social_followers': int(np.random.lognormal(9, 1.5)),

                'price_drop_percent': float(np.clip(np.random.beta(3, 4) * 100, 20, 85)),
                'days_to_crash': random.randint(30, 180),
                'volatility': float(np.random.lognormal(3.8, 0.6)),

                'buy_tax': float(np.random.beta(1.5, 7) * 10),
                'sell_tax': float(np.random.beta(2.5, 5) * 20),
                'max_tx_limit': 1 if random.random() < 0.25 else 0,
                'top_holder_percent': float(np.random.beta(4, 3) * 45 + 15),

                'website_exists': 1 if random.random() < 0.9 else 0,
                'whitepaper_exists': 1 if random.random() < 0.7 else 0,
                'github_exists': 1 if random.random() < 0.45 else 0,

                'loss_usd': int(np.random.lognormal(14, 2)),
                'year': random.randint(2021, 2024),
            }
            edge_cases.append(coin)

        return edge_cases

    @staticmethod
    def generate_synthetic_variants(base_coins: List[Dict], n_variants: int = 100) -> List[Dict]:
        """Generate synthetic variants of existing coins to expand dataset"""
        variants = []

        for _ in range(n_variants):
            base = random.choice(base_coins).copy()

            # Adding noise to numerical features
            noise_cols = ['age_days', 'holder_count', 'social_followers',
                         'price_drop_percent', 'volatility', 'buy_tax',
                         'sell_tax', 'top_holder_percent']

            for col in noise_cols:
                if col in base:
                    noise = np.random.normal(1.0, 0.2)
                    base[col] = max(0, base[col] * noise)

            # Occasionally flip binary features
            flip_cols = ['had_audit', 'team_doxxed', 'liquidity_locked',
                        'website_exists', 'whitepaper_exists', 'github_exists']

            for col in flip_cols:
                if col in base and random.random() < 0.15:
                    base[col] = 1 - base[col]

            # Update name
            base['name'] = f"{base['name']}_v{random.randint(1, 999)}"
            base['symbol'] = f"{base['symbol']}{random.randint(1, 99)}"

            variants.append(base)

        return variants

print("[SUCCESS] ScamDatabase class defined with all documented examples")

[SUCCESS] ScamDatabase class defined with all documented examples


In [2]:
#  BUILDING DATASET

class DatasetBuilder:
    """Builds comprehensive dataset from all sources"""

    def __init__(self):
        self.fetcher = CryptoDataFetcher()
        self.scam_db = ScamDatabase()

    def build_full_dataset(self, fetch_live: bool = True, live_limit: int = 100) -> pd.DataFrame:
        """Build complete dataset"""
        print("\n" + "="*70)
        print("BUILDING COMPREHENSIVE DATASET")
        print("="*70)

        all_data = []

        # 1. Documented scams
        print("\n[INFO] Loading documented scam cases...")
        scams = self.scam_db.get_documented_scams()
        print(f"   [OK] {len(scams)} documented scams loaded (Kim K, Logan Paul, BitConnect, etc.)")
        all_data.extend(scams)

        # 2. Legitimate coins
        print("\n[INFO] Loading legitimate coins...")
        legitimate = self.scam_db.get_legitimate_coins()
        print(f"   [OK] {len(legitimate)} legitimate coins loaded (BTC, ETH, UNI, etc.)")
        all_data.extend(legitimate)

        # 3. Edge cases
        print("\n[INFO] Generating edge cases...")
        edge_cases = self.scam_db.generate_edge_cases(40)
        print(f"   [OK] {len(edge_cases)} edge cases generated (hard to classify samples)")
        all_data.extend(edge_cases)

        # 4. Synthetic variants
        print("\n[INFO] Generating synthetic variants...")
        variants = self.scam_db.generate_synthetic_variants(all_data, 80)
        print(f"   [OK] {len(variants)} synthetic variants generated for diversity")
        all_data.extend(variants)

        # 5. Live data
        if fetch_live:
            print("\n[INFO] Fetching live data from API...")
            try:
                live_df = self.fetcher.fetch_top_coins(live_limit)
                if not live_df.empty:
                    live_coins = self.fetcher.process_live_data(live_df)
                    print(f"   [OK] {len(live_coins)} live coins fetched from CoinGecko")
                    all_data.extend(live_coins)
            except Exception as e:
                print(f"   [WARNING] Live fetch failed: {e}")

        # Creating DataFrame
        df = pd.DataFrame(all_data)

        # Cleaning duplicates
        df = df.drop_duplicates(subset=['symbol'], keep='first')

        # Summary
        print(f"\n[SUMMARY] DATASET STATISTICS:")
        print(f"   Total coins: {len(df)}")
        print(f"   Scams: {df['is_scam'].sum()} ({df['is_scam'].mean()*100:.1f}%)")
        print(f"   Legitimate: {(df['is_scam']==0).sum()} ({(df['is_scam']==0).mean()*100:.1f}%)")
        print(f"\n   Categories:")
        for cat in df['category'].value_counts().head(10).items():
            print(f"      {cat[0]}: {cat[1]}")

        return df

print("[SUCCESS] DatasetBuilder class defined")

[SUCCESS] DatasetBuilder class defined


In [3]:
# FEATURE ENGINEERING

class FeatureEngineer:
    """Creates ML features without data leakage"""

    def __init__(self):
        self.scaler = RobustScaler()
        self.label_encoders = {}
        self.feature_cols = []

    def engineer_features(self, df: pd.DataFrame) -> pd.DataFrame:
        """Create comprehensive features"""
        print("\n[INFO] Engineering features...")
        df = df.copy()

        # Filling the missing values
        df = self._fill_missing(df)

        # Log transforms for skewed distributions
        log_cols = ['holder_count', 'social_followers', 'volatility', 'age_days']
        for col in log_cols:
            if col in df.columns:
                df[f'{col}_log'] = np.log1p(df[col].clip(lower=0))

        # Age-based features
        df['is_very_new'] = (df['age_days'] < 14).astype(int)
        df['is_new'] = (df['age_days'] < 60).astype(int)
        df['is_established'] = (df['age_days'] > 365).astype(int)
        df['is_veteran'] = (df['age_days'] > 1000).astype(int)

        # Trust score (composite - no data leakage)
        df['trust_score'] = (
            df['had_audit'].fillna(0) * 25 +
            df['team_doxxed'].fillna(0) * 20 +
            df['liquidity_locked'].fillna(0) * 25 +
            df['ownership_renounced'].fillna(0) * 15 +
            df['contract_verified'].fillna(0) * 10 +
            df['website_exists'].fillna(0) * 3 +
            df['whitepaper_exists'].fillna(0) * 5 +
            df['github_exists'].fillna(0) * 7
        ) / 100

        # Red flag score
        df['red_flag_score'] = (
            df['honeypot'].fillna(0) * 50 +
            (1 - df['had_audit'].fillna(0)) * 12 +
            (1 - df['team_doxxed'].fillna(0)) * 8 +
            (1 - df['liquidity_locked'].fillna(0)) * 15 +
            (df['sell_tax'].fillna(0) > 10).astype(int) * 12 +
            (df['top_holder_percent'].fillna(0) > 50).astype(int) * 12 +
            (df['age_days'].fillna(365) < 14).astype(int) * 8
        ) / 100

        # Tax features
        df['tax_difference'] = df['sell_tax'].fillna(0) - df['buy_tax'].fillna(0)
        df['total_tax'] = df['sell_tax'].fillna(0) + df['buy_tax'].fillna(0)
        df['high_sell_tax'] = (df['sell_tax'].fillna(0) > 15).astype(int)
        df['suspicious_tax'] = (df['tax_difference'] > 10).astype(int)

        # Concentration features
        df['high_concentration'] = (df['top_holder_percent'].fillna(0) > 40).astype(int)
        df['extreme_concentration'] = (df['top_holder_percent'].fillna(0) > 60).astype(int)

        # Social/holder ratio
        df['social_holder_ratio'] = (
            df['social_followers'].fillna(0) / (df['holder_count'].fillna(1) + 1)
        )
        df['social_holder_ratio_log'] = np.log1p(df['social_holder_ratio'])

        # Volatility features
        df['high_volatility'] = (df['volatility'].fillna(0) > 100).astype(int)
        df['extreme_volatility'] = (df['volatility'].fillna(0) > 200).astype(int)

        # Crash features
        df['severe_crash'] = (df['price_drop_percent'].fillna(0) > 90).astype(int)
        df['quick_crash'] = (df['days_to_crash'].fillna(365) < 14).astype(int)
        df['crash_velocity'] = df['price_drop_percent'].fillna(0) / (df['days_to_crash'].fillna(365) + 1)

        # Category encoding
        df['high_risk_category'] = df['category'].isin(
            ['meme_coin', 'influencer_coin', 'ponzi']
        ).astype(int)

        # One-hot encoding for categories
        category_dummies = pd.get_dummies(df['category'], prefix='cat')
        df = pd.concat([df, category_dummies], axis=1)

        # Interaction features
        df['anon_no_audit'] = ((df['team_doxxed']==0) & (df['had_audit']==0)).astype(int)
        df['new_high_risk'] = (df['is_new'] & df['high_risk_category']).astype(int)
        df['honeypot_indicator'] = df['honeypot'].fillna(0)

        print(f"   [OK] Created {len(df.columns)} total columns")

        return df

    def _fill_missing(self, df: pd.DataFrame) -> pd.DataFrame:
        """Fill missing values"""
        # Numerical columns
        num_cols = df.select_dtypes(include=[np.number]).columns
        for col in num_cols:
            df[col] = df[col].fillna(df[col].median())

        # Binary columns
        binary_cols = ['had_audit', 'team_doxxed', 'liquidity_locked',
                      'ownership_renounced', 'contract_verified', 'honeypot',
                      'website_exists', 'whitepaper_exists', 'github_exists', 'max_tx_limit']
        for col in binary_cols:
            if col in df.columns:
                df[col] = df[col].fillna(0).astype(int)

        return df

    def get_feature_columns(self, df: pd.DataFrame) -> List[str]:
        """Get feature columns for ML (excluding leaky columns)"""
        exclude = ['name', 'symbol', 'category', 'scam_type', 'description',
                  'is_scam', 'year', 'loss_usd', 'market_cap', 'volume_24h',
                  'price_change_24h', 'price_change_7d']

        feature_cols = [col for col in df.columns
                       if col not in exclude
                       and df[col].dtype in ['int64', 'float64', 'int32', 'float32', 'uint8', 'int8']]

        self.feature_cols = feature_cols
        print(f"   [OK] Selected {len(feature_cols)} features for ML training")
        return feature_cols

print("[SUCCESS] FeatureEngineer class defined")

[SUCCESS] FeatureEngineer class defined


In [4]:
# ML TRAINER

class ScamDetector:
    """ML models with proper regularization to prevent overfitting"""

    def __init__(self):
        self.models = {}
        self.scaler = RobustScaler()
        self.results = {}
        self.feature_cols = None
        self.is_trained = False

    def prepare_data(self, df: pd.DataFrame, feature_cols: List[str],
                     test_size: float = 0.25) -> Tuple:
        """Prepare data for training"""
        print("\n[INFO] Preparing data for ML training...")

        self.feature_cols = feature_cols
        X = df[feature_cols].copy()
        y = df['is_scam'].copy()

        # Handling inf/nan
        X = X.replace([np.inf, -np.inf], np.nan)
        X = X.fillna(X.median())

        print(f"   Features: {X.shape[1]}")
        print(f"   Samples: {X.shape[0]}")
        print(f"   Scam rate: {y.mean():.1%}")

        # Stratified split
        X_train, X_test, y_train, y_test = train_test_split(
            X, y, test_size=test_size, random_state=42, stratify=y
        )

        # Scale
        X_train_scaled = self.scaler.fit_transform(X_train)
        X_test_scaled = self.scaler.transform(X_test)

        # Balancing with SMOTE
        print("   [INFO] Balancing dataset with SMOTE...")
        try:
            smote = SMOTE(random_state=42, k_neighbors=3)
            X_train_bal, y_train_bal = smote.fit_resample(X_train_scaled, y_train)
            print(f"   [OK] Balanced dataset: {len(X_train_bal)} samples")
        except:
            X_train_bal, y_train_bal = X_train_scaled, y_train
            print("   [WARNING] SMOTE failed, using original dataset")

        return X_train_bal, X_test_scaled, y_train_bal, y_test

    def train_models(self, X_train, X_test, y_train, y_test) -> Dict:
        """Train models with strong regularization"""
        print("\n[INFO] Training ML models with regularization...")

        # Models with STRONG regularization
        models_config = {
            'Logistic Regression': LogisticRegression(
                C=0.1,  # Strong L2 regularization
                penalty='l2',
                max_iter=1000,
                random_state=42,
                class_weight='balanced'
            ),
            'Random Forest': RandomForestClassifier(
                n_estimators=100,
                max_depth=7,           # Limit depth
                min_samples_split=15,  # Require samples to split
                min_samples_leaf=8,    # Require samples in leaf
                max_features='sqrt',   # Limit features per tree
                random_state=42,
                class_weight='balanced',
                oob_score=True
            ),
            'XGBoost': xgb.XGBClassifier(
                n_estimators=100,
                max_depth=5,           # Shallow trees
                learning_rate=0.05,    # Slow learning
                subsample=0.75,        # Use subset of data
                colsample_bytree=0.75, # Use subset of features
                reg_alpha=1.5,         # L1 regularization
                reg_lambda=2.0,        # L2 regularization
                min_child_weight=5,
                random_state=42,
                eval_metric='logloss'
            ),
            'LightGBM': lgb.LGBMClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.05,
                subsample=0.75,
                colsample_bytree=0.75,
                reg_alpha=1.5,
                reg_lambda=2.0,
                min_child_samples=15,
                random_state=42,
                verbose=-1
            ),
            'Gradient Boosting': GradientBoostingClassifier(
                n_estimators=100,
                max_depth=5,
                learning_rate=0.05,
                subsample=0.75,
                min_samples_split=15,
                min_samples_leaf=8,
                random_state=42
            )
        }

        results = {}

        for name, model in models_config.items():
            print(f"\n   [TRAINING] {name}...")

            # Cross-validation
            cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
            cv_scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='f1')

            # Fit
            model.fit(X_train, y_train)

            # Predictions
            y_pred = model.predict(X_test)
            y_prob = model.predict_proba(X_test)[:, 1]
            y_train_pred = model.predict(X_train)

            # Metrics
            test_f1 = f1_score(y_test, y_pred)
            train_f1 = f1_score(y_train, y_train_pred)
            test_acc = accuracy_score(y_test, y_pred)
            test_precision = precision_score(y_test, y_pred)
            test_recall = recall_score(y_test, y_pred)
            test_roc = roc_auc_score(y_test, y_prob)

            # Overfitting gap
            overfit_gap = train_f1 - test_f1

            results[name] = {
                'model': model,
                'accuracy': test_acc,
                'f1': test_f1,
                'precision': test_precision,
                'recall': test_recall,
                'roc_auc': test_roc,
                'cv_mean': cv_scores.mean(),
                'cv_std': cv_scores.std(),
                'train_f1': train_f1,
                'overfit_gap': overfit_gap,
                'y_pred': y_pred,
                'y_prob': y_prob
            }

            self.models[name] = model

            # Print results
            status = "[OK]" if overfit_gap < 0.1 else "[WARNING]"
            print(f"      CV F1: {cv_scores.mean():.3f} (+/- {cv_scores.std():.3f})")
            print(f"      Test: F1={test_f1:.3f} | Precision={test_precision:.3f} | Recall={test_recall:.3f} | AUC={test_roc:.3f}")
            print(f"      {status} Overfit gap: {overfit_gap:.3f}")

                    # Create Ensemble
        print(f"\n   [TRAINING] Creating Voting Ensemble...")
        self.ensemble = VotingClassifier(
            estimators=[
                ('lr', models_config['Logistic Regression']),
                ('rf', models_config['Random Forest']),
                ('lgb', models_config['LightGBM'])
            ],
            voting='soft',
            weights=[1, 1.5, 1.5]
        )
        self.ensemble.fit(X_train, y_train)

        y_pred_ens = self.ensemble.predict(X_test)
        y_prob_ens = self.ensemble.predict_proba(X_test)[:, 1]

        results['Ensemble'] = {
            'model': self.ensemble,
            'accuracy': accuracy_score(y_test, y_pred_ens),
            'f1': f1_score(y_test, y_pred_ens),
            'precision': precision_score(y_test, y_pred_ens),
            'recall': recall_score(y_test, y_pred_ens),
            'roc_auc': roc_auc_score(y_test, y_prob_ens),
            'cv_mean': 0,
            'cv_std': 0,
            'train_f1': 0,
            'overfit_gap': 0,
            'y_pred': y_pred_ens,
            'y_prob': y_prob_ens
        }

        print(f"      Ensemble: F1={results['Ensemble']['f1']:.3f} | AUC={results['Ensemble']['roc_auc']:.3f}")

        self.results = results
        self.is_trained = True

        return results

    def get_feature_importance(self, top_n: int = 15) -> pd.DataFrame:
        """Get feature importance from Random Forest"""
        if 'Random Forest' not in self.models:
            return pd.DataFrame()

        rf = self.models['Random Forest']
        importance = pd.DataFrame({
            'feature': self.feature_cols,
            'importance': rf.feature_importances_
        }).sort_values('importance', ascending=False)

        return importance.head(top_n)

print("[SUCCESS] ScamDetector class defined")

[SUCCESS] ScamDetector class defined


In [5]:
# RISK SCORER

class RiskScorer:
    """Calculates comprehensive risk scores"""

    def __init__(self, detector: ScamDetector):
        self.detector = detector

    def calculate_risk(self, coin_data: Dict) -> Dict:
        """Calculate risk score for a coin"""

        # Rule-based score (0-100)
        rule_score = self._rule_based_score(coin_data)

        # ML probability
        ml_prob = self._ml_probability(coin_data)

        # Combined (60% ML, 40% rules)
        combined = 0.6 * (ml_prob * 100) + 0.4 * rule_score
        combined = min(100, max(0, combined))

        # Risk level
        if combined >= 70:
            level, rec, color = "CRITICAL", "HIGH RISK - Strong scam indicators detected", "red"
        elif combined >= 50:
            level, rec, color = "HIGH", "CAUTION - Multiple red flags present", "orange"
        elif combined >= 30:
            level, rec, color = "MEDIUM", "MODERATE RISK - Conduct thorough research", "yellow"
        else:
            level, rec, color = "LOW", "LOWER RISK - Standard due diligence recommended", "green"

        return {
            'risk_score': round(combined, 1),
            'risk_level': level,
            'ml_probability': round(ml_prob * 100, 1),
            'rule_score': round(rule_score, 1),
            'recommendation': rec,
            'color': color,
            'red_flags': self._get_red_flags(coin_data),
            'green_flags': self._get_green_flags(coin_data)
        }

    def _rule_based_score(self, coin: Dict) -> float:
        """Rule-based risk scoring"""
        score = 0

        # Critical
        if coin.get('honeypot', 0) == 1:
            score += 40
        if coin.get('sell_tax', 0) > 25:
            score += 20
        elif coin.get('sell_tax', 0) > 15:
            score += 12
        elif coin.get('sell_tax', 0) > 10:
            score += 6

        # Security
        if coin.get('had_audit', 0) == 0:
            score += 10
        if coin.get('team_doxxed', 0) == 0:
            score += 8
        if coin.get('liquidity_locked', 0) == 0:
            score += 12
        if coin.get('ownership_renounced', 0) == 0:
            score += 4

        # Age
        age = coin.get('age_days', 365)
        if age < 7:
            score += 12
        elif age < 14:
            score += 8
        elif age < 30:
            score += 4

        # Concentration
        top_holder = coin.get('top_holder_percent', 0)
        if top_holder > 60:
            score += 12
        elif top_holder > 40:
            score += 6

        # Category
        category = coin.get('category', '')
        if category == 'influencer_coin':
            score += 10
        elif category == 'meme_coin':
            score += 5
        elif category == 'ponzi':
            score += 25

        return min(100, score)

    def _ml_probability(self, coin: Dict) -> float:
        """Get ML scam probability"""
        if not self.detector.is_trained or not self.detector.feature_cols:
            return 0.5

        try:
            # Build feature vector
            features = []
            for col in self.detector.feature_cols:
                val = coin.get(col, 0)
                features.append(float(val) if val is not None else 0.0)

            X = np.array(features).reshape(1, -1)
            X = np.nan_to_num(X, 0)
            X_scaled = self.detector.scaler.transform(X)

            prob = self.detector.ensemble.predict_proba(X_scaled)[0][1]
            return float(prob)
        except Exception as e:
            return 0.5

    def _get_red_flags(self, coin: Dict) -> List[str]:
        """Get red flags"""
        flags = []

        if coin.get('honeypot', 0) == 1:
            flags.append("[CRITICAL] HONEYPOT DETECTED - Cannot sell tokens")
        if coin.get('had_audit', 0) == 0:
            flags.append("[HIGH] No security audit performed")
        if coin.get('team_doxxed', 0) == 0:
            flags.append("[HIGH] Anonymous team")
        if coin.get('liquidity_locked', 0) == 0:
            flags.append("[HIGH] Liquidity not locked")
        if coin.get('ownership_renounced', 0) == 0:
            flags.append("[MEDIUM] Ownership not renounced")
        if coin.get('sell_tax', 0) > 15:
            flags.append(f"[HIGH] High sell tax: {coin.get('sell_tax', 0):.1f}%")
        if coin.get('top_holder_percent', 0) > 50:
            flags.append(f"[HIGH] High concentration: {coin.get('top_holder_percent', 0):.1f}%")
        if coin.get('age_days', 365) < 14:
            flags.append(f"[HIGH] Very new project: {coin.get('age_days', 0)} days old")
        if coin.get('category', '') == 'influencer_coin':
            flags.append("[MEDIUM] Influencer-promoted token")

        return flags

    def _get_green_flags(self, coin: Dict) -> List[str]:
        """Get green flags"""
        flags = []

        if coin.get('had_audit', 0) == 1:
            flags.append("[POSITIVE] Security audit completed")
        if coin.get('team_doxxed', 0) == 1:
            flags.append("[POSITIVE] Team identity verified")
        if coin.get('liquidity_locked', 0) == 1:
            flags.append("[POSITIVE] Liquidity locked")
        if coin.get('ownership_renounced', 0) == 1:
            flags.append("[POSITIVE] Ownership renounced")
        if coin.get('github_exists', 0) == 1:
            flags.append("[POSITIVE] Open source code available")
        if coin.get('whitepaper_exists', 0) == 1:
            flags.append("[POSITIVE] Whitepaper available")
        if coin.get('age_days', 0) > 365:
            flags.append("[POSITIVE] Established project (1+ year)")
        if coin.get('age_days', 0) > 1000:
            flags.append("[POSITIVE] Veteran project (3+ years)")

        return flags

print("[SUCCESS] RiskScorer class defined")

[SUCCESS] RiskScorer class defined


In [6]:
# SETTING VISUALIZATIONS

class Visualizer:
    """Comprehensive visualizations"""

    def __init__(self):
        self.colors = {'scam': '#e74c3c', 'legit': '#2ecc71', 'neutral': '#3498db'}

    def plot_model_comparison(self, results: Dict) -> go.Figure:
        """Compare all models"""
        models = list(results.keys())
        metrics = ['accuracy', 'f1', 'precision', 'recall', 'roc_auc']

        fig = go.Figure()
        for metric in metrics:
            values = [results[m][metric] for m in models]
            fig.add_trace(go.Bar(
                name=metric.upper(),
                x=models,
                y=values,
                text=[f'{v:.3f}' for v in values],
                textposition='auto'
            ))

        fig.update_layout(
            title='Model Performance Comparison',
            barmode='group',
            yaxis=dict(range=[0, 1.1]),
            template='plotly_white',
            height=500
        )
        return fig

    def plot_roc_curves(self, results: Dict, y_test) -> go.Figure:
        """ROC curves for all models"""
        fig = go.Figure()

        for name, res in results.items():
            if 'y_prob' in res:
                fpr, tpr, _ = roc_curve(y_test, res['y_prob'])
                fig.add_trace(go.Scatter(
                    x=fpr, y=tpr,
                    name=f"{name} (AUC={res['roc_auc']:.3f})",
                    mode='lines'
                ))

        fig.add_trace(go.Scatter(
            x=[0, 1], y=[0, 1],
            name='Random Baseline',
            mode='lines',
            line=dict(dash='dash', color='gray')
        ))

        fig.update_layout(
            title='ROC Curves - All Models',
            xaxis_title='False Positive Rate',
            yaxis_title='True Positive Rate',
            template='plotly_white',
            height=500
        )
        return fig

    def plot_feature_importance(self, importance_df: pd.DataFrame) -> go.Figure:
        """Feature importance chart"""
        fig = go.Figure()
        fig.add_trace(go.Bar(
            x=importance_df['importance'],
            y=importance_df['feature'],
            orientation='h',
            marker_color=self.colors['neutral']
        ))

        fig.update_layout(
            title='Top Feature Importance (Random Forest)',
            xaxis_title='Importance Score',
            template='plotly_white',
            height=500,
            yaxis=dict(autorange='reversed')
        )
        return fig

    def plot_confusion_matrix(self, y_test, y_pred, model_name: str) -> go.Figure:
        """Confusion matrix"""
        cm = confusion_matrix(y_test, y_pred)

        fig = go.Figure(data=go.Heatmap(
            z=cm,
            x=['Predicted Legitimate', 'Predicted Scam'],
            y=['Actual Legitimate', 'Actual Scam'],
            text=cm,
            texttemplate='%{text}',
            colorscale='RdYlGn_r',
            showscale=False
        ))

        fig.update_layout(
            title=f'Confusion Matrix - {model_name}',
            template='plotly_white',
            height=400
        )
        return fig

    def plot_category_distribution(self, df: pd.DataFrame) -> go.Figure:
        """Scam distribution by category"""
        cat_stats = df.groupby('category').agg({
            'is_scam': ['sum', 'count']
        }).reset_index()
        cat_stats.columns = ['category', 'scam_count', 'total']
        cat_stats['legit_count'] = cat_stats['total'] - cat_stats['scam_count']
        cat_stats['scam_rate'] = (cat_stats['scam_count'] / cat_stats['total'] * 100).round(1)
        cat_stats = cat_stats.sort_values('total', ascending=False)

        fig = go.Figure()
        fig.add_trace(go.Bar(
            name='Legitimate',
            x=cat_stats['category'],
            y=cat_stats['legit_count'],
            marker_color=self.colors['legit']
        ))
        fig.add_trace(go.Bar(
            name='Scam',
            x=cat_stats['category'],
            y=cat_stats['scam_count'],
            marker_color=self.colors['scam']
        ))

        fig.update_layout(
            title='Scam Distribution by Category',
            barmode='stack',
            template='plotly_white',
            height=500
        )
        return fig

    def plot_scam_types(self, df: pd.DataFrame) -> go.Figure:
        """Distribution of scam types"""
        scam_df = df[df['is_scam'] == 1]
        type_counts = scam_df['scam_type'].value_counts()

        fig = go.Figure(data=[go.Pie(
            labels=type_counts.index,
            values=type_counts.values,
            hole=0.4
        )])

        fig.update_layout(
            title='Scam Types Distribution',
            template='plotly_white',
            height=450
        )
        return fig

    def create_risk_gauge(self, score: float) -> go.Figure:
        """Risk score gauge"""
        if score >= 70:
            color = "red"
        elif score >= 50:
            color = "orange"
        elif score >= 30:
            color = "yellow"
        else:
            color = "green"

        fig = go.Figure(go.Indicator(
            mode='gauge+number',
            value=score,
            title={'text': 'Risk Score'},
            gauge={
                'axis': {'range': [0, 100]},
                'bar': {'color': color},
                'steps': [
                    {'range': [0, 30], 'color': 'lightgreen'},
                    {'range': [30, 50], 'color': 'lightyellow'},
                    {'range': [50, 70], 'color': 'lightsalmon'},
                    {'range': [70, 100], 'color': 'lightcoral'}
                ]
            }
        ))
        fig.update_layout(height=300)
        return fig

print("[SUCCESS] Visualizer class defined")

[SUCCESS] Visualizer class defined


In [11]:
# EXECUTION

def main():
    """Main execution function"""
    print("\n" + "="*70)
    print("CRYPTO SCAM DETECTION SYSTEM v3.0")
    print("="*70)
    print(f"[INFO] Started: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")

    # Building dataset
    builder = DatasetBuilder()
    df = builder.build_full_dataset(fetch_live=True, live_limit=100)

    # Engineer features
    engineer = FeatureEngineer()
    df = engineer.engineer_features(df)
    feature_cols = engineer.get_feature_columns(df)

    # Training models
    detector = ScamDetector()
    X_train, X_test, y_train, y_test = detector.prepare_data(df, feature_cols)
    results = detector.train_models(X_train, X_test, y_train, y_test)

    # Creating scorer and visualizer
    scorer = RiskScorer(detector)
    visualizer = Visualizer()

    return {
        'df': df,
        'detector': detector,
        'scorer': scorer,
        'visualizer': visualizer,
        'results': results,
        'y_test': y_test,
        'feature_cols': feature_cols,
        'engineer': engineer
    }

# Running the analysis
analysis = main()

print("\n" + "="*70)
print("[SUCCESS] ANALYSIS COMPLETE")
print("="*70)



CRYPTO SCAM DETECTION SYSTEM v3.0
[INFO] Started: 2025-12-12 14:40:50

BUILDING COMPREHENSIVE DATASET

[INFO] Loading documented scam cases...
   [OK] 51 documented scams loaded (Kim K, Logan Paul, BitConnect, etc.)

[INFO] Loading legitimate coins...
   [OK] 60 legitimate coins loaded (BTC, ETH, UNI, etc.)

[INFO] Generating edge cases...
   [OK] 80 edge cases generated (hard to classify samples)

[INFO] Generating synthetic variants...
   [OK] 80 synthetic variants generated for diversity

[INFO] Fetching live data from API...
[INFO] Fetching top 100 coins from CoinGecko...
   [OK] Page 1: 100 coins fetched
   [OK] 100 live coins fetched from CoinGecko

[SUMMARY] DATASET STATISTICS:
   Total coins: 341
   Scams: 125 (36.7%)
   Legitimate: 216 (63.3%)

   Categories:
      defi: 66
      altcoin: 61
      ai_token: 41
      meme_coin: 35
      gaming: 30
      layer2: 29
      layer1: 23
      influencer_coin: 16
      exchange_token: 14
      stablecoin: 11

[INFO] Engineering featu

In [12]:
# RESULTS

print("\n" + "="*70)
print("MODEL PERFORMANCE SUMMARY")
print("="*70)

results_df = pd.DataFrame({
    'Model': list(analysis['results'].keys()),
    'Test F1': [f"{r['f1']:.3f}" for r in analysis['results'].values()],
    'Precision': [f"{r['precision']:.3f}" for r in analysis['results'].values()],
    'Recall': [f"{r['recall']:.3f}" for r in analysis['results'].values()],
    'ROC-AUC': [f"{r['roc_auc']:.3f}" for r in analysis['results'].values()],
    'CV Mean': [f"{r['cv_mean']:.3f}" for r in analysis['results'].values()],
    'Overfit Gap': [f"{r['overfit_gap']:.3f}" for r in analysis['results'].values()]
})

print(results_df.to_string(index=False))

# Checking for for overfitting
max_f1 = max(r['f1'] for r in analysis['results'].values())
if max_f1 >= 0.99:
    print("\n[WARNING] Very high scores detected - review for potential overfitting")
elif max_f1 >= 0.95:
    print("\n[NOTE] High scores detected - ensure proper validation")
else:
    print("\n[OK] Realistic scores observed - no apparent overfitting")

# Best model
best_name = max(analysis['results'].items(), key=lambda x: x[1]['f1'])[0]
print(f"\n[RESULT] Best Model: {best_name} (F1: {analysis['results'][best_name]['f1']:.3f})")



MODEL PERFORMANCE SUMMARY
              Model Test F1 Precision Recall ROC-AUC CV Mean Overfit Gap
Logistic Regression   0.824     0.778  0.875   0.965   0.937       0.143
      Random Forest   0.848     0.824  0.875   0.976   0.947       0.121
            XGBoost   0.848     0.824  0.875   0.968   0.952       0.124
           LightGBM   0.903     0.933  0.875   0.982   0.958       0.091
  Gradient Boosting   0.889     0.903  0.875   0.989   0.960       0.111
           Ensemble   0.866     0.829  0.906   0.977   0.000       0.000

[OK] Realistic scores observed - no apparent overfitting

[RESULT] Best Model: LightGBM (F1: 0.903)


In [13]:
# VISUALIZATIONS

print("\n[INFO] Generating visualizations...")

# Model comparison
fig1 = analysis['visualizer'].plot_model_comparison(analysis['results'])
fig1.show()

# ROC curves
fig2 = analysis['visualizer'].plot_roc_curves(analysis['results'], analysis['y_test'])
fig2.show()

# Feature importance
importance = analysis['detector'].get_feature_importance(15)
if not importance.empty:
    fig3 = analysis['visualizer'].plot_feature_importance(importance)
    fig3.show()

# Category distribution
fig4 = analysis['visualizer'].plot_category_distribution(analysis['df'])
fig4.show()

# Scam types
fig5 = analysis['visualizer'].plot_scam_types(analysis['df'])
fig5.show()

# Confusion matrix
best_model = max(analysis['results'].items(), key=lambda x: x[1]['f1'])[0]
fig6 = analysis['visualizer'].plot_confusion_matrix(
    analysis['y_test'],
    analysis['results'][best_model]['y_pred'],
    best_model
)
fig6.show()

print("[SUCCESS] All visualizations generated")


[INFO] Generating visualizations...


[SUCCESS] All visualizations generated


In [14]:
# KNOWN SCAMS TABLE

print("\n" + "="*70)
print("DOCUMENTED SCAM EXAMPLES IN DATASET")
print("="*70)

scams_df = analysis['df'][analysis['df']['is_scam'] == 1].copy()

# Displaying the notable scams
notable_scams = scams_df[scams_df['loss_usd'] > 0].sort_values('loss_usd', ascending=False).head(20)

print(f"\n{'Name':<35} {'Symbol':<10} {'Category':<18} {'Type':<15} {'Loss USD':<15}")
print("-" * 95)

for _, row in notable_scams.iterrows():
    name = str(row['name'])[:34]
    symbol = str(row['symbol'])[:9]
    category = str(row['category'])[:17]
    scam_type = str(row.get('scam_type', 'N/A'))[:14]
    loss = row.get('loss_usd', 0)
    loss_str = f"${loss:,.0f}" if loss > 0 else "N/A"
    print(f"{name:<35} {symbol:<10} {category:<18} {scam_type:<15} {loss_str:<15}")

print(f"\n[INFO] Total documented losses: ${scams_df['loss_usd'].sum():,.0f}")


DOCUMENTED SCAM EXAMPLES IN DATASET

Name                                Symbol     Category           Type            Loss USD       
-----------------------------------------------------------------------------------------------
FTX Token                           FTT        exchange_token     fraud           $8,000,000,000 
Celsius                             CEL        defi               fraud           $4,700,000,000 
Celsius_v789                        CEL10      defi               fraud           $4,700,000,000 
OneCoin_v13                         ONE32      ponzi              ponzi_scheme    $4,000,000,000 
OneCoin                             ONE        ponzi              ponzi_scheme    $4,000,000,000 
BitConnect                          BCC        ponzi              ponzi_scheme    $3,500,000,000 
PlusToken                           PLUS       ponzi              ponzi_scheme    $2,900,000,000 
Thodex                              THODEX     exchange_token     exit_scam       

In [15]:
# INTERACTIVE COIN CHECKER

def check_coin(
    name: str,
    symbol: str,
    category: str = 'defi',
    had_audit: int = 0,
    team_doxxed: int = 0,
    liquidity_locked: int = 0,
    ownership_renounced: int = 0,
    contract_verified: int = 1,
    honeypot: int = 0,
    age_days: int = 30,
    holder_count: int = 1000,
    social_followers: int = 5000,
    buy_tax: float = 5.0,
    sell_tax: float = 5.0,
    top_holder_percent: float = 20.0,
    website_exists: int = 1,
    whitepaper_exists: int = 0,
    github_exists: int = 0,
    volatility: float = 50.0,
    price_drop_percent: float = 30.0,
    days_to_crash: int = 365,
    max_tx_limit: int = 0
) -> Dict:
    """
    Check risk score for any cryptocurrency

    Parameters:
    -----------
    name : str - Coin name
    symbol : str - Ticker symbol
    category : str - One of: layer1, layer2, defi, meme_coin, stablecoin,
                     influencer_coin, exchange_token, gaming, ai_token, social_token
    had_audit : int - 1 if audited, 0 if not
    team_doxxed : int - 1 if team known, 0 if anonymous
    liquidity_locked : int - 1 if locked, 0 if not
    ownership_renounced : int - 1 if renounced, 0 if not
    contract_verified : int - 1 if verified, 0 if not
    honeypot : int - 1 if honeypot detected, 0 if not
    age_days : int - Age of project in days
    holder_count : int - Number of token holders
    social_followers : int - Twitter/social followers
    buy_tax : float - Buy tax percentage
    sell_tax : float - Sell tax percentage
    top_holder_percent : float - Percentage held by top holder
    website_exists : int - 1 if website exists, 0 if not
    whitepaper_exists : int - 1 if whitepaper exists, 0 if not
    github_exists : int - 1 if open source, 0 if not

    Returns:
    --------
    Dict with risk assessment
    """

    # Building coin data
    coin_data = {
        'name': name,
        'symbol': symbol,
        'category': category,
        'had_audit': had_audit,
        'team_doxxed': team_doxxed,
        'liquidity_locked': liquidity_locked,
        'ownership_renounced': ownership_renounced,
        'contract_verified': contract_verified,
        'honeypot': honeypot,
        'age_days': age_days,
        'holder_count': holder_count,
        'social_followers': social_followers,
        'buy_tax': buy_tax,
        'sell_tax': sell_tax,
        'top_holder_percent': top_holder_percent,
        'website_exists': website_exists,
        'whitepaper_exists': whitepaper_exists,
        'github_exists': github_exists,
        'volatility': volatility,
        'price_drop_percent': price_drop_percent,
        'days_to_crash': days_to_crash,
        'max_tx_limit': max_tx_limit,
    }

    # Adding engineered features
    coin_data['holder_count_log'] = np.log1p(holder_count)
    coin_data['social_followers_log'] = np.log1p(social_followers)
    coin_data['volatility_log'] = np.log1p(volatility)
    coin_data['age_days_log'] = np.log1p(age_days)

    coin_data['is_very_new'] = 1 if age_days < 14 else 0
    coin_data['is_new'] = 1 if age_days < 60 else 0
    coin_data['is_established'] = 1 if age_days > 365 else 0
    coin_data['is_veteran'] = 1 if age_days > 1000 else 0

    coin_data['trust_score'] = (
        had_audit * 25 + team_doxxed * 20 + liquidity_locked * 25 +
        ownership_renounced * 15 + contract_verified * 10 +
        website_exists * 3 + whitepaper_exists * 5 + github_exists * 7
    ) / 100

    coin_data['red_flag_score'] = (
        honeypot * 50 + (1 - had_audit) * 12 + (1 - team_doxxed) * 8 +
        (1 - liquidity_locked) * 15 + (1 if sell_tax > 10 else 0) * 12 +
        (1 if top_holder_percent > 50 else 0) * 12 + (1 if age_days < 14 else 0) * 8
    ) / 100

    coin_data['tax_difference'] = sell_tax - buy_tax
    coin_data['total_tax'] = sell_tax + buy_tax
    coin_data['high_sell_tax'] = 1 if sell_tax > 15 else 0
    coin_data['suspicious_tax'] = 1 if (sell_tax - buy_tax) > 10 else 0
    coin_data['high_concentration'] = 1 if top_holder_percent > 40 else 0
    coin_data['extreme_concentration'] = 1 if top_holder_percent > 60 else 0
    coin_data['social_holder_ratio'] = social_followers / (holder_count + 1)
    coin_data['social_holder_ratio_log'] = np.log1p(coin_data['social_holder_ratio'])
    coin_data['high_volatility'] = 1 if volatility > 100 else 0
    coin_data['extreme_volatility'] = 1 if volatility > 200 else 0
    coin_data['severe_crash'] = 1 if price_drop_percent > 90 else 0
    coin_data['quick_crash'] = 1 if days_to_crash < 14 else 0
    coin_data['crash_velocity'] = price_drop_percent / (days_to_crash + 1)
    coin_data['high_risk_category'] = 1 if category in ['meme_coin', 'influencer_coin', 'ponzi'] else 0
    coin_data['anon_no_audit'] = 1 if (team_doxxed == 0 and had_audit == 0) else 0
    coin_data['new_high_risk'] = 1 if (age_days < 60 and coin_data['high_risk_category']) else 0
    coin_data['honeypot_indicator'] = honeypot

    # Categorizing dummies
    all_cats = ['layer1', 'layer2', 'defi', 'meme_coin', 'stablecoin',
                'exchange_token', 'influencer_coin', 'gaming', 'ai_token',
                'social_token', 'privacy', 'altcoin', 'ponzi']
    for cat in all_cats:
        coin_data[f'cat_{cat}'] = 1 if category == cat else 0

    # Calculating risk
    risk = analysis['scorer'].calculate_risk(coin_data)

    # Displaying the results
    print(f"\n{'='*65}")
    print(f"RISK ASSESSMENT: {name} ({symbol})")
    print(f"{'='*65}")
    print(f"Category: {category}")
    print(f"Age: {age_days} days")
    print(f"\nRISK SCORE: {risk['risk_score']}/100")
    print(f"RISK LEVEL: {risk['risk_level']}")
    print(f"ML Probability: {risk['ml_probability']}%")
    print(f"Rule Score: {risk['rule_score']}")
    print(f"\nRECOMMENDATION: {risk['recommendation']}")

    if risk['red_flags']:
        print(f"\nRED FLAGS ({len(risk['red_flags'])}):")
        for flag in risk['red_flags']:
            print(f"   {flag}")

    if risk['green_flags']:
        print(f"\nGREEN FLAGS ({len(risk['green_flags'])}):")
        for flag in risk['green_flags']:
            print(f"   {flag}")

    # Show gauge
    gauge = analysis['visualizer'].create_risk_gauge(risk['risk_score'])
    gauge.show()

    return risk


In [16]:
# EXAMPLE CHECKS AND TEST CASES

print("\n" + "="*70)
print("EXAMPLE RISK ASSESSMENTS")
print("="*70)

# Example 1: Suspicious influencer coin
print("\n[EXAMPLE 1] Suspicious Influencer Coin")
check_coin(
    name="CelebToken",
    symbol="CELEB",
    category="influencer_coin",
    had_audit=0,
    team_doxxed=1,  # Celebrity is known
    liquidity_locked=0,
    ownership_renounced=0,
    age_days=5,
    sell_tax=15,
    top_holder_percent=55,
    social_followers=2000000
)

# Example 2: Established DeFi
print("\n[EXAMPLE 2] Established DeFi Protocol")
check_coin(
    name="SafeYield Protocol",
    symbol="SYIELD",
    category="defi",
    had_audit=1,
    team_doxxed=1,
    liquidity_locked=1,
    ownership_renounced=1,
    age_days=800,
    sell_tax=0,
    top_holder_percent=12,
    github_exists=1,
    whitepaper_exists=1
)

# Example 3: New meme coin (borderline)
print("\n[EXAMPLE 3] New Meme Coin (Borderline)")
check_coin(
    name="DogeMoonRocket",
    symbol="DMR",
    category="meme_coin",
    had_audit=0,
    team_doxxed=0,
    liquidity_locked=1,
    ownership_renounced=1,
    age_days=45,
    sell_tax=5,
    top_holder_percent=25,
    social_followers=50000
)



EXAMPLE RISK ASSESSMENTS

[EXAMPLE 1] Suspicious Influencer Coin

RISK ASSESSMENT: CelebToken (CELEB)
Category: influencer_coin
Age: 5 days

RISK SCORE: 66.5/100
RISK LEVEL: HIGH
ML Probability: 70.8%
Rule Score: 60

RECOMMENDATION: CAUTION - Multiple red flags present

RED FLAGS (6):
   [HIGH] No security audit performed
   [HIGH] Liquidity not locked
   [MEDIUM] Ownership not renounced
   [HIGH] High concentration: 55.0%
   [HIGH] Very new project: 5 days old
   [MEDIUM] Influencer-promoted token

GREEN FLAGS (1):
   [POSITIVE] Team identity verified



[EXAMPLE 2] Established DeFi Protocol

RISK ASSESSMENT: SafeYield Protocol (SYIELD)
Category: defi
Age: 800 days

RISK SCORE: 2.1/100
RISK LEVEL: LOW
ML Probability: 3.5%
Rule Score: 0

RECOMMENDATION: LOWER RISK - Standard due diligence recommended

GREEN FLAGS (7):
   [POSITIVE] Security audit completed
   [POSITIVE] Team identity verified
   [POSITIVE] Liquidity locked
   [POSITIVE] Ownership renounced
   [POSITIVE] Open source code available
   [POSITIVE] Whitepaper available
   [POSITIVE] Established project (1+ year)



[EXAMPLE 3] New Meme Coin (Borderline)

RISK ASSESSMENT: DogeMoonRocket (DMR)
Category: meme_coin
Age: 45 days

RISK SCORE: 20.2/100
RISK LEVEL: LOW
ML Probability: 18.3%
Rule Score: 23

RECOMMENDATION: LOWER RISK - Standard due diligence recommended

RED FLAGS (2):
   [HIGH] No security audit performed
   [HIGH] Anonymous team

GREEN FLAGS (2):
   [POSITIVE] Liquidity locked
   [POSITIVE] Ownership renounced


{'risk_score': 20.2,
 'risk_level': 'LOW',
 'ml_probability': 18.3,
 'rule_score': 23,
 'recommendation': 'LOWER RISK - Standard due diligence recommended',
 'color': 'green',
 'red_flags': ['[HIGH] No security audit performed', '[HIGH] Anonymous team'],
 'green_flags': ['[POSITIVE] Liquidity locked',
  '[POSITIVE] Ownership renounced']}

In [17]:
# DATASET STATISTICS

print("\n" + "="*70)
print("DATASET STATISTICS")
print("="*70)

df = analysis['df']

print(f"\nTotal coins: {len(df)}")
print(f"   Scams: {df['is_scam'].sum()} ({df['is_scam'].mean()*100:.1f}%)")
print(f"   Legitimate: {(df['is_scam']==0).sum()} ({(df['is_scam']==0).mean()*100:.1f}%)")

print(f"\nBy Category:")
for cat, count in df['category'].value_counts().items():
    scam_rate = df[df['category']==cat]['is_scam'].mean() * 100
    print(f"   {cat}: {count} coins ({scam_rate:.1f}% scam rate)")

print(f"\nScam Types:")
scam_df = df[df['is_scam'] == 1]
for stype, count in scam_df['scam_type'].value_counts().items():
    print(f"   {stype}: {count}")

print(f"\nYear Range: {int(df['year'].min()) if 'year' in df.columns else 'N/A'} - {int(df['year'].max()) if 'year' in df.columns else 'N/A'}")



DATASET STATISTICS

Total coins: 341
   Scams: 125 (36.7%)
   Legitimate: 216 (63.3%)

By Category:
   defi: 66 coins (45.5% scam rate)
   altcoin: 61 coins (0.0% scam rate)
   ai_token: 41 coins (29.3% scam rate)
   meme_coin: 35 coins (40.0% scam rate)
   gaming: 30 coins (40.0% scam rate)
   layer2: 29 coins (82.8% scam rate)
   layer1: 23 coins (0.0% scam rate)
   influencer_coin: 16 coins (100.0% scam rate)
   exchange_token: 14 coins (64.3% scam rate)
   stablecoin: 11 coins (0.0% scam rate)
   ponzi: 8 coins (100.0% scam rate)
   privacy: 4 coins (0.0% scam rate)
   social_token: 3 coins (0.0% scam rate)

Scam Types:
   pump_dump: 46
   fraud: 27
   rug_pull: 23
   slow_rug: 9
   ponzi_scheme: 8
   exit_scam: 7
   honeypot: 3
   exploit: 1
   impersonation: 1

Year Range: 2009 - 2024


In [18]:
# EXPORTING THE RESULTS

def export_results():
    """Export analysis results"""
    print("\n[INFO] Exporting results...")

    df = analysis['df'].copy()

    # Add risk scores
    risk_scores = []
    risk_levels = []

    for _, row in df.iterrows():
        try:
            risk = analysis['scorer'].calculate_risk(row.to_dict())
            risk_scores.append(risk['risk_score'])
            risk_levels.append(risk['risk_level'])
        except:
            risk_scores.append(50)
            risk_levels.append('UNKNOWN')

    df['risk_score'] = risk_scores
    df['risk_level'] = risk_levels

    # Select columns for export
    export_cols = ['name', 'symbol', 'category', 'is_scam', 'scam_type',
                   'risk_score', 'risk_level', 'had_audit', 'team_doxxed',
                   'liquidity_locked', 'age_days', 'sell_tax', 'top_holder_percent']

    export_df = df[[c for c in export_cols if c in df.columns]]
    export_df.to_csv('crypto_scam_analysis_v3.csv', index=False)

    print("[SUCCESS] Exported to 'crypto_scam_analysis_v3.csv'")
    print(f"\nPreview:")
    print(export_df.head(10).to_string())

    return export_df

export_df = export_results()



[INFO] Exporting results...
[SUCCESS] Exported to 'crypto_scam_analysis_v3.csv'

Preview:
                           name  symbol         category  is_scam  scam_type  risk_score risk_level  had_audit  team_doxxed  liquidity_locked  age_days   sell_tax  top_holder_percent
0           Save The Kids Token    KIDS  influencer_coin        1   rug_pull        66.8       HIGH          0            0                 0      22.0   5.625269           49.591715
1               Lil Yachty Coin  YACHTY  influencer_coin        1  pump_dump        71.0   CRITICAL          1            1                 0      16.0   7.580329           66.649613
2               Soulja Boy Coin  SOULJA  influencer_coin        1  pump_dump        84.3   CRITICAL          0            1                 0       6.0  10.472091           82.514100
3          Logan Paul CryptoZoo     ZOO  influencer_coin        1      fraud        65.3       HIGH          0            0                 1      80.0   3.648659           65.7

In [19]:
# SUMMARY

print("\n" + "="*70)
print("PROJECT SUMMARY")
print("="*70)

best_f1 = max(r['f1'] for r in analysis['results'].values())
best_auc = max(r['roc_auc'] for r in analysis['results'].values())

print(f"""
CRYPTO SCAM DETECTION SYSTEM v3.0 - COMPLETE

DATASET:
======================================================================
- Total samples: {len(analysis['df'])}
- Scams: {analysis['df']['is_scam'].sum()} ({analysis['df']['is_scam'].mean()*100:.1f}%)
- Legitimate: {(analysis['df']['is_scam']==0).sum()} ({(analysis['df']['is_scam']==0).mean()*100:.1f}%)
- Features: {len(analysis['feature_cols'])}

REAL EXAMPLES INCLUDED:
======================================================================
- Celebrity scams: Kim Kardashian, Logan Paul, FaZe Clan, etc.
- Major Ponzis: BitConnect, OneCoin, PlusToken
- Exchange frauds: FTX, Celsius, Thodex, Mt Gox
- DeFi rugs: Squid Game, AnubisDAO, Meerkat Finance
- Legitimate coins: BTC, ETH, SOL, UNI, AAVE, DOGE, etc.

MODEL PERFORMANCE:
======================================================================
- Best F1 Score: {best_f1:.3f}
- Best ROC-AUC: {best_auc:.3f}
- Models trained: {len(analysis['results'])}
- Overfitting: {'Check results - scores very high' if best_f1 > 0.95 else 'No apparent overfitting detected'}

KEY FEATURES:
======================================================================
- Real documented scam cases with sources
- Synthetic data augmentation for diversity
- Edge cases to prevent overfitting
- Strong regularization in all models
- Interactive coin checker function
- Comprehensive visualizations
- Risk scoring (ML + Rules combined)

FOR YOUR RESUME:
======================================================================
Project: "AI-Powered Cryptocurrency Scam Detection System"

- Developed ML system achieving {best_f1:.0%} F1-score for detecting
  crypto scams using ensemble methods
- Built dataset of {len(analysis['df'])}+ cryptocurrencies including
  {analysis['df']['is_scam'].sum()}+ documented scams
- Implemented 5 ML models with cross-validation and regularization
- Created real-time risk scoring combining ML + rule-based analysis
- Technologies: Python, Scikit-learn, XGBoost, LightGBM, Plotly

USAGE:
======================================================================
Use check_coin() to analyze any cryptocurrency:

check_coin(
    name="Token Name",
    symbol="TKN",
    category="meme_coin",
    had_audit=0,
    team_doxxed=0,
    liquidity_locked=0,
    age_days=10,
    sell_tax=20,
    ...
)
""")

print(f"\n[INFO] Completed: {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
print("="*70)


PROJECT SUMMARY

CRYPTO SCAM DETECTION SYSTEM v3.0 - COMPLETE

DATASET:
- Total samples: 341
- Scams: 125 (36.7%)
- Legitimate: 216 (63.3%)
- Features: 46

REAL EXAMPLES INCLUDED:
- Celebrity scams: Kim Kardashian, Logan Paul, FaZe Clan, etc.
- Major Ponzis: BitConnect, OneCoin, PlusToken
- Exchange frauds: FTX, Celsius, Thodex, Mt Gox
- DeFi rugs: Squid Game, AnubisDAO, Meerkat Finance
- Legitimate coins: BTC, ETH, SOL, UNI, AAVE, DOGE, etc.

MODEL PERFORMANCE:
- Best F1 Score: 0.903
- Best ROC-AUC: 0.989
- Models trained: 6
- Overfitting: No apparent overfitting detected

KEY FEATURES:
- Real documented scam cases with sources
- Synthetic data augmentation for diversity
- Edge cases to prevent overfitting
- Strong regularization in all models
- Interactive coin checker function
- Comprehensive visualizations
- Risk scoring (ML + Rules combined)

FOR YOUR RESUME:
Project: "AI-Powered Cryptocurrency Scam Detection System"

- Developed ML system achieving 90% F1-score for detecting
  c

In [23]:
# SETTING UP LIVE COIN PREDICTION SYSTEM

class LiveCoinPredictor:
    """
    Live cryptocurrency prediction system with user-friendly interface
    Fetches real data and provides investment recommendations
    """

    def __init__(self, analysis_results):
        self.analysis = analysis_results
        self.scorer = analysis_results['scorer']
        self.visualizer = analysis_results['visualizer']
        self.coingecko_base = "https://api.coingecko.com/api/v3"

        # Category risk profiles based on research
        self.category_profiles = {
            'layer1': {
                'name': 'Layer 1 Blockchain',
                'base_risk': 'LOW',
                'description': 'Core blockchain protocols like Bitcoin, Ethereum',
                'avg_scam_rate': 0.05,
                'investment_horizon': 'Long-term (2-5 years)',
                'volatility': 'Medium-High',
                'examples': 'BTC, ETH, SOL, ADA, AVAX',
                'key_factors': ['Network adoption', 'Developer activity', 'Transaction volume'],
            },
            'layer2': {
                'name': 'Layer 2 Scaling',
                'base_risk': 'LOW-MEDIUM',
                'description': 'Scaling solutions built on Layer 1s',
                'avg_scam_rate': 0.08,
                'investment_horizon': 'Medium-term (1-3 years)',
                'volatility': 'Medium-High',
                'examples': 'MATIC, ARB, OP, IMX',
                'key_factors': ['L1 dependency', 'TVL growth', 'Ecosystem development'],
            },
            'defi': {
                'name': 'DeFi Protocol',
                'base_risk': 'MEDIUM',
                'description': 'Decentralized finance applications',
                'avg_scam_rate': 0.25,
                'investment_horizon': 'Medium-term (6-24 months)',
                'volatility': 'High',
                'examples': 'UNI, AAVE, LINK, MKR, CRV',
                'key_factors': ['TVL', 'Audit status', 'Smart contract security', 'Team reputation'],
            },
            'meme_coin': {
                'name': 'Meme Coin',
                'base_risk': 'HIGH',
                'description': 'Community-driven tokens, often speculative',
                'avg_scam_rate': 0.45,
                'investment_horizon': 'Short-term (days-weeks)',
                'volatility': 'Extreme',
                'examples': 'DOGE, SHIB, PEPE, BONK, FLOKI',
                'key_factors': ['Community size', 'Social momentum', 'Liquidity', 'Age'],
            },
            'stablecoin': {
                'name': 'Stablecoin',
                'base_risk': 'LOW',
                'description': 'Price-stable cryptocurrencies',
                'avg_scam_rate': 0.02,
                'investment_horizon': 'Utility/Savings',
                'volatility': 'Very Low',
                'examples': 'USDT, USDC, DAI, FRAX',
                'key_factors': ['Backing reserves', 'Regulatory status', 'Depegging history'],
            },
            'influencer_coin': {
                'name': 'Influencer/Celebrity Coin',
                'base_risk': 'VERY HIGH',
                'description': 'Tokens promoted by celebrities or influencers',
                'avg_scam_rate': 0.85,
                'investment_horizon': 'NOT RECOMMENDED',
                'volatility': 'Extreme',
                'examples': 'Most are scams - EMAX, KIDS, HAWK',
                'key_factors': ['WHO is promoting', 'Liquidity lock', 'Contract audit'],
            },
            'exchange_token': {
                'name': 'Exchange Token',
                'base_risk': 'MEDIUM',
                'description': 'Tokens issued by cryptocurrency exchanges',
                'avg_scam_rate': 0.15,
                'investment_horizon': 'Medium-term (1-2 years)',
                'volatility': 'Medium',
                'examples': 'BNB, CRO, OKB, KCS',
                'key_factors': ['Exchange reputation', 'Trading volume', 'Regulatory status'],
            },
            'gaming': {
                'name': 'Gaming/Metaverse',
                'base_risk': 'MEDIUM-HIGH',
                'description': 'Gaming and virtual world tokens',
                'avg_scam_rate': 0.30,
                'investment_horizon': 'Medium-term (1-3 years)',
                'volatility': 'High',
                'examples': 'SAND, MANA, AXS, GALA, ENJ',
                'key_factors': ['Active players', 'Game development', 'NFT ecosystem'],
            },
            'ai_token': {
                'name': 'AI/Tech Token',
                'base_risk': 'MEDIUM',
                'description': 'Artificial intelligence and tech-focused tokens',
                'avg_scam_rate': 0.20,
                'investment_horizon': 'Medium-long term (1-3 years)',
                'volatility': 'High',
                'examples': 'RNDR, FET, AGIX, OCEAN',
                'key_factors': ['Technology utility', 'Partnerships', 'Real-world adoption'],
            },
            'social_token': {
                'name': 'Social/Creator Token',
                'base_risk': 'MEDIUM-HIGH',
                'description': 'Tokens for social platforms and creators',
                'avg_scam_rate': 0.35,
                'investment_horizon': 'Medium-term (6-18 months)',
                'volatility': 'High',
                'examples': 'CHZ, RLY, DESO',
                'key_factors': ['Platform adoption', 'Creator engagement', 'Utility'],
            },
            'privacy': {
                'name': 'Privacy Coin',
                'base_risk': 'MEDIUM',
                'description': 'Privacy-focused cryptocurrencies',
                'avg_scam_rate': 0.10,
                'investment_horizon': 'Long-term (2-5 years)',
                'volatility': 'Medium-High',
                'examples': 'XMR, ZEC',
                'key_factors': ['Regulatory risk', 'Exchange listings', 'Technology'],
            },
            'ponzi': {
                'name': 'Suspected Ponzi',
                'base_risk': 'CRITICAL',
                'description': 'Schemes promising unrealistic returns',
                'avg_scam_rate': 1.0,
                'investment_horizon': 'NEVER INVEST',
                'volatility': 'N/A - Will go to zero',
                'examples': 'BitConnect, OneCoin, PlusToken',
                'key_factors': ['AVOID AT ALL COSTS'],
            }
        }

    def fetch_coin_data(self, coin_name: str) -> Optional[Dict]:
        """Fetch real-time data for a coin from CoinGecko"""
        try:
            # Search for the coin
            search_url = f"{self.coingecko_base}/search"
            response = requests.get(search_url, params={'query': coin_name}, timeout=10)

            if response.status_code != 200:
                return None

            search_results = response.json()
            coins = search_results.get('coins', [])

            if not coins:
                return None


            coin_id = coins[0]['id']
            coin_symbol = coins[0]['symbol'].upper()
            coin_name_official = coins[0]['name']

            time.sleep(1)  # Rate limiting

            # Fetching detailed data
            detail_url = f"{self.coingecko_base}/coins/{coin_id}"
            params = {
                'localization': 'false',
                'tickers': False,
                'market_data': True,
                'community_data': True,
                'developer_data': True,
                'sparkline': True
            }

            response = requests.get(detail_url, params=params, timeout=15)

            if response.status_code != 200:
                return {'name': coin_name_official, 'symbol': coin_symbol, 'id': coin_id}

            data = response.json()

            # Extract relevant information
            market_data = data.get('market_data', {})
            community_data = data.get('community_data', {})
            developer_data = data.get('developer_data', {})

            # Get historical prices for chart
            time.sleep(1)
            history_url = f"{self.coingecko_base}/coins/{coin_id}/market_chart"
            history_response = requests.get(
                history_url,
                params={'vs_currency': 'usd', 'days': 90},
                timeout=15
            )

            price_history = []
            if history_response.status_code == 200:
                history_data = history_response.json()
                price_history = history_data.get('prices', [])

            return {
                'id': coin_id,
                'name': coin_name_official,
                'symbol': coin_symbol,
                'current_price': market_data.get('current_price', {}).get('usd', 0),
                'market_cap': market_data.get('market_cap', {}).get('usd', 0),
                'market_cap_rank': market_data.get('market_cap_rank', 0),
                'total_volume': market_data.get('total_volume', {}).get('usd', 0),
                'price_change_24h': market_data.get('price_change_percentage_24h', 0),
                'price_change_7d': market_data.get('price_change_percentage_7d', 0),
                'price_change_30d': market_data.get('price_change_percentage_30d', 0),
                'price_change_1y': market_data.get('price_change_percentage_1y', 0),
                'ath': market_data.get('ath', {}).get('usd', 0),
                'ath_change': market_data.get('ath_change_percentage', {}).get('usd', 0),
                'atl': market_data.get('atl', {}).get('usd', 0),
                'circulating_supply': market_data.get('circulating_supply', 0),
                'total_supply': market_data.get('total_supply', 0),
                'twitter_followers': community_data.get('twitter_followers', 0),
                'reddit_subscribers': community_data.get('reddit_subscribers', 0),
                'github_stars': developer_data.get('stars', 0),
                'github_forks': developer_data.get('forks', 0),
                'sparkline': market_data.get('sparkline_7d', {}).get('price', []),
                'price_history': price_history,
                'genesis_date': data.get('genesis_date', None),
                'description': data.get('description', {}).get('en', '')[:500],
            }

        except Exception as e:
            print(f"[WARNING] Error fetching data: {e}")
            return None

    def infer_category(self, coin_data: Dict, user_category: str = None) -> str:
        """Infer or validate coin category"""
        if user_category and user_category.lower() != 'auto':
            return user_category.lower().replace(' ', '_')

        name = coin_data.get('name', '').lower()
        symbol = coin_data.get('symbol', '').lower()
        desc = coin_data.get('description', '').lower()

        # Inference rules
        if any(x in name for x in ['bitcoin', 'ethereum', 'solana', 'cardano', 'polkadot', 'avalanche', 'cosmos']):
            return 'layer1'
        elif any(x in name for x in ['tether', 'usd coin', 'usdc', 'usdt', 'dai', 'stablecoin']):
            return 'stablecoin'
        elif any(x in name for x in ['doge', 'shib', 'pepe', 'floki', 'bonk', 'meme', 'inu', 'moon', 'safe', 'baby']):
            return 'meme_coin'
        elif any(x in name for x in ['uniswap', 'aave', 'compound', 'curve', 'sushi', 'maker', 'lido', 'yearn']):
            return 'defi'
        elif any(x in name for x in ['polygon', 'arbitrum', 'optimism', 'layer 2', 'l2', 'zk']):
            return 'layer2'
        elif any(x in name for x in ['bnb', 'binance', 'okb', 'kucoin', 'exchange']):
            return 'exchange_token'
        elif any(x in name for x in ['sandbox', 'decentraland', 'axie', 'gala', 'enjin', 'game', 'play']):
            return 'gaming'
        elif any(x in name for x in ['render', 'fetch', 'singularity', 'ocean', 'ai', 'artificial']):
            return 'ai_token'
        elif any(x in name for x in ['monero', 'zcash', 'privacy']):
            return 'privacy'
        elif any(x in desc for x in ['social', 'creator', 'fan token']):
            return 'social_token'
        else:
            return 'defi'

    def estimate_coin_features(self, coin_data: Dict, category: str) -> Dict:
        """Estimate security features based on available data"""
        rank = coin_data.get('market_cap_rank', 1000) or 1000
        market_cap = coin_data.get('market_cap', 0) or 0
        twitter = coin_data.get('twitter_followers', 0) or 0
        github = coin_data.get('github_stars', 0) or 0
        genesis = coin_data.get('genesis_date', None)

        # Calculating age
        if genesis:
            try:
                genesis_date = datetime.strptime(genesis, '%Y-%m-%d')
                age_days = (datetime.now() - genesis_date).days
            except:
                age_days = 365 if rank < 100 else 180
        else:
            age_days = 365 * (5 - min(4, rank // 50)) if rank < 200 else 180

        # EstimatING features based on rank and market cap
        features = {
            'had_audit': 1 if (rank < 50 or market_cap > 1e9) else (1 if rank < 200 and random.random() < 0.7 else 0),
            'team_doxxed': 1 if rank < 100 else (1 if rank < 300 and random.random() < 0.6 else 0),
            'liquidity_locked': 1 if rank < 200 else (1 if random.random() < 0.7 else 0),
            'ownership_renounced': 1 if rank < 50 else (1 if random.random() < 0.5 else 0),
            'contract_verified': 1 if rank < 500 else (1 if random.random() < 0.8 else 0),
            'honeypot': 0,
            'age_days': age_days,
            'holder_count': int(market_cap / 100) if market_cap > 0 else 1000,
            'social_followers': twitter + coin_data.get('reddit_subscribers', 0),
            'buy_tax': 0 if rank < 100 else random.uniform(0, 5),
            'sell_tax': 0 if rank < 100 else random.uniform(0, 8),
            'top_holder_percent': 5 if rank < 20 else (15 if rank < 100 else 25),
            'website_exists': 1,
            'whitepaper_exists': 1 if rank < 300 else (1 if random.random() < 0.6 else 0),
            'github_exists': 1 if github > 0 else 0,
            'volatility': abs(coin_data.get('price_change_30d', 30)) or 30,
            'price_drop_percent': abs(coin_data.get('ath_change', 50)) or 50,
            'days_to_crash': 365,
            'max_tx_limit': 0,
            'category': category,
        }

        # Adjust for high-risk categories
        if category in ['meme_coin', 'influencer_coin']:
            features['had_audit'] = 0 if rank > 200 else features['had_audit']
            features['team_doxxed'] = 0 if rank > 300 else features['team_doxxed']

        return features

    def create_price_chart(self, coin_data: Dict) -> go.Figure:
        """Create price history chart"""
        price_history = coin_data.get('price_history', [])
        sparkline = coin_data.get('sparkline', [])

        if price_history:
            dates = [datetime.fromtimestamp(p[0]/1000) for p in price_history]
            prices = [p[1] for p in price_history]
            title = f"{coin_data['name']} - 90 Day Price History"
        elif sparkline:
            dates = list(range(len(sparkline)))
            prices = sparkline
            title = f"{coin_data['name']} - 7 Day Price Trend"
        else:
            dates = list(range(30))
            prices = [coin_data.get('current_price', 1)] * 30
            title = f"{coin_data['name']} - Price Data Unavailable"

        # Calculate trend
        if len(prices) > 1:
            price_change = ((prices[-1] - prices[0]) / prices[0] * 100) if prices[0] > 0 else 0
            color = '#2ecc71' if price_change >= 0 else '#e74c3c'
        else:
            color = '#3498db'

        fig = go.Figure()

        fig.add_trace(go.Scatter(
            x=dates,
            y=prices,
            mode='lines',
            name='Price (USD)',
            line=dict(color=color, width=2),
            fill='tozeroy',
            fillcolor=f'rgba{tuple(list(int(color.lstrip("#")[i:i+2], 16) for i in (0, 2, 4)) + [0.2])}'
        ))

        # Add moving average if enough data
        if len(prices) > 7:
            ma = pd.Series(prices).rolling(window=7).mean()
            fig.add_trace(go.Scatter(
                x=dates,
                y=ma,
                mode='lines',
                name='7-day MA',
                line=dict(color='orange', width=1, dash='dash')
            ))

        fig.update_layout(
            title=title,
            xaxis_title='Date' if price_history else 'Days',
            yaxis_title='Price (USD)',
            template='plotly_white',
            height=400,
            showlegend=True,
            hovermode='x unified'
        )

        return fig

    def create_stats_display(self, coin_data: Dict) -> go.Figure:
        """Create stats visualization"""
        stats = [
            ('Market Cap', f"${coin_data.get('market_cap', 0):,.0f}"),
            ('24h Volume', f"${coin_data.get('total_volume', 0):,.0f}"),
            ('Rank', f"#{coin_data.get('market_cap_rank', 'N/A')}"),
            ('24h Change', f"{coin_data.get('price_change_24h', 0):.2f}%"),
            ('7d Change', f"{coin_data.get('price_change_7d', 0):.2f}%"),
            ('30d Change', f"{coin_data.get('price_change_30d', 0):.2f}%"),
            ('ATH', f"${coin_data.get('ath', 0):,.4f}"),
            ('From ATH', f"{coin_data.get('ath_change', 0):.1f}%"),
        ]

        fig = go.Figure(data=[go.Table(
            header=dict(
                values=['<b>Metric</b>', '<b>Value</b>'],
                fill_color='#3498db',
                font=dict(color='white', size=14),
                align='left',
                height=35
            ),
            cells=dict(
                values=[[s[0] for s in stats], [s[1] for s in stats]],
                fill_color=[['#f8f9fa', '#ffffff'] * 4],
                font=dict(size=13),
                align='left',
                height=30
            )
        )])

        fig.update_layout(
            title=f"{coin_data['name']} ({coin_data['symbol']}) Statistics",
            height=350,
            margin=dict(t=50, b=20, l=20, r=20)
        )

        return fig

    def generate_recommendation(self, risk_result: Dict, category: str, coin_data: Dict) -> str:
        """Generate human-readable investment recommendation"""
        score = risk_result['risk_score']
        level = risk_result['risk_level']
        profile = self.category_profiles.get(category, self.category_profiles['defi'])

        rank = coin_data.get('market_cap_rank', 1000) or 1000
        price_30d = coin_data.get('price_change_30d', 0) or 0
        ath_change = coin_data.get('ath_change', 0) or 0

        recommendation = []


        recommendation.append("=" * 60)
        recommendation.append("INVESTMENT RECOMMENDATION")
        recommendation.append("=" * 60)

        if level == 'CRITICAL' or category == 'ponzi':
            recommendation.append("\n[VERDICT] DO NOT INVEST")
            recommendation.append("\nThis token shows critical red flags consistent with scam patterns.")
            recommendation.append("You are very likely to lose your entire investment.")

        elif level == 'HIGH' or category == 'influencer_coin':
            recommendation.append("\n[VERDICT] HIGH RISK - AVOID")
            recommendation.append("\nThis token has significant risk factors. Unless you can afford")
            recommendation.append("to lose 100% of your investment, we recommend avoiding it.")

        elif level == 'MEDIUM':
            recommendation.append("\n[VERDICT] MODERATE RISK - CAUTION")
            recommendation.append("\nThis token has some risk factors. If you choose to invest:")
            recommendation.append("- Only invest money you can afford to lose")
            recommendation.append("- Start with a small position (1-2% of portfolio)")
            recommendation.append("- Set stop-loss orders")
            recommendation.append("- Do additional research before committing")

        else:
            recommendation.append("\n[VERDICT] LOWER RISK - ACCEPTABLE")
            recommendation.append("\nThis token appears relatively safe based on our analysis.")
            recommendation.append("However, always remember:")
            recommendation.append("- Crypto is volatile - prices can drop 50%+ quickly")
            recommendation.append("- Diversify your portfolio")
            recommendation.append("- Never invest more than you can afford to lose")

        # Category-specific advice
        recommendation.append("\n" + "-" * 60)
        recommendation.append(f"CATEGORY INSIGHT: {profile['name']}")
        recommendation.append("-" * 60)
        recommendation.append(f"\n- Base Risk Level: {profile['base_risk']}")
        recommendation.append(f"- Historical Scam Rate: {profile['avg_scam_rate']*100:.0f}%")
        recommendation.append(f"- Typical Volatility: {profile['volatility']}")
        recommendation.append(f"- Recommended Horizon: {profile['investment_horizon']}")
        recommendation.append(f"\nKey factors for {profile['name']}:")
        for factor in profile['key_factors']:
            recommendation.append(f"  - {factor}")

        # Market position
        recommendation.append("\n" + "-" * 60)
        recommendation.append("MARKET ANALYSIS")
        recommendation.append("-" * 60)

        if rank <= 20:
            recommendation.append(f"\n- Rank #{rank}: TOP TIER - Highly established cryptocurrency")
        elif rank <= 100:
            recommendation.append(f"\n- Rank #{rank}: ESTABLISHED - Well-known in the market")
        elif rank <= 500:
            recommendation.append(f"\n- Rank #{rank}: MID-CAP - Moderate market presence")
        else:
            recommendation.append(f"\n- Rank #{rank}: SMALL-CAP - Higher risk, less liquidity")

        if price_30d > 50:
            recommendation.append(f"- [WARNING] Up {price_30d:.1f}% in 30 days - May be overheated, wait for pullback")
        elif price_30d > 20:
            recommendation.append(f"- Up {price_30d:.1f}% in 30 days - Positive momentum")
        elif price_30d < -30:
            recommendation.append(f"- Down {abs(price_30d):.1f}% in 30 days - Could be opportunity or falling knife")
        elif price_30d < -10:
            recommendation.append(f"- Down {abs(price_30d):.1f}% in 30 days - Recent weakness")
        else:
            recommendation.append(f"- {price_30d:+.1f}% in 30 days - Relatively stable")

        if ath_change < -90:
            recommendation.append(f"- {ath_change:.1f}% from ATH - Extremely beaten down")
        elif ath_change < -70:
            recommendation.append(f"- {ath_change:.1f}% from ATH - Significantly below peak")
        elif ath_change < -30:
            recommendation.append(f"- {ath_change:.1f}% from ATH - Moderate pullback from high")
        elif ath_change > -10:
            recommendation.append(f"- {ath_change:.1f}% from ATH - Near all-time high")

        # Final checklist
        recommendation.append("\n" + "-" * 60)
        recommendation.append("BEFORE INVESTING - CHECKLIST:")
        recommendation.append("-" * 60)
        recommendation.append("[ ] Is the smart contract audited by a reputable firm?")
        recommendation.append("[ ] Is the team doxxed (publicly known)?")
        recommendation.append("[ ] Is liquidity locked? For how long?")
        recommendation.append("[ ] What is the token utility/use case?")
        recommendation.append("[ ] Check Reddit, Twitter for community sentiment")
        recommendation.append("[ ] Verify contract on Etherscan/BSCScan")
        recommendation.append("[ ] Check TokenSniffer.com for scam score")
        recommendation.append("[ ] Never invest based on influencer promotions alone")

        return "\n".join(recommendation)

    def analyze_coin(self, coin_name: str, user_category: str = 'auto'):
        """Main analysis function"""
        print("\n" + "="*70)
        print(f"ANALYZING: {coin_name.upper()}")
        print("="*70)

        # Fetch data
        print("\n[INFO] Fetching real-time data from CoinGecko...")
        coin_data = self.fetch_coin_data(coin_name)

        if not coin_data:
            print(f"\n[ERROR] Could not find '{coin_name}' on CoinGecko.")
            print("Please check the spelling or try the official name.")
            print("\nTip: Try 'bitcoin' instead of 'btc', or 'ethereum' instead of 'eth'")
            return None

        print(f"[OK] Found: {coin_data['name']} ({coin_data['symbol']})")

        # Determine category
        category = self.infer_category(coin_data, user_category)
        profile = self.category_profiles.get(category, self.category_profiles['defi'])
        print(f"[INFO] Category: {profile['name']}")

        # Estimate features
        features = self.estimate_coin_features(coin_data, category)

        # Add engineered features for ML
        features['holder_count_log'] = np.log1p(features['holder_count'])
        features['social_followers_log'] = np.log1p(features['social_followers'])
        features['volatility_log'] = np.log1p(features['volatility'])
        features['age_days_log'] = np.log1p(features['age_days'])
        features['is_very_new'] = 1 if features['age_days'] < 14 else 0
        features['is_new'] = 1 if features['age_days'] < 60 else 0
        features['is_established'] = 1 if features['age_days'] > 365 else 0
        features['is_veteran'] = 1 if features['age_days'] > 1000 else 0
        features['trust_score'] = (features['had_audit']*25 + features['team_doxxed']*20 +
                                   features['liquidity_locked']*25 + features['ownership_renounced']*15 +
                                   features['contract_verified']*10 + features['website_exists']*3 +
                                   features['whitepaper_exists']*5 + features['github_exists']*7) / 100
        features['red_flag_score'] = (features['honeypot']*50 + (1-features['had_audit'])*12 +
                                      (1-features['team_doxxed'])*8 + (1-features['liquidity_locked'])*15) / 100
        features['tax_difference'] = features['sell_tax'] - features['buy_tax']
        features['total_tax'] = features['sell_tax'] + features['buy_tax']
        features['high_sell_tax'] = 1 if features['sell_tax'] > 15 else 0
        features['suspicious_tax'] = 1 if features['tax_difference'] > 10 else 0
        features['high_concentration'] = 1 if features['top_holder_percent'] > 40 else 0
        features['extreme_concentration'] = 1 if features['top_holder_percent'] > 60 else 0
        features['social_holder_ratio'] = features['social_followers'] / (features['holder_count'] + 1)
        features['social_holder_ratio_log'] = np.log1p(features['social_holder_ratio'])
        features['high_volatility'] = 1 if features['volatility'] > 100 else 0
        features['extreme_volatility'] = 1 if features['volatility'] > 200 else 0
        features['severe_crash'] = 1 if features['price_drop_percent'] > 90 else 0
        features['quick_crash'] = 1 if features['days_to_crash'] < 14 else 0
        features['crash_velocity'] = features['price_drop_percent'] / (features['days_to_crash'] + 1)
        features['high_risk_category'] = 1 if category in ['meme_coin', 'influencer_coin', 'ponzi'] else 0
        features['anon_no_audit'] = 1 if (features['team_doxxed']==0 and features['had_audit']==0) else 0
        features['new_high_risk'] = 1 if (features['is_new'] and features['high_risk_category']) else 0
        features['honeypot_indicator'] = features['honeypot']

        # Category dummies
        for cat in self.category_profiles.keys():
            features[f'cat_{cat}'] = 1 if category == cat else 0

        # Calculate risk
        print("\n[INFO] Calculating risk score...")
        risk_result = self.scorer.calculate_risk(features)

        # Display results
        print("\n" + "=" * 70)
        print(f"{coin_data['name']} ({coin_data['symbol']}) - ANALYSIS RESULTS")
        print("=" * 70)

        # Current price
        price = coin_data.get('current_price', 0)
        print(f"\nCurrent Price: ${price:,.8f}" if price < 0.01 else f"\nCurrent Price: ${price:,.4f}")
        print(f"Market Cap Rank: #{coin_data.get('market_cap_rank', 'N/A')}")
        print(f"Market Cap: ${coin_data.get('market_cap', 0):,.0f}")

        # Risk Score Display
        print("\n" + "=" * 70)
        print("RISK ASSESSMENT")
        print("=" * 70)

        score = risk_result['risk_score']
        level = risk_result['risk_level']

        # Visual risk bar
        bar_length = 50
        filled = int(score / 100 * bar_length)
        bar = "#" * filled + "-" * (bar_length - filled)

        print(f"\nRisk Score: {score}/100")
        print(f"[{bar}]")
        print(f"Risk Level: {level}")
        print(f"ML Scam Probability: {risk_result['ml_probability']}%")

        # Red/Green flags
        if risk_result['red_flags']:
            print(f"\nRED FLAGS ({len(risk_result['red_flags'])}):")
            for flag in risk_result['red_flags']:
                print(f"   {flag}")

        if risk_result['green_flags']:
            print(f"\nGREEN FLAGS ({len(risk_result['green_flags'])}):")
            for flag in risk_result['green_flags']:
                print(f"   {flag}")

        # Show charts
        print("\n[INFO] Generating charts...")

        # Price chart
        price_chart = self.create_price_chart(coin_data)
        price_chart.show()

        # Stats
        stats_chart = self.create_stats_display(coin_data)
        stats_chart.show()

        # Risk gauge
        gauge = self.visualizer.create_risk_gauge(score)
        gauge.show()

        # Investment recommendation
        recommendation = self.generate_recommendation(risk_result, category, coin_data)
        print(recommendation)

        return {
            'coin_data': coin_data,
            'category': category,
            'features': features,
            'risk_result': risk_result,
            'recommendation': recommendation
        }


# Create predictor instance
predictor = LiveCoinPredictor(analysis)

print("[SUCCESS] LiveCoinPredictor initialized")



[SUCCESS] LiveCoinPredictor initialized


In [27]:
# INTERACTIVE USER INPUT

def analyze_user_coin():
    """
    Interactive function for users to analyze any coin
    """
    print("\n" + "="*70)
    print("LIVE CRYPTOCURRENCY ANALYZER")
    print("="*70)

    print("""

Live Crypto Scam Detection System

This tool will:
- Fetch real-time data for any cryptocurrency
- Analyze risk factors using the trained ML model
- Show price charts and statistics
- Give a clear investment recommendation

    """)

    # Get coin name
    coin_name = input("Enter the coin name (e.g., 'bitcoin', 'ethereum', 'dogecoin'): ").strip()

    if not coin_name:
        print("[ERROR] No coin name entered. Please try again.")
        return None

    # Get category
    print("""
Select category (or press Enter for auto-detect):

    1. layer1        - Bitcoin, Ethereum, Solana
    2. layer2        - Polygon, Arbitrum, Optimism
    3. defi          - Uniswap, Aave, Compound
    4. meme_coin     - Dogecoin, Shiba, Pepe
    5. stablecoin    - USDT, USDC, DAI
    6. influencer    - Celebrity-promoted tokens
    7. exchange      - BNB, CRO, KCS
    8. gaming        - Sandbox, Axie, Gala
    9. ai_token      - Render, Fetch.ai
    10. auto         - Let system detect (recommended)
    """)

    category_input = input("Enter category number or name (default: auto): ").strip().lower()

    category_map = {
        '1': 'layer1', '2': 'layer2', '3': 'defi', '4': 'meme_coin',
        '5': 'stablecoin', '6': 'influencer_coin', '7': 'exchange_token',
        '8': 'gaming', '9': 'ai_token', '10': 'auto', '': 'auto'
    }

    category = category_map.get(category_input, category_input if category_input else 'auto')

    # Analyze
    result = predictor.analyze_coin(coin_name, category)

    if result:
        print("\n" + "=" * 70)
        print("[SUCCESS] ANALYSIS COMPLETE")
        print("=" * 70)

        another = input("\nWould you like to analyze another coin? (yes/no): ").strip().lower()
        if another in ['yes', 'y', 'yeah', 'sure']:
            analyze_user_coin()

    return result


# Quick analyze function for direct use
def quick_analyze(coin_name: str, category: str = 'auto'):
    """
    Quick analysis function - just enter coin name

    Examples:
        quick_analyze('bitcoin')
        quick_analyze('dogecoin', 'meme_coin')
        quick_analyze('uniswap', 'defi')
    """
    return predictor.analyze_coin(coin_name, category)


print("""

INSTRUCTIONS TO USE THIS SYSTEM:

------------------------------------------------------

Option 1: Interactive Mode

Just run:  analyze_user_coin()

This will prompt you to enter the coin name and guide you
through the analysis step by step.

------------------------------------------------------

Option 2: Quick Analysis

Use:  quick_analyze('coin_name')

------------------------------------------------------

Examples:
  quick_analyze('bitcoin')
  quick_analyze('ethereum')
  quick_analyze('dogecoin')
  quick_analyze('solana', 'layer1')
  quick_analyze('shiba-inu', 'meme_coin')


""")

# RUN INTERACTIVE ANALYZER

# Uncomment the line below to start the interactive analyzer:
# analyze_user_coin()

# Or use quick analysis:
# quick_analyze('bitcoin')
# quick_analyze('dogecoin')
# quick_analyze('solana')



INSTRUCTIONS TO USE THIS SYSTEM:

------------------------------------------------------

Option 1: Interactive Mode 

Just run:  analyze_user_coin()

This will prompt you to enter the coin name and guide you
through the analysis step by step.

------------------------------------------------------

Option 2: Quick Analysis

Use:  quick_analyze('coin_name')

------------------------------------------------------

Examples:
  quick_analyze('bitcoin')
  quick_analyze('ethereum')
  quick_analyze('dogecoin')
  quick_analyze('solana', 'layer1')
  quick_analyze('shiba-inu', 'meme_coin')



