In [None]:
import pandas as pd
import numpy as np
from urllib.request import urlretrieve
import logging
import os
from typing import Optional, Dict, Any

# Root logging configuration for the data-loading step: records at INFO and
# above are written to 'data_loading.log' and to a StreamHandler.
logging.basicConfig(
    handlers=[
        logging.FileHandler('data_loading.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(name)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after the current module; used by everything defined below.
logger = logging.getLogger(__name__)

class DataLoader:
    """Load the FDI CSV extract and apply basic cleaning steps.

    Configuration keys (merged over ``DEFAULT_CONFIG``):
        encoding: text encoding tried first when reading the CSV.
        date_columns: columns parsed as four-digit years (``%Y``).
        source: ``'url'`` downloads ``file_path`` first; any other value
            treats ``file_path`` as a local path.
        handle_confidential: whether confidential observations are blanked
            out and re-filled.
    """

    # CONF_STATUS value that marks an observation as confidential.
    CONFIDENTIAL_FLAG = 'C'
    # Columns that must be present for the frame to be accepted.
    EXPECTED_COLUMNS = ['geo', 'TIME_PERIOD', 'OBS_VALUE', 'CONF_STATUS']
    DEFAULT_CONFIG = {
        'encoding': 'utf-8',
        'date_columns': ['TIME_PERIOD'],
        'source': 'local',
        'handle_confidential': True
    }

    def __init__(self, config: Optional[Dict[str, Any]] = None):
        """Initialize the loader; ``config`` entries override ``DEFAULT_CONFIG``."""
        self.df: Optional[pd.DataFrame] = None
        self.config = {**self.DEFAULT_CONFIG, **(config or {})}

    def validate_data_structure(self, df: pd.DataFrame) -> bool:
        """Check that ``df`` has the expected columns.

        Side effect: exact duplicate rows are dropped from ``df`` in place
        (``load_data`` relies on this).

        Returns:
            False when an expected column is missing, True otherwise. A
            non-numeric ``OBS_VALUE`` only produces a warning.
        """
        missing_columns = set(self.EXPECTED_COLUMNS) - set(df.columns)
        if missing_columns:
            logger.error(f"Missing expected columns: {missing_columns}")
            return False
        if not pd.api.types.is_numeric_dtype(df['OBS_VALUE']):
            logger.warning("OBS_VALUE contains non-numeric values")
        duplicates = df.duplicated().sum()
        if duplicates > 0:
            logger.warning(f"Found {duplicates} duplicate rows")
            df.drop_duplicates(inplace=True)
        return True

    def handle_confidential_data(self, df: pd.DataFrame) -> pd.DataFrame:
        """Blank out confidential observations and re-fill the gaps.

        Rows whose ``CONF_STATUS`` equals ``CONFIDENTIAL_FLAG`` get a NaN
        ``OBS_VALUE``; NaNs are then filled by linear interpolation followed
        by forward and backward fill. When a ``geo`` column is present the
        filling is done separately for each ``geo`` value, so one country's
        gap is never filled from a neighbouring country's rows. A country
        with no remaining values keeps its NaNs.

        The input frame is not modified; a processed copy is returned. When
        the step is disabled or ``CONF_STATUS`` is absent, ``df`` is returned
        unchanged.
        """
        if 'CONF_STATUS' not in df.columns or not self.config.get('handle_confidential', True):
            return df

        df = df.copy()
        confidential_mask = df['CONF_STATUS'] == self.CONFIDENTIAL_FLAG
        n_confidential = confidential_mask.sum()
        # Series.mask upcasts to float when needed instead of assigning NaN
        # into a possibly integer-typed column.
        df['OBS_VALUE'] = df['OBS_VALUE'].mask(confidential_mask)
        logger.info(f"Handled {n_confidential} confidential values (replaced with NaN)")

        def _fill_gaps(series: pd.Series) -> pd.Series:
            return series.interpolate(method='linear').ffill().bfill()

        if 'geo' in df.columns:
            # dropna=False keeps rows with a missing geo in their own group.
            df['OBS_VALUE'] = (
                df.groupby('geo', sort=False, dropna=False)['OBS_VALUE'].transform(_fill_gaps)
            )
        else:
            df['OBS_VALUE'] = _fill_gaps(df['OBS_VALUE'])

        if df['OBS_VALUE'].isna().all():
            logger.warning("All OBS_VALUE are NaN after interpolation; filling with 0")
            df['OBS_VALUE'] = df['OBS_VALUE'].fillna(0)
        return df

    def check_data_quality(self, df: pd.DataFrame) -> None:
        """Log missing-value count, time coverage, country count and size."""
        missing_values = df['OBS_VALUE'].isna().sum()
        logger.info(f"Found {missing_values} missing values in OBS_VALUE after processing")
        min_year = df['TIME_PERIOD'].min()
        max_year = df['TIME_PERIOD'].max()
        logger.info(f"Data covers years: {min_year} to {max_year}")
        countries = df['geo'].nunique()
        logger.info(f"Data contains {countries} unique countries")
        if len(df) < 10:
            logger.warning(f"Dataset has only {len(df)} rows, which may be insufficient")

    def load_data(self, file_path: str = 'estat_tec00107_filtered_en.csv') -> Optional[pd.DataFrame]:
        """Load, validate and clean the dataset.

        Args:
            file_path: local CSV path, or a URL when ``config['source']`` is
                ``'url'`` (the file is then downloaded into the working
                directory first).

        Returns:
            The cleaned DataFrame (also stored on ``self.df``), or None when
            validation fails or any error occurs; errors are logged, not
            raised.
        """
        try:
            if self.config['source'] == 'url':
                from urllib.parse import urlparse

                logger.info(f"Downloading data from URL: {file_path}")
                # Use only the URL path so query strings do not end up in the
                # file name; fall back to a fixed name for URLs ending in '/'.
                local_path = os.path.basename(urlparse(file_path).path) or 'downloaded_data.csv'
                urlretrieve(file_path, local_path)
                file_path = local_path

            if not os.path.exists(file_path):
                raise FileNotFoundError(f"File {file_path} not found")

            try:
                df = pd.read_csv(file_path, encoding=self.config['encoding'])
            except UnicodeDecodeError:
                logger.warning(f"Failed with encoding {self.config['encoding']}, trying latin-1")
                df = pd.read_csv(file_path, encoding='latin-1')

            for col in self.config['date_columns']:
                if col in df.columns:
                    df[col] = pd.to_datetime(df[col], format='%Y', errors='coerce')

            if not self.validate_data_structure(df):
                logger.error("Data structure validation failed")
                return None

            # Validation only warns about non-numeric observations; coerce
            # them to NaN so the interpolation below can operate.
            if not pd.api.types.is_numeric_dtype(df['OBS_VALUE']):
                df['OBS_VALUE'] = pd.to_numeric(df['OBS_VALUE'], errors='coerce')

            df = self.handle_confidential_data(df)
            df = df.dropna(how='all', axis=1)  # Drop columns that are entirely empty
            self.check_data_quality(df)
            self.df = df
            logger.info(f"Data loaded successfully: {len(df)} rows")
            return df
        except Exception as e:
            # Top-level boundary: log with traceback and signal failure via None.
            logger.exception(f"Failed to load data: {e}")
            return None

def load_data(file_path: str = 'estat_tec00107_filtered_en.csv', config: Optional[Dict[str, Any]] = None) -> pd.DataFrame:
    """Load data through a fresh DataLoader, never returning None.

    Args:
        file_path: path (or URL, depending on ``config``) handed to
            ``DataLoader.load_data``.
        config: optional DataLoader configuration overrides.

    Returns:
        The loaded DataFrame, or an empty DataFrame when loading failed or
        produced no rows.
    """
    frame = DataLoader(config=config).load_data(file_path)
    if frame is not None and not frame.empty:
        return frame
    logger.warning("No data loaded or empty DataFrame returned")
    return pd.DataFrame()

if __name__ == "__main__":
    # Script entry point: load the default CSV and report how many rows arrived.
    logger.info("=== Загрузка данных FDI ===")
    df = load_data("estat_tec00107_filtered_en.csv")
    # The module-level load_data wrapper returns an empty DataFrame (not None)
    # when loading fails, so .empty is safe to call here.
    if not df.empty:
        logger.info(f"Loaded {len(df)} rows")

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
import logging
from logging.handlers import RotatingFileHandler
import json
import os
from datetime import datetime
from typing import Dict, Any, Optional
from scipy import stats

# Dedicated 'eda' logger with its own console and rotating-file handlers.
logger = logging.getLogger('eda')
logger.setLevel(logging.INFO)
# Detach AND close handlers left over from a previous run of this code, so
# re-running it does not duplicate output or leak the open 'eda.log' handle.
for _stale_handler in list(logger.handlers):
    logger.removeHandler(_stale_handler)
    _stale_handler.close()
formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
ch = logging.StreamHandler()
ch.setFormatter(formatter)
logger.addHandler(ch)
# maxBytes is a byte count and must be an integer (the original passed the
# float 1e6); the log rotates at ~1 MB keeping three backups.
fh = RotatingFileHandler('eda.log', maxBytes=1_000_000, backupCount=3, encoding='utf-8')
fh.setFormatter(formatter)
logger.addHandler(fh)

class EDAAnalyzer:
    """Run exploratory data analysis on a DataFrame and collect the findings.

    Each ``analyze_*`` / ``detect_*`` method stores its output in
    ``self.results`` and (where it plots) writes an HTML figure under
    ``plots/``. ``run_full_analysis`` executes all steps and saves the
    results to ``eda_results.json``.
    """

    def __init__(self, df: pd.DataFrame, data_source: str = 'estat_tec00107_filtered_en.csv'):
        """Initialize the analyzer.

        Args:
            df: data to analyze; a private copy is kept so the caller's
                frame is never modified.
            data_source: label recorded in the results metadata.
        """
        self.df = df.copy()
        self.results: Dict[str, Any] = {
            'metadata': {'analysis_date': datetime.now().isoformat(), 'data_source': data_source, 'stats': {}},
            'missing_values': {},
            'outliers': {},
            'correlations': {},
            'trends': {},
            'recommendations': []
        }

    def describe_columns(self) -> None:
        """Print and log dtype, unique count and missing count per column."""
        logger.info("Describing dataset columns")
        print("\n=== Dataset Columns Description ===")
        for col in self.df.columns:
            dtype = self.df[col].dtype
            unique = self.df[col].nunique()
            missing = self.df[col].isna().sum()
            print(f"{col}: Type={dtype}, Unique={unique}, Missing={missing}")
            logger.info(f"{col}: Type={dtype}, Unique={unique}, Missing={missing}")

    def _save_plot(self, fig: "go.Figure", filename: str) -> None:
        """Write ``fig`` to ``plots/<filename>.html`` and display it."""
        os.makedirs('plots', exist_ok=True)
        path = f"plots/{filename}.html"
        fig.write_html(path)
        fig.show()
        logger.info(f"Plot saved to {path}")

    def _add_recommendation(self, message: str) -> None:
        """Append a recommendation to the results and log it."""
        self.results['recommendations'].append(message)
        logger.info(f"Recommendation: {message}")

    def _safe_describe(self, series: pd.Series) -> Dict[str, float]:
        """Return descriptive statistics, or an empty dict if computing fails."""
        try:
            return {
                'count': series.count(), 'mean': series.mean(), 'std': series.std(),
                'min': series.min(), '25%': series.quantile(0.25), '50%': series.median(),
                '75%': series.quantile(0.75), 'max': series.max()
            }
        except Exception as e:
            logger.error(f"Error computing stats: {str(e)}")
            return {}

    def basic_statistics(self) -> Dict[str, Any]:
        """Record dataset-level metadata and per-column statistics.

        Numeric columns get the ``_safe_describe`` summary; object columns
        get count / unique / most frequent value / its frequency. An object
        column without any non-null value reports ``top`` and ``freq`` as
        None instead of raising.

        Returns:
            The ``results['metadata']['stats']`` mapping.
        """
        logger.info("Analyzing basic statistics...")
        self.results['metadata'].update({
            'total_rows': len(self.df), 'total_columns': len(self.df.columns),
            'time_period': {'start': str(self.df['TIME_PERIOD'].min()), 'end': str(self.df['TIME_PERIOD'].max())},
            'unique_countries': self.df['geo'].nunique()
        })
        column_stats = self.results['metadata']['stats']
        num_cols = self.df.select_dtypes(include=[np.number]).columns
        for col in num_cols:
            column_stats[col] = self._safe_describe(self.df[col])
            if self.df[col].isnull().all():
                self._add_recommendation(f"Column {col} is completely empty - consider removing")
        cat_cols = self.df.select_dtypes(include=['object']).columns
        for col in cat_cols:
            series = self.df[col]
            counts = series.value_counts()
            # mode()/value_counts() are empty when every value is null, so
            # indexing them unconditionally would raise IndexError.
            has_values = not counts.empty
            column_stats[col] = {
                'count': series.count(), 'unique': series.nunique(),
                'top': series.mode().iloc[0] if has_values else None,
                'freq': counts.iloc[0] if has_values else None
            }
        return column_stats

    def analyze_missing_values(self) -> Dict[str, Any]:
        """Count missing values per column and plot a missingness heatmap.

        Returns:
            Mapping with per-column ``count`` and ``percentage``.
        """
        logger.info("Analyzing missing values...")
        missing = self.df.isnull().sum()
        n_rows = len(self.df)
        # Avoid 0/0 -> NaN percentages on a frame without rows.
        missing_pct = (missing / n_rows) * 100 if n_rows else missing.astype(float)
        self.results['missing_values'] = {'count': missing.to_dict(), 'percentage': missing_pct.to_dict()}
        fig = px.imshow(self.df.isnull(), title="Missing Values Heatmap")
        self._save_plot(fig, 'missing_values_heatmap')
        return self.results['missing_values']

    def detect_outliers(self, threshold: float = 3.5) -> Dict[str, Any]:
        """Detect outliers in numeric columns using Z-score and IQR rules.

        Args:
            threshold: absolute Z-score above which a value is an outlier.

        Returns:
            Per-column mapping with counts, indices and values for both
            rules. Columns with fewer than 10 non-null values are skipped.
        """
        logger.info("Detecting outliers...")
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        outliers = {}
        for col in numeric_cols:
            data = self.df[col].dropna()
            if len(data) < 10:
                logger.warning(f"Not enough data for {col} (n={len(data)})")
                continue
            z_scores = np.abs(stats.zscore(data))
            z_outliers = data[z_scores > threshold]
            q1 = data.quantile(0.25)
            q3 = data.quantile(0.75)
            iqr = q3 - q1
            iqr_outliers = data[(data < (q1 - 1.5*iqr)) | (data > (q3 + 1.5*iqr))]
            outliers[col] = {
                'z_score': {'count': len(z_outliers), 'indices': z_outliers.index.tolist(), 'values': z_outliers.values.tolist()},
                'iqr': {'count': len(iqr_outliers), 'indices': iqr_outliers.index.tolist(), 'values': iqr_outliers.values.tolist()}
            }
            fig = px.box(self.df, y=col, title=f"Outliers in {col}")
            self._save_plot(fig, f'outliers_boxplot_{col}')
        self.results['outliers'] = outliers
        return outliers

    def analyze_distributions(self) -> None:
        """Plot a histogram and run a Shapiro-Wilk test per numeric column.

        Columns with fewer than 5 non-null values are skipped. The test
        outcome is stored under the column's stats entry, which is created
        on demand so this method also works before ``basic_statistics``.
        """
        logger.info("Analyzing distributions...")
        numeric_cols = self.df.select_dtypes(include=[np.number]).columns
        for col in numeric_cols:
            data = self.df[col].dropna()
            if len(data) < 5:
                continue
            fig = px.histogram(self.df, x=col, nbins=30, title=f"Distribution of {col}", marginal="rug")
            self._save_plot(fig, f'distribution_{col}')
            stat, p = stats.shapiro(data)
            self.results['metadata']['stats'].setdefault(col, {}).update({
                'normality_test': {'shapiro_stat': float(stat), 'shapiro_p': float(p), 'is_normal': bool(p > 0.05)}
            })

    def analyze_temporal_trends(self) -> None:
        """Aggregate OBS_VALUE per year and plot the yearly mean.

        Adds a ``year`` column to the analyzer's private copy of the data.
        Does nothing (beyond a warning) when ``TIME_PERIOD`` is missing or
        is not a datetime column. The plot's error bars are the per-year
        standard deviation.
        """
        logger.info("Analyzing temporal trends...")
        if 'TIME_PERIOD' not in self.df.columns:
            logger.warning("Missing TIME_PERIOD column")
            return
        if not pd.api.types.is_datetime64_any_dtype(self.df['TIME_PERIOD']):
            # The .dt accessor below only exists for datetime columns.
            logger.warning("TIME_PERIOD is not a datetime column; skipping temporal trends")
            return
        self.df['year'] = self.df['TIME_PERIOD'].dt.year
        yearly_stats = self.df.groupby('year')['OBS_VALUE'].agg(['count', 'mean', 'median', 'std', 'min', 'max'])
        self.results['trends']['yearly_stats'] = yearly_stats.to_dict()
        fig = px.line(yearly_stats.reset_index(), x='year', y='mean', error_y='std', title='Mean FDI by Year with Confidence Interval')
        self._save_plot(fig, 'yearly_trends')

    def analyze_correlations(self) -> None:
        """Compute and plot the correlation matrix of numeric columns.

        Entirely empty and zero-variance columns are excluded; at least two
        remaining columns are required.
        """
        logger.info("Analyzing correlations...")
        numeric_df = self.df.select_dtypes(include=[np.number]).dropna(axis=1, how='all')
        numeric_df = numeric_df.loc[:, numeric_df.std() > 0]
        if len(numeric_df.columns) < 2:
            logger.warning("Not enough numeric columns for correlation analysis")
            return
        corr_matrix = numeric_df.corr()
        self.results['correlations']['matrix'] = corr_matrix.to_dict()
        fig = px.imshow(corr_matrix, text_auto='.2f', title="Correlation Matrix", color_continuous_scale='RdBu_r')
        self._save_plot(fig, 'correlation_matrix')

    @staticmethod
    def _json_default(obj: Any) -> Any:
        """Convert values ``json`` cannot encode natively.

        NumPy booleans, integers (any width) and floats become the matching
        Python type; anything else is stringified.
        """
        if isinstance(obj, np.bool_):
            return bool(obj)
        if isinstance(obj, np.integer):
            return int(obj)
        if isinstance(obj, np.floating):
            return float(obj)
        return str(obj)

    def save_results(self) -> None:
        """Write ``self.results`` to ``eda_results.json`` (UTF-8, indented)."""
        with open('eda_results.json', 'w', encoding='utf-8') as f:
            json.dump(self.results, f, indent=4, ensure_ascii=False, default=self._json_default)
        logger.info("Results saved to eda_results.json")

    def run_full_analysis(self) -> Dict[str, Any]:
        """Run every analysis step in order and save the results.

        Returns:
            The results dictionary; for an empty DataFrame it is returned
            unpopulated and nothing is saved.
        """
        if self.df.empty:
            logger.warning("Empty DataFrame provided for EDA")
            return self.results
        logger.info("Running full EDA analysis")
        self.describe_columns()
        self.basic_statistics()
        self.analyze_missing_values()
        self.detect_outliers()
        self.analyze_distributions()
        self.analyze_temporal_trends()
        self.analyze_correlations()
        self.save_results()
        logger.info("EDA analysis completed")
        return self.results

if __name__ == "__main__":
    # Script entry point: load the data with the default DataLoader settings
    # and, if any rows came back, run the complete EDA pipeline on them.
    from data_loading import DataLoader
    loader = DataLoader()
    df = loader.load_data()
    # DataLoader.load_data returns None on failure, hence the explicit check.
    if df is not None and not df.empty:
        analyzer = EDAAnalyzer(df)
        results = analyzer.run_full_analysis()
        print("\n=== SUMMARY RESULTS ===")
        print(f"Rows analyzed: {len(df)}")

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
import logging
import os
from sklearn.preprocessing import StandardScaler
from typing import Dict, Any
import yaml
import json
from datetime import datetime

# Root logging configuration for the preprocessing step: INFO and above go to
# 'data_processing.log' and to a StreamHandler.
logging.basicConfig(
    handlers=[
        logging.FileHandler('data_processing.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Built-in preprocessing settings; load_config() overlays a YAML file on top.
DEFAULT_CONFIG = {
    # Rolling-window length per statistic.
    'window_sizes': {'mean': 3, 'std': 3, 'min': 5, 'max': 5},
    'output_dir': 'processed_data',
    # Column -> fill strategy understood by handle_missing_values().
    'fillna_methods': {
        'OBS_VALUE': 'interpolate',
        'rolling_mean': 'zero',
        'yearly_change': 'zero',
        'rolling_std': 'zero',
    },
    'features_to_scale': ['OBS_VALUE', 'rolling_mean', 'yearly_change'],
    'visualization': True,
    # Numeric values are clipped to +/- this magnitude in preprocess_data().
    'max_abs_value': 1e6,
    'save_parquet': False,
    'outlier_threshold': 3.0,
    'save_stats': True,
    'save_excel': False,
    'drop_columns': ['OBS_FLAG', 'CONF_STATUS'],
    # Column -> allowed range checked by validate_data().
    'validation_rules': {
        'OBS_VALUE': {'min': -1000, 'max': 1000},
        'yearly_change': {'min': -100, 'max': 100},
    },
    'drop_na': False,
}

def load_config(config_path: str = 'config.yaml') -> Dict[str, Any]:
    """Load configuration from a YAML file layered over ``DEFAULT_CONFIG``.

    Nested dictionaries are merged key by key; any other value in the file
    replaces the default. The returned dictionary is always an independent
    deep copy, so neither merging nor later edits by the caller can alter
    ``DEFAULT_CONFIG``.

    Args:
        config_path: path to the YAML file.

    Returns:
        The merged configuration; a copy of the defaults when the file is
        missing or does not contain a mapping.
    """
    import copy  # local import: only this function needs it

    def deep_update(source, overrides):
        for key, value in overrides.items():
            # Recurse only when both sides are dicts; a dict override for a
            # scalar default (or a new key) simply replaces the value.
            if isinstance(value, dict) and isinstance(source.get(key), dict):
                deep_update(source[key], value)
            else:
                source[key] = value
        return source

    merged = copy.deepcopy(DEFAULT_CONFIG)
    try:
        with open(config_path, encoding='utf-8') as f:
            overrides = yaml.safe_load(f) or {}
    except FileNotFoundError:
        logger.warning(f"Config file {config_path} not found, using defaults")
        return merged
    if not isinstance(overrides, dict):
        logger.warning(f"Config file {config_path} does not contain a mapping, using defaults")
        return merged
    logger.info(f"Loaded config from {config_path}")
    return deep_update(merged, overrides)

def setup_output_dir(output_dir: str) -> None:
    """Create the ``plots`` and ``stats`` sub-directories under ``output_dir``.

    Existing directories are left untouched.
    """
    for subdir in ('plots', 'stats'):
        os.makedirs(os.path.join(output_dir, subdir), exist_ok=True)
    logger.info(f"Directory structure created in {output_dir}")

def handle_missing_values(df: pd.DataFrame, methods: Dict[str, str]) -> pd.DataFrame:
    """Fill missing values column by column.

    Args:
        df: input frame (not modified; a copy is processed).
        methods: mapping of column name to one of ``'interpolate'``
            (linear, then forward/backward fill), ``'median'``, ``'mean'``,
            ``'zero'``, ``'ffill'`` or ``'bfill'``. Columns absent from
            ``df`` are skipped; unknown method names are logged and the
            column is left unchanged.

    Returns:
        The processed copy.
    """
    fillers = {
        'interpolate': lambda s: s.interpolate(method='linear').ffill().bfill(),
        'median': lambda s: s.fillna(s.median()),
        'mean': lambda s: s.fillna(s.mean()),
        'zero': lambda s: s.fillna(0),
        'ffill': lambda s: s.ffill(),
        'bfill': lambda s: s.bfill(),
    }
    df = df.copy()
    for col, method in methods.items():
        if col not in df.columns:
            continue
        filler = fillers.get(method)
        if filler is None:
            logger.warning(f"Unknown fill method '{method}' for column {col}; column left unchanged")
            continue
        df[col] = filler(df[col])
    # OBS_VALUE is optional here; only report on it when it exists.
    if 'OBS_VALUE' in df.columns:
        logger.info(f"Missing values handled. Remaining NaN in OBS_VALUE: {df['OBS_VALUE'].isna().sum()}")
    else:
        logger.info("Missing values handled")
    return df

def calculate_rolling_features(df: pd.DataFrame, window_sizes: Dict[str, int]) -> pd.DataFrame:
    """Add per-``geo`` rolling statistics and percentage change of OBS_VALUE.

    New columns, in order: ``rolling_mean``, ``yearly_change``,
    ``rolling_std``, ``rolling_min``, ``rolling_max``. Windows come from
    ``window_sizes`` (keys ``mean``/``std``/``min``/``max``; defaults 3, 3,
    5, 5) and use ``min_periods=1``. ``yearly_change`` is the percentage
    change to the previous row of the same ``geo``; infinities become NaN
    and NaNs are replaced by the group's median change (0 when the group
    has none). The input frame is not modified.
    """
    df = df.copy()

    def rolling_stat(stat: str, default_window: int) -> pd.Series:
        # Rolling <stat> of OBS_VALUE inside each geo group.
        span = window_sizes.get(stat, default_window)
        return df.groupby('geo')['OBS_VALUE'].transform(
            lambda values: getattr(values.rolling(span, min_periods=1), stat)())

    def percent_change(values: pd.Series) -> pd.Series:
        raw = values.pct_change(fill_method=None)
        return raw.replace([np.inf, -np.inf], np.nan) * 100

    def fill_with_group_median(changes: pd.Series) -> pd.Series:
        fallback = changes.median() if changes.notna().any() else 0
        return changes.fillna(fallback)

    df['rolling_mean'] = rolling_stat('mean', 3)
    df['yearly_change'] = df.groupby('geo')['OBS_VALUE'].transform(percent_change)
    df['yearly_change'] = df.groupby('geo')['yearly_change'].transform(fill_with_group_median)
    df['rolling_std'] = rolling_stat('std', 3)
    df['rolling_min'] = rolling_stat('min', 5)
    df['rolling_max'] = rolling_stat('max', 5)
    logger.info("Rolling features calculated")
    return df

def validate_data(df: pd.DataFrame, rules: Dict[str, Dict[str, float]]) -> bool:
    """Check columns against optional ``min`` / ``max`` bounds.

    Args:
        df: frame to check.
        rules: column name -> ``{'min': ..., 'max': ...}``; either bound may
            be omitted. Columns not present in ``df`` are ignored.

    Returns:
        True when no value violates a bound; every violated bound is logged
        as a warning.
    """
    violations = 0
    for col, rule in rules.items():
        if col not in df.columns:
            continue
        lower, upper = rule.get('min'), rule.get('max')
        if lower is not None and (df[col] < lower).any():
            logger.warning(f"Found values below {lower} in {col}")
            violations += 1
        if upper is not None and (df[col] > upper).any():
            logger.warning(f"Found values above {upper} in {col}")
            violations += 1
    return violations == 0

def create_visualizations(df: pd.DataFrame, output_dir: str) -> None:
    """Build the standard figures, save each as HTML and display it.

    Figures written to ``<output_dir>/plots``: OBS_VALUE histogram, box
    plots of OBS_VALUE / rolling_mean / yearly_change, time series for the
    first five ``geo`` values, and the numeric correlation matrix.
    """
    def publish(figure, file_name: str) -> None:
        # Save first, then show -- same order for every figure.
        figure.write_html(os.path.join(output_dir, 'plots', file_name))
        figure.show()

    publish(
        px.histogram(df, x='OBS_VALUE', nbins=30, title='Distribution of OBS_VALUE'),
        'data_histogram.html',
    )
    publish(
        px.box(df, y=['OBS_VALUE', 'rolling_mean', 'yearly_change'], title='Boxplots of Key Features'),
        'data_boxplot.html',
    )

    sample_countries = df['geo'].unique()[:5]
    sample_rows = df[df['geo'].isin(sample_countries)]
    publish(
        px.line(sample_rows, x='TIME_PERIOD', y='OBS_VALUE', color='geo', title='Sample Country Time Series'),
        'sample_time_series.html',
    )

    corr_matrix = df.select_dtypes(include=[np.number]).corr()
    publish(
        px.imshow(corr_matrix, text_auto='.2f', title='Correlation Matrix', color_continuous_scale='RdBu_r'),
        'correlation_matrix.html',
    )
    logger.info(f"Visualizations saved and displayed from {output_dir}/plots")

def preprocess_data(df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
    """Clean the data, engineer features and scale them.

    Steps: drop ``config['drop_columns']``; turn infinities into NaN; clip
    numeric columns to +/- ``config['max_abs_value']`` (skipped when the key
    is absent or None); fill missing values per ``config['fillna_methods']``;
    add rolling features; derive ``year`` / ``quarter`` from a datetime
    ``TIME_PERIOD``; standard-scale ``config['features_to_scale']``; drop
    entirely empty columns.

    Args:
        df: input frame; requires ``geo`` and ``OBS_VALUE`` columns.
        config: preprocessing settings; every key is optional.

    Returns:
        The processed frame; an empty input is returned as is.
    """
    logger.info(f"Starting preprocessing. Initial shape: {df.shape}")
    if df.empty:
        logger.warning("Empty DataFrame provided for preprocessing")
        return df
    df = df.drop(columns=config.get('drop_columns', []), errors='ignore')
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    df[numeric_cols] = df[numeric_cols].replace([np.inf, -np.inf], np.nan)
    # Configs that do not define a clipping bound (e.g. one written for a
    # different pipeline stage) must not abort preprocessing.
    max_abs_value = config.get('max_abs_value')
    if max_abs_value is not None:
        df[numeric_cols] = df[numeric_cols].clip(upper=max_abs_value, lower=-max_abs_value)
    else:
        logger.info("No max_abs_value configured; skipping clipping")
    logger.info(f"NaN in OBS_VALUE before handling: {df['OBS_VALUE'].isna().sum()}")
    df = handle_missing_values(df, config.get('fillna_methods', {}))
    df = calculate_rolling_features(df, config.get('window_sizes', {}))
    # The .dt accessor only exists on datetime columns.
    if 'TIME_PERIOD' in df.columns and pd.api.types.is_datetime64_any_dtype(df['TIME_PERIOD']):
        df['year'] = df['TIME_PERIOD'].dt.year
        df['quarter'] = df['TIME_PERIOD'].dt.quarter
    scaler = StandardScaler()
    features_to_scale = [f for f in config.get('features_to_scale', []) if f in df.columns]
    if features_to_scale:
        df[features_to_scale] = scaler.fit_transform(df[features_to_scale])
    df = df.dropna(how='all', axis=1)  # Drop columns that are entirely empty
    logger.info(f"Preprocessing completed. Final shape: {df.shape}")
    return df

def save_processed_data(df: pd.DataFrame, output_dir: str, config: Dict[str, Any]) -> None:
    """Write the processed data (CSV) and optional summary statistics (JSON).

    Args:
        df: processed frame; nothing is written when it is empty.
        output_dir: target directory; it and its ``stats`` sub-directory are
            created when missing.
        config: settings; ``save_stats`` (default True) controls whether
            ``stats/data_statistics.json`` is written.
    """
    if df.empty:
        logger.warning("No data to save")
        return
    # Do not depend on a prior directory-setup call having been made.
    os.makedirs(output_dir, exist_ok=True)
    csv_path = os.path.join(output_dir, 'processed_data.csv')
    df.to_csv(csv_path, index=False)
    logger.info(f"Data saved to CSV: {csv_path}")
    if config.get('save_stats', True):
        stats_dir = os.path.join(output_dir, 'stats')
        os.makedirs(stats_dir, exist_ok=True)
        summary = {
            'summary_stats': df.describe().to_dict(),
            'missing_values': df.isnull().sum().to_dict(),
            'timestamp': datetime.now().isoformat()
        }
        with open(os.path.join(stats_dir, 'data_statistics.json'), 'w', encoding='utf-8') as f:
            # default=str stringifies anything json cannot encode natively.
            json.dump(summary, f, indent=2, default=str)
        logger.info(f"Stats saved in {output_dir}/stats")

def main():
    """Run the preprocessing pipeline end to end.

    Loads the configuration, prepares the output directories, loads the raw
    data, preprocesses it, optionally plots it and saves the outputs.

    Returns:
        The processed DataFrame, or an empty DataFrame when no data could
        be loaded.
    """
    logger.info("Starting preprocessing pipeline")
    config = load_config()
    setup_output_dir(config['output_dir'])
    from data_loading import load_data
    df = load_data()
    if df is None or df.empty:
        logger.warning("No data processed")
        return pd.DataFrame()

    df_processed = preprocess_data(df, config)
    if config['visualization']:
        create_visualizations(df_processed, config['output_dir'])
    save_processed_data(df_processed, config['output_dir'], config)
    return df_processed

if __name__ == "__main__":
    # Script entry point: run the pipeline and print the processed row count.
    df_result = main()
    # main() returns an empty DataFrame when nothing was processed.
    if not df_result.empty:
        print("\nPreprocessing Result: Rows processed =", len(df_result))

In [None]:
import pandas as pd
import numpy as np
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering, SpectralClustering
from sklearn.mixture import GaussianMixture
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score, davies_bouldin_score
import plotly.express as px
import logging
import pickle
from typing import Dict, Any

# Root logging configuration for the clustering step: INFO and above go to
# 'clustering.log' and to a StreamHandler.
logging.basicConfig(
    handlers=[
        logging.FileHandler('clustering.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
logger = logging.getLogger(__name__)

# Default settings for the clustering step.
DEFAULT_CONFIG = {
    'max_clusters': 10,
    'random_state': 42,
    'file_path': 'estat_tec00107_filtered_en.csv',
    'fill_method': 'median',
    'dbscan_eps': 0.5,
    'dbscan_min_samples': 5,
    # Number of repeated fits per model (originally noted as
    # "for cross-validation").
    'n_runs': 5,
}

def find_optimal_clusters(X_scaled: np.ndarray, max_clusters: int, random_state: int) -> int:
    """Pick the KMeans cluster count with the best silhouette score.

    Candidate counts run from 2 up to ``max_clusters``, capped at
    ``n_samples - 1`` because the silhouette score is only defined for
    fewer labels than samples (and KMeans cannot fit more clusters than
    samples).

    Args:
        X_scaled: feature matrix, one row per sample.
        max_clusters: largest cluster count to try.
        random_state: seed passed to KMeans.

    Returns:
        The best cluster count as a plain ``int``. When no candidate can be
        evaluated, ``min(2, n_samples)`` is returned (at least 1).
    """
    n_samples = len(X_scaled)
    upper_k = min(max_clusters, n_samples - 1)
    if upper_k < 2:
        fallback_k = max(1, min(2, n_samples))
        logger.warning(
            f"Cannot evaluate cluster counts for {n_samples} samples "
            f"(max_clusters={max_clusters}); using {fallback_k}"
        )
        return fallback_k

    silhouette_scores = []
    for k in range(2, upper_k + 1):
        kmeans = KMeans(n_clusters=k, random_state=random_state)
        labels = kmeans.fit_predict(X_scaled)
        # KMeans may collapse to a single label on degenerate data.
        if len(set(labels)) > 1:
            silhouette_scores.append(silhouette_score(X_scaled, labels))
        else:
            silhouette_scores.append(0)
    # Index 0 of the score list corresponds to k == 2.
    optimal_k = int(np.argmax(silhouette_scores)) + 2
    logger.info(f"Optimal number of clusters: {optimal_k}")
    return optimal_k

def _majority_vote(label_matrix: np.ndarray) -> list:
    """Return, per row, the most frequent non-negative label (-1 if none)."""
    winners = []
    for row in label_matrix:
        valid = row[row >= 0]
        winners.append(np.bincount(valid).argmax() if valid.size else -1)
    return winners


def cluster_countries(df: pd.DataFrame, config: Dict[str, Any]) -> pd.DataFrame:
    """Cluster countries on mean FDI level and mean yearly change.

    The data is aggregated per ``geo`` (mean/std of OBS_VALUE, mean of
    yearly_change); ``fdi_mean`` and ``change_mean`` are standard-scaled and
    clustered with KMeans, DBSCAN, Agglomerative, Spectral and GMM models.
    Each model is fitted ``n_runs`` times and voted per country, then the
    models are voted against each other to give the final ``cluster``.
    Side effects: writes ``clusters.html`` and one ``<name>_model.pkl`` per
    model, and displays the scatter plot.

    NOTE(review): cluster ids produced by different runs/algorithms are
    arbitrary, so voting on raw ids presumes they happen to line up --
    consider aligning labels before voting.

    Args:
        df: frame with ``geo``, ``OBS_VALUE`` and ``yearly_change`` columns.
        config: settings; recognized keys are ``fill_method`` ('median' or
            'mean'), ``max_clusters`` (10), ``random_state`` (42),
            ``dbscan_eps`` (0.5), ``dbscan_min_samples`` (5) and ``n_runs``
            (5) -- values in parentheses are used when a key is absent.

    Returns:
        One row per country with columns ``geo``, ``fdi_mean``, ``fdi_std``,
        ``change_mean``, ``cluster``. Empty when there is nothing to
        cluster; with fewer than four countries every country is placed in
        cluster 0 without fitting any model.
    """
    logger.info("Starting clustering")
    result_columns = ['geo', 'fdi_mean', 'fdi_std', 'change_mean', 'cluster']
    if df.empty:
        logger.warning("Empty DataFrame provided for clustering")
        return pd.DataFrame(columns=result_columns)

    random_state = config.get('random_state', 42)
    max_clusters = config.get('max_clusters', 10)
    # At least one fit is needed to have labels to vote on.
    n_runs = max(1, int(config.get('n_runs', 5)))

    df_agg = df.groupby('geo').agg({
        'OBS_VALUE': ['mean', 'std'],
        'yearly_change': 'mean'
    }).reset_index()
    df_agg.columns = ['geo', 'fdi_mean', 'fdi_std', 'change_mean']
    df_agg.replace([np.inf, -np.inf], np.nan, inplace=True)

    initial_rows = len(df_agg)
    fill_method = config.get('fill_method')
    if fill_method in ('median', 'mean'):
        df_agg.fillna({
            col: getattr(df_agg[col], fill_method)()
            for col in ('fdi_mean', 'change_mean', 'fdi_std')
        }, inplace=True)
    df_agg.dropna(subset=['fdi_mean', 'change_mean'], inplace=True)
    logger.info(f"After processing NaN, {len(df_agg)} rows remain (initially {initial_rows})")

    if df_agg.empty:
        logger.warning("No data available for clustering after preprocessing")
        return pd.DataFrame(columns=result_columns)

    # Three of the models use a fixed size of 3 and silhouette scoring needs
    # fewer labels than samples, so fewer than 4 countries cannot be handled.
    min_countries = 4
    if len(df_agg) < min_countries:
        logger.warning(
            f"Only {len(df_agg)} countries available; need at least {min_countries} "
            "to cluster, assigning all to cluster 0"
        )
        df_agg['cluster'] = 0
        return df_agg

    scaler = StandardScaler()
    X_scaled = scaler.fit_transform(df_agg[['fdi_mean', 'change_mean']])

    methods = {
        'KMeans': KMeans(n_clusters=find_optimal_clusters(X_scaled, max_clusters, random_state), random_state=random_state),
        'DBSCAN': DBSCAN(eps=config.get('dbscan_eps', 0.5), min_samples=config.get('dbscan_min_samples', 5)),
        'Agglomerative': AgglomerativeClustering(n_clusters=3),
        'Spectral': SpectralClustering(n_clusters=3, random_state=random_state),
        'GMM': GaussianMixture(n_components=3, random_state=random_state)
    }

    results = {}
    for name, model in methods.items():
        labels_list = [model.fit_predict(X_scaled) for _ in range(n_runs)]
        # Majority voting across the repeated fits, for stability.
        final_labels = _majority_vote(np.array(labels_list).T)
        results[name] = final_labels
        if len(set(final_labels)) > 1:
            silhouette = silhouette_score(X_scaled, final_labels)
            davies_bouldin = davies_bouldin_score(X_scaled, final_labels)
            logger.info(f"{name}: {len(set(final_labels))} clusters, Silhouette={silhouette:.3f}, Davies-Bouldin={davies_bouldin:.3f}")

    # Majority voting across the models (rows = countries, columns = models).
    ensemble_labels = _majority_vote(np.array([results[name] for name in results]).T)
    df_agg['cluster'] = ensemble_labels
    if len(set(ensemble_labels)) > 1:
        ensemble_silhouette = silhouette_score(X_scaled, ensemble_labels)
        logger.info(f"Ensemble Silhouette Score: {ensemble_silhouette:.3f}")

    fig = px.scatter(df_agg, x='fdi_mean', y='change_mean', color='cluster', hover_data=['geo'], title='Country Clustering')
    fig.write_html('clusters.html')
    fig.show()
    logger.info("Clustering visualization saved and displayed")

    for name, model in methods.items():
        with open(f'{name}_model.pkl', 'wb') as f:
            pickle.dump(model, f)
    logger.info("Models saved")

    return df_agg

if __name__ == "__main__":
    # Script entry point: load, preprocess and cluster using the clustering
    # defaults defined in DEFAULT_CONFIG.
    from data_loading import load_data
    from data_preprocessing import preprocess_data
    config = DEFAULT_CONFIG
    df = load_data(config['file_path'])
    # The same config dict is handed to both preprocessing and clustering.
    df_processed = preprocess_data(df, config)
    result = cluster_countries(df_processed, config)
    if not result.empty:
        print("\nClustering Result: Rows clustered =", len(result))

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from statsmodels.tsa.statespace.sarimax import SARIMAX
from prophet import Prophet
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from pmdarima import auto_arima
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from statsmodels.tsa.holtwinters import ExponentialSmoothing
import logging
import os
from typing import Dict, Any, Tuple, Optional

# Root logging configuration for the forecasting step: INFO and above go to
# 'time_series_analysis.log' and to a StreamHandler.
logging.basicConfig(
    handlers=[
        logging.FileHandler('time_series_analysis.log'),
        logging.StreamHandler(),
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)
# Logger named after the current module; used by the forecast helpers below.
logger = logging.getLogger(__name__)

def prepare_data(df: pd.DataFrame, country: str) -> pd.DataFrame:
    """Build a time-indexed series for one country.

    Rows whose ``geo`` equals ``country`` are selected, indexed by
    ``TIME_PERIOD`` (converted to datetime) and sorted chronologically.
    Gaps in ``OBS_VALUE`` are filled by time-weighted interpolation, then
    forward and backward fill. ``year`` (calendar year of the index) and
    ``time_idx`` (0-based position) columns are added.

    Args:
        df: frame with ``geo``, ``TIME_PERIOD`` and ``OBS_VALUE`` columns.
        country: ``geo`` value to select.

    Returns:
        The prepared frame; empty (and otherwise untouched) when the
        country has no rows.
    """
    df = df[df['geo'] == country].copy()
    if df.empty:
        logger.warning(f"No data available for country: {country}")
        return df
    df['TIME_PERIOD'] = pd.to_datetime(df['TIME_PERIOD'])
    df = df.set_index('TIME_PERIOD').sort_index()
    # .ffill()/.bfill() replace fillna(method=...), which recent pandas
    # versions deprecate.
    df['OBS_VALUE'] = df['OBS_VALUE'].interpolate(method='time').ffill().bfill()
    df['year'] = df.index.year
    df['time_idx'] = np.arange(len(df))
    logger.info(f"Prepared data for {country} with {len(df)} rows")
    return df

def arima_forecast(train: pd.Series, test: pd.Series, steps: int) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """Forecast with a non-seasonal ARIMA chosen by ``auto_arima``.

    Args:
        train: training series.
        test: hold-out series; only its length is used.
        steps: number of periods for the second forecast.

    Returns:
        ``(test_pred, future_pred)`` -- forecasts of ``len(test)`` and
        ``steps`` periods, both produced from the model fitted on ``train``;
        ``(None, None)`` when anything fails (the error is logged).
    """
    try:
        selected = auto_arima(train, seasonal=False, trace=False)
        fitted = selected.fit(train)
        holdout_forecast = fitted.predict(n_periods=len(test))
        horizon_forecast = fitted.predict(n_periods=steps)
    except Exception as e:
        logger.error(f"ARIMA forecast failed: {str(e)}")
        return None, None
    return holdout_forecast, horizon_forecast

def prophet_forecast(train: pd.DataFrame, test: pd.DataFrame, steps: int) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """Forecast with Prophet (yearly seasonality enabled).

    Args:
        train: training frame indexed by ``TIME_PERIOD`` with an
            ``OBS_VALUE`` column.
        test: hold-out frame; its index supplies the test dates.
        steps: number of yearly periods to forecast after the last
            training timestamp.

    Returns:
        ``(test_pred, future_pred)`` as arrays of ``yhat`` values, or
        ``(None, None)`` when anything fails (the error is logged).
    """
    try:
        df = train.reset_index().rename(columns={'TIME_PERIOD': 'ds', 'OBS_VALUE': 'y'})
        model = Prophet(yearly_seasonality=True)
        model.fit(df)
        # Step exactly one calendar year at a time from the last training
        # timestamp. A 'Y' frequency would snap to year-END dates, which are
        # misaligned with an index of year-start timestamps (and the alias
        # is deprecated in recent pandas).
        last_train = train.index[-1]
        future_dates = pd.DatetimeIndex(
            [last_train + pd.DateOffset(years=offset) for offset in range(1, steps + 1)]
        )
        future_df = pd.DataFrame({'ds': pd.concat([pd.Series(test.index), pd.Series(future_dates)])})
        forecast = model.predict(future_df)
        # The first len(test) rows belong to the test dates, the rest to
        # the future horizon.
        test_pred = forecast['yhat'].iloc[:len(test)].values
        future_pred = forecast['yhat'].iloc[len(test):].values
        return test_pred, future_pred
    except Exception as e:
        logger.error(f"Prophet forecast failed: {str(e)}")
        return None, None

def rf_forecast(train: pd.DataFrame, test: pd.DataFrame, steps: int) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """Forecast with a grid-searched Random Forest on ``year`` and ``time_idx``.

    Hyper-parameters are chosen by 3-fold grid search (negative MAE) on ``train``.
    Future feature rows continue from the last test row (or the last training
    row when ``test`` is empty), so they do not duplicate the test window.

    Args:
        train: Training frame with ``year``, ``time_idx`` and ``OBS_VALUE`` columns.
        test: Hold-out frame with ``year`` and ``time_idx`` columns.
        steps: Number of yearly periods to forecast beyond the end of ``test``.

    Returns:
        ``(test_pred, future_pred)`` arrays, or ``(None, None)`` on any failure
        (for example too few rows for the 3-fold search).
    """
    feature_cols = ['year', 'time_idx']
    try:
        rf = RandomForestRegressor(random_state=42)
        param_grid = {'n_estimators': [50, 100], 'max_depth': [10, None]}
        grid_search = GridSearchCV(rf, param_grid, cv=3, scoring='neg_mean_absolute_error', n_jobs=-1)
        grid_search.fit(train[feature_cols], train['OBS_VALUE'])
        model = grid_search.best_estimator_
        logger.info(f"Best RF params: {grid_search.best_params_}")
        test_pred = model.predict(test[feature_cols])

        # Continue both features from the last observed row so the future rows
        # describe periods after the test window.
        last_seen = test if len(test) else train
        future_dates = pd.date_range(start=last_seen.index[-1], periods=steps + 1, freq='Y')[1:]
        next_idx = int(last_seen['time_idx'].iloc[-1]) + 1
        future_df = pd.DataFrame({
            'year': future_dates.year,
            'time_idx': np.arange(next_idx, next_idx + steps),
        })
        future_pred = model.predict(future_df)
        return test_pred, future_pred
    except Exception as e:
        logger.error(f"Random Forest forecast failed: {str(e)}")
        return None, None

def sarima_forecast(train: pd.Series, test: pd.Series, steps: int) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """Forecast the test window and the next ``steps`` periods with SARIMAX.

    ``auto_arima`` selects the (seasonal) order on ``train``; a statsmodels
    ``SARIMAX`` model with that order is then fitted and asked for one
    continuous forecast of ``len(test) + steps`` periods, which is split into
    the test part and the part that follows it.

    Args:
        train: Training observations.
        test: Hold-out observations that follow ``train``; only the length is used.
        steps: Number of periods to forecast beyond the end of ``test``.

    Returns:
        ``(test_pred, future_pred)`` as float arrays of length ``len(test)`` and
        ``steps``, or ``(None, None)`` if any step fails.
    """
    try:
        # m=1 for annual data
        search = auto_arima(train, seasonal=True, m=1, trace=False)
        sarima_model = SARIMAX(train, order=search.order, seasonal_order=search.seasonal_order)
        model_fit = sarima_model.fit(disp=False)
        n_test = len(test)
        # Plain arrays avoid index-based alignment in downstream arithmetic.
        horizon = np.asarray(model_fit.forecast(steps=n_test + steps), dtype=float)
        return horizon[:n_test], horizon[n_test:]
    except Exception as e:
        logger.error(f"SARIMA forecast failed: {str(e)}")
        return None, None

def exp_smoothing_forecast(train: pd.Series, test: pd.Series, steps: int) -> Tuple[Optional[np.ndarray], Optional[np.ndarray]]:
    """Forecast with additive-trend exponential smoothing (Holt's linear method).

    No seasonal component is used: a seasonal model with ``seasonal_periods=1``
    is rejected by statsmodels (the period must be larger than 1), which made
    every call fail and return ``(None, None)``. One continuous forecast of
    ``len(test) + steps`` periods is produced and split so the future part
    starts after the test window.

    Args:
        train: Training observations.
        test: Hold-out observations that follow ``train``; only the length is used.
        steps: Number of periods to forecast beyond the end of ``test``.

    Returns:
        ``(test_pred, future_pred)`` as float arrays of length ``len(test)`` and
        ``steps``, or ``(None, None)`` if fitting or forecasting fails.
    """
    try:
        model = ExponentialSmoothing(train, trend='add', seasonal=None)
        model_fit = model.fit()
        n_test = len(test)
        # Plain arrays avoid index-based alignment in downstream arithmetic.
        horizon = np.asarray(model_fit.forecast(n_test + steps), dtype=float)
        return horizon[:n_test], horizon[n_test:]
    except Exception as e:
        logger.error(f"Exponential Smoothing forecast failed: {str(e)}")
        return None, None

def ensemble_forecast(predictions: Dict[str, np.ndarray], test: pd.Series) -> Tuple[Optional[np.ndarray], Optional[Dict[str, float]]]:
    """Blend model predictions with inverse-MAE weights.

    Each usable prediction gets the raw weight ``1 / (MAE + 1e-6)``; the weights
    are then normalised to sum to 1 and applied position by position. A
    prediction is skipped (not fatal for the ensemble) when it is ``None``, has
    a different length than ``test``, cannot be converted to floats, or yields
    a non-finite MAE (for example because it contains NaN).

    NOTE(review): the weights are derived from the same ``test`` values the
    caller later scores the ensemble on — confirm this is intended.

    Args:
        predictions: Mapping of model name to its prediction for ``test``.
        test: Actual values for the evaluated window.

    Returns:
        ``(ensemble, weights)`` where ``ensemble`` is a float array of length
        ``len(test)`` and ``weights`` maps each used model to its normalised
        weight, or ``(None, None)`` when no prediction is usable.
    """
    actual = np.asarray(test, dtype=float)
    usable: Dict[str, np.ndarray] = {}
    raw_weights: Dict[str, float] = {}
    for name, pred in predictions.items():
        if pred is None or len(pred) != len(actual):
            continue
        try:
            # Work on plain arrays so a Series with a foreign index is combined
            # by position rather than by label.
            values = np.asarray(pred, dtype=float)
        except (TypeError, ValueError) as e:
            logger.error(f"Error computing MAE for {name}: {str(e)}")
            continue
        mae = float(np.mean(np.abs(actual - values))) if values.size else float('nan')
        if not np.isfinite(mae):
            logger.error(f"Error computing MAE for {name}: MAE is not finite")
            continue
        usable[name] = values
        raw_weights[name] = 1 / (mae + 1e-6)
        logger.info(f"{name} MAE: {mae:.3f}, Weight: {raw_weights[name]:.3f}")

    if not usable:
        logger.warning("No valid predictions for ensemble")
        return None, None

    total_weight = sum(raw_weights.values())
    weights = {name: w / total_weight for name, w in raw_weights.items()}
    ensemble = np.zeros(len(actual))
    for name, values in usable.items():
        ensemble += weights[name] * values
    return ensemble, weights

def _nan_metrics(model_names: list) -> Dict[str, Dict[str, float]]:
    """Build a NaN placeholder metrics entry for each name in ``model_names``."""
    return {name: {'mae': float('nan'), 'mae_std': float('nan')} for name in model_names}


def _blend_future(future_predictions: Dict[str, Optional[np.ndarray]],
                  weights: Optional[Dict[str, float]],
                  steps: int) -> Optional[np.ndarray]:
    """Combine per-model future forecasts using ensemble weights renormalised to sum to 1."""
    if not weights:
        return None
    usable = {}
    for name, pred in future_predictions.items():
        if pred is None or len(pred) != steps or name not in weights:
            continue
        values = np.asarray(pred, dtype=float)
        if np.isfinite(values).all():
            usable[name] = values
    # Renormalise over the models that actually contribute, otherwise a model
    # without a usable future forecast would shrink the blended values.
    total = sum(weights[name] for name in usable)
    if total <= 0:
        return None
    blended = np.zeros(steps)
    for name, values in usable.items():
        blended += (weights[name] / total) * values
    return blended


def _write_and_show(fig, path: str) -> None:
    """Save a Plotly figure as HTML at ``path`` and display it."""
    fig.write_html(path)
    fig.show()


def forecast_fdi(df: pd.DataFrame, country: str = 'Germany', steps: int = 3) -> Dict[str, Dict[str, float]]:
    """Cross-validate several forecasters for one country and plot the results.

    The country's series is prepared with ``prepare_data`` and evaluated with a
    ``TimeSeriesSplit``. In every fold each model forecasts the test window and
    ``steps`` further periods; an inverse-MAE ensemble is built from the valid
    predictions. Five HTML figures are written to ``plots/`` and shown
    (forecast, error distribution, last-fold errors, last-fold actual vs
    predicted, MAE per fold). The plotted future forecast comes from the models
    of the last fold.

    Args:
        df: Input data passed to ``prepare_data``.
        country: Value of the ``geo`` column to analyse.
        steps: Number of future periods to forecast.

    Returns:
        Mapping of model name (``ARIMA``, ``Prophet``, ``RandomForest``,
        ``SARIMA``, ``ExpSmoothing``, ``Ensemble``) to ``{'mae', 'mae_std'}``
        averaged over folds. Entries are NaN for models that never produced a
        valid prediction and for all models when there are fewer than 10 rows.
    """
    logger.info(f"Time series analysis for {country}")
    models = {
        'ARIMA': arima_forecast,
        'Prophet': prophet_forecast,
        'RandomForest': rf_forecast,
        'SARIMA': sarima_forecast,
        'ExpSmoothing': exp_smoothing_forecast
    }
    # These models take the bare OBS_VALUE series; the others need the full frame.
    series_models = ('ARIMA', 'SARIMA', 'ExpSmoothing')
    report_names = list(models) + ['Ensemble']

    if df.empty:
        logger.warning("Empty DataFrame provided for forecasting")
        return _nan_metrics(report_names)

    data = prepare_data(df, country)
    if data.empty or len(data) < 10:
        logger.warning(f"Insufficient data for {country} ({len(data)} rows)")
        return _nan_metrics(report_names)

    tscv = TimeSeriesSplit(n_splits=min(5, len(data) - 1))
    metrics: Dict[str, list] = {}
    errors: Dict[str, list] = {}
    for train_idx, test_idx in tscv.split(data):
        train, test = data.iloc[train_idx], data.iloc[test_idx]
        actual = test['OBS_VALUE'].to_numpy(dtype=float)
        predictions, future_predictions = {}, {}
        for name, func in models.items():
            if name in series_models:
                test_pred, future_pred = func(train['OBS_VALUE'], test['OBS_VALUE'], steps)
            else:
                test_pred, future_pred = func(train, test, steps)
            predictions[name] = test_pred
            future_predictions[name] = future_pred

        ensemble_test, weights = ensemble_forecast(predictions, test['OBS_VALUE'])
        ensemble_future = _blend_future(future_predictions, weights, steps)

        # Keep only predictions that can be scored: right length and finite.
        # A NaN prediction would otherwise make mean_absolute_error raise and
        # abort the whole analysis.
        scored = {}
        for name, pred in predictions.items():
            if pred is None or len(pred) != len(actual):
                continue
            values = np.asarray(pred, dtype=float)
            if np.isfinite(values).all():
                scored[name] = values
        if ensemble_test is not None:
            scored['Ensemble'] = np.asarray(ensemble_test, dtype=float)

        for name, values in scored.items():
            metrics.setdefault(name, []).append(mean_absolute_error(actual, values))
            # Positional difference on arrays; a Series would align by index.
            errors.setdefault(name, []).append(actual - values)

    # Start from NaN for every model so the result has the same keys as the
    # early-return paths even when a model failed in every fold.
    avg_metrics = _nan_metrics(report_names)
    for name, maes in metrics.items():
        avg_metrics[name] = {'mae': float(np.mean(maes)), 'mae_std': float(np.std(maes))}

    os.makedirs('plots', exist_ok=True)

    # Forecast plot: history plus the ensemble forecast of the last fold
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=data.index, y=data['OBS_VALUE'], mode='lines+markers', name='Historical'))
    future_dates = pd.date_range(start=data.index[-1], periods=steps + 1, freq='Y')[1:]
    if ensemble_future is not None:
        fig.add_trace(go.Scatter(x=future_dates, y=ensemble_future, mode='lines+markers', name='Ensemble Forecast'))
    fig.update_layout(title=f'FDI Forecast for {country}', xaxis_title='Date', yaxis_title='FDI Value')
    _write_and_show(fig, f'plots/forecast_{country}.html')

    # Error distribution over all folds
    error_fig = go.Figure()
    for name, error_list in errors.items():
        error_fig.add_trace(go.Box(y=np.concatenate(error_list), name=name))
    error_fig.update_layout(title=f'Forecast Error Distribution for {country}', yaxis_title='Error')
    _write_and_show(error_fig, f'plots/forecast_errors_{country}.html')

    # Errors over time for the last fold (test / actual / scored still hold
    # the values of the final loop iteration)
    time_error_fig = go.Figure()
    for name, values in scored.items():
        time_error_fig.add_trace(go.Scatter(x=test.index, y=actual - values, mode='lines+markers', name=f'{name} Error'))
    time_error_fig.update_layout(title=f'Forecast Errors Over Time for {country} (Last Fold)', xaxis_title='Date', yaxis_title='Error')
    _write_and_show(time_error_fig, f'plots/time_series_errors_{country}.html')

    # Actual vs predicted values for the last fold
    pred_fig = go.Figure()
    pred_fig.add_trace(go.Scatter(x=test.index, y=test['OBS_VALUE'], mode='lines+markers', name='Actual'))
    for name, values in scored.items():
        pred_fig.add_trace(go.Scatter(x=test.index, y=values, mode='lines+markers', name=f'{name} Predicted'))
    pred_fig.update_layout(title=f'Actual vs Predicted FDI for {country} (Last Fold)', xaxis_title='Date', yaxis_title='FDI Value')
    _write_and_show(pred_fig, f'plots/actual_vs_predicted_{country}.html')

    # MAE per fold
    metrics_fig = go.Figure()
    for name, mae_list in metrics.items():
        metrics_fig.add_trace(go.Box(y=mae_list, name=name))
    metrics_fig.update_layout(title=f'MAE Across Folds for {country}', yaxis_title='MAE')
    _write_and_show(metrics_fig, f'plots/mae_across_folds_{country}.html')

    logger.info("Forecast plot, errors, and metrics saved and displayed")

    return avg_metrics

# Script entry point: load the data, preprocess it and print the forecast
# metrics for Germany.
if __name__ == "__main__":
    # Make sure the output folder for the HTML figures exists.
    os.makedirs('plots', exist_ok=True)
    # Project-local helpers are imported here rather than at the top of the cell.
    from data_loading import load_data
    from data_preprocessing import preprocess_data
    df = load_data()
    # The second argument is an empty options dict.
    # NOTE(review): presumably this selects preprocess_data's defaults — confirm.
    df_clean = preprocess_data(df, {})
    metrics = forecast_fdi(df_clean, 'Germany')
    print("\nForecast Metrics:", metrics)

In [None]:
import pandas as pd
import numpy as np
import plotly.express as px
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.svm import OneClassSVM
from sklearn.cluster import DBSCAN
import logging
import os
import pickle
from typing import Dict, Any

# Send INFO-and-above records to 'anomaly_detection.log' and to the console.
# NOTE(review): logging.basicConfig() has no effect when the root logger already
# has handlers, so if another cell/module configured logging earlier this call
# may be ignored — confirm the intended log destination.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('anomaly_detection.log'), logging.StreamHandler()]
)
# Module-level logger used by the anomaly-detection code below.
logger = logging.getLogger(__name__)

class AnomalyDetector:
    """Flag anomalous ``OBS_VALUE`` observations with several detectors.

    Four detectors (IsolationForest, LOF, OneClassSVM, DBSCAN) are fitted on the
    non-missing ``OBS_VALUE`` column. Each writes a ``<name>_anomaly`` column
    (1 = anomaly, 0 = normal, NaN = value missing) and a majority vote is stored
    in ``ensemble_anomaly``. Fitted models are pickled under ``results/models``.
    """

    def __init__(self, file_path: str = 'estat_tec00107_filtered_en.csv'):
        """Store the data path and create the ``results`` output folders."""
        self.file_path = file_path
        self.df: pd.DataFrame = None  # stays None until load_data() succeeds
        self.results: Dict[str, Any] = {}
        # Creating the sub-folders also creates 'results' itself.
        for folder in ('results', 'results/models', 'results/plots'):
            os.makedirs(folder, exist_ok=True)

    def load_data(self) -> bool:
        """Load the data from ``self.file_path`` and sort it by ``TIME_PERIOD``.

        Returns:
            ``True`` when a non-empty frame was loaded, ``False`` otherwise.
        """
        from data_loading import load_data
        logger.info("Loading data...")
        self.df = load_data(self.file_path)
        if self.df is None or self.df.empty:
            logger.warning("No data loaded or empty DataFrame")
            return False
        self.df = self.df.sort_values('TIME_PERIOD')
        logger.info(f"Loaded {len(self.df)} records")
        return True

    @staticmethod
    def _majority_vote(flags: pd.DataFrame) -> pd.Series:
        """Return the row-wise majority of 0/1 flags.

        A row is 1.0 only when strictly more detectors voted 1 than 0, so a tie
        counts as "not an anomaly". Rows without any vote are NaN.
        """
        ones = flags.eq(1).sum(axis=1)
        zeros = flags.eq(0).sum(axis=1)
        vote = (ones > zeros).astype(float)
        vote[(ones + zeros) == 0] = np.nan
        return vote

    def detect_anomalies(self) -> bool:
        """Run all detectors and the ensemble vote, then save the models.

        Data is loaded only when it has not been loaded yet, so calling
        ``load_data()`` beforehand does not read the file twice.

        Returns:
            ``True`` on success, ``False`` when no usable data is available.
        """
        if (self.df is None or self.df.empty) and not self.load_data():
            return False
        logger.info("Detecting anomalies...")
        has_value = self.df['OBS_VALUE'].notna()
        if not has_value.any():
            logger.warning("No valid OBS_VALUE data for anomaly detection")
            return False
        X = self.df.loc[has_value, ['OBS_VALUE']].values

        methods = {
            'IsolationForest': IsolationForest(contamination=0.05, random_state=42),
            'LOF': LocalOutlierFactor(contamination=0.05, novelty=False),
            'OneClassSVM': OneClassSVM(nu=0.05),
            'DBSCAN': DBSCAN(eps=0.5, min_samples=5)
        }

        for name, model in methods.items():
            # Every detector marks outliers / noise points with the label -1.
            labels = model.fit_predict(X)
            column = f'{name}_anomaly'
            self.df[column] = np.nan
            self.df.loc[has_value, column] = np.where(labels == -1, 1, 0)
            self.results[name] = {'model': model, 'anomaly_col': column}
            logger.info(f"{name} anomalies detected")

        # Ensemble voting
        anomaly_cols = [f'{name}_anomaly' for name in methods]
        self.df['ensemble_anomaly'] = self._majority_vote(self.df[anomaly_cols])
        logger.info("Ensemble anomaly detection completed")

        self._save_models()
        return True

    def _save_models(self) -> None:
        """Pickle every fitted detector to ``results/models/<name>_model.pkl``."""
        for name, result in self.results.items():
            model_path = f'results/models/{name}_model.pkl'
            with open(model_path, 'wb') as f:
                pickle.dump(result['model'], f)
        logger.info("Models saved")

    def visualize_results(self) -> None:
        """Plot detected anomalies and export the labelled data.

        Writes ``results/plots/anomalies_detection.html`` and
        ``results/anomalies_marked.csv``. Does nothing (besides a warning) when
        no detector results are available.
        """
        if not self.results:
            logger.warning("No results to visualize")
            return
        fig = px.scatter(self.df, x='TIME_PERIOD', y='OBS_VALUE', title='Detected Anomalies in FDI Data')
        for name, result in self.results.items():
            anomalies = self.df[self.df[result['anomaly_col']] == 1]
            fig.add_scatter(x=anomalies['TIME_PERIOD'], y=anomalies['OBS_VALUE'], mode='markers', name=f'{name} Anomalies')
        ensemble_anomalies = self.df[self.df['ensemble_anomaly'] == 1]
        fig.add_scatter(x=ensemble_anomalies['TIME_PERIOD'], y=ensemble_anomalies['OBS_VALUE'], mode='markers', name='Ensemble Anomalies')
        fig.write_html('results/plots/anomalies_detection.html')
        fig.show()
        logger.info("Anomaly plot saved and displayed")
        self.df.to_csv('results/anomalies_marked.csv', index=False)
        logger.info("Data with anomaly labels saved")

def main():
    """Run the anomaly detection pipeline: detect, then visualize on success."""
    detector = AnomalyDetector()
    # detect_anomalies() loads the data itself, so a separate load_data() call
    # here would only read the file a second time.
    if detector.detect_anomalies():
        detector.visualize_results()

# Run the anomaly detection pipeline when this code is executed as a script.
if __name__ == "__main__":
    main()

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import cross_validate, KFold, GridSearchCV
from sklearn.inspection import permutation_importance
import plotly.express as px
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_absolute_error, r2_score, make_scorer
import logging

# Send INFO-and-above records to 'feature_importance_analysis.log' and to the console.
# NOTE(review): logging.basicConfig() has no effect when the root logger already
# has handlers, so if another cell/module configured logging earlier this call
# may be ignored — confirm the intended log destination.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('feature_importance_analysis.log'), logging.StreamHandler()]
)
# Module-level logger used by the feature-importance code below.
logger = logging.getLogger(__name__)

class FeatureImportanceAnalyzer:
    """Estimate feature importance for ``OBS_VALUE`` with a Random Forest pipeline.

    The pipeline median-imputes the numeric features, one-hot encodes the
    categorical ones and fits a ``RandomForestRegressor``. Importances are
    reported both from the forest itself (one-hot columns summed back to their
    source feature) and from permutation importance on the original columns.
    """

    # Input columns used by the pipeline; shared by build_pipeline() and
    # compute_feature_importance() so the two cannot drift apart.
    NUMERIC_FEATURES = ['year', 'rolling_mean', 'yearly_change']
    CATEGORICAL_FEATURES = ['geo']

    def __init__(self, n_splits: int = 5, random_state: int = 42):
        """Store the CV fold count and random seed; results start as ``None``."""
        self.n_splits = n_splits
        self.random_state = random_state
        self.best_params_ = None
        self.importance_df_ = None
        self.cv_results_ = None

    def load_and_preprocess(self, filepath: str) -> pd.DataFrame:
        """Load ``filepath`` and run ``preprocess_data`` on it.

        Returns:
            The preprocessed frame, or an empty frame when nothing was loaded.
        """
        from data_loading import load_data
        from data_preprocessing import preprocess_data
        logger.info(f"Loading data from {filepath}...")
        df = load_data(filepath)
        if df is None or df.empty:
            logger.warning("Empty DataFrame after loading")
            # Callers test `.empty`, so never hand back None.
            return pd.DataFrame() if df is None else df
        df = preprocess_data(df, {})
        logger.info(f"Preprocessed data: {len(df)} rows")
        return df

    def build_pipeline(self) -> Pipeline:
        """Build the (unfitted) preprocessing + Random Forest pipeline."""
        preprocessor = ColumnTransformer(
            transformers=[
                ('num', SimpleImputer(strategy='median'), list(self.NUMERIC_FEATURES)),
                ('cat', OneHotEncoder(handle_unknown='ignore', sparse_output=False), list(self.CATEGORICAL_FEATURES))
            ]
        )
        return Pipeline([
            ('preprocessor', preprocessor),
            ('model', RandomForestRegressor(random_state=self.random_state))
        ])

    def optimize_model(self, pipeline: Pipeline, X: pd.DataFrame, y: pd.Series) -> Pipeline:
        """Grid-search the forest hyper-parameters.

        Returns:
            The best fitted pipeline. With empty input the pipeline is returned
            unchanged; with fewer rows than folds it is fitted once without a
            search.
        """
        if X.empty or y.empty:
            logger.warning("Empty data provided for model optimization")
            return pipeline
        if len(X) < self.n_splits:
            logger.warning(f"Insufficient data ({len(X)} rows) for {self.n_splits}-fold CV. Using single fit.")
            pipeline.fit(X, y)
            return pipeline
        param_grid = {
            'model__n_estimators': [50, 100, 200],
            'model__max_depth': [None, 10, 20],
            'model__min_samples_split': [2, 5]
        }
        grid_search = GridSearchCV(pipeline, param_grid, cv=self.n_splits, scoring='neg_mean_absolute_error', n_jobs=-1)
        grid_search.fit(X, y)
        self.best_params_ = grid_search.best_params_
        logger.info(f"Best parameters: {self.best_params_}")
        return grid_search.best_estimator_

    def evaluate_model(self, model: Pipeline, X: pd.DataFrame, y: pd.Series) -> Dict[str, Any]:
        """Cross-validate ``model`` with shuffled K-fold.

        Returns:
            The ``cross_validate`` result dict. ``test_MAE`` holds negated MAE
            values (the scorer is built with ``greater_is_better=False``). With
            fewer than two rows a dict with NaN arrays is returned instead.
        """
        if X.empty or y.empty or len(X) < 2:
            logger.warning("Empty data provided for model evaluation")
            # Arrays (not lists) so callers can use .mean() and unary minus.
            return {'test_MAE': np.array([float('nan')]), 'test_R2': np.array([float('nan')])}
        kfold = KFold(n_splits=min(self.n_splits, len(X)), shuffle=True, random_state=self.random_state)
        scoring = {'MAE': make_scorer(mean_absolute_error, greater_is_better=False), 'R2': make_scorer(r2_score)}
        cv_results = cross_validate(model, X, y, cv=kfold, scoring=scoring, n_jobs=-1, return_train_score=True)
        self.cv_results_ = cv_results
        return cv_results

    def compute_feature_importance(self, model: Pipeline, X: pd.DataFrame, y: pd.Series) -> pd.DataFrame:
        """Compute forest and permutation importance per original feature.

        Args:
            model: Fitted pipeline produced by ``build_pipeline``.
            X: Frame containing at least the pipeline's input columns.
            y: Target values aligned with ``X``.

        Returns:
            Frame with columns ``feature``, ``rf_importance``,
            ``permutation_importance`` and ``permutation_std``, sorted by
            ``rf_importance`` descending (empty frame for empty input).

        Raises:
            ValueError: If permutation importance returns a different number of
                values than there are input features.
        """
        result_columns = ['feature', 'rf_importance', 'permutation_importance', 'permutation_std']
        if X.empty or y.empty:
            logger.warning("Empty data provided for feature importance")
            return pd.DataFrame(columns=result_columns)

        # Restrict the input to the columns the pipeline uses, so permutation
        # importance is computed for exactly these features.
        required_features = list(self.NUMERIC_FEATURES) + list(self.CATEGORICAL_FEATURES)
        X_filtered = X[required_features]
        logger.info(f"Using features for analysis: {required_features}")

        feature_names = model.named_steps['preprocessor'].get_feature_names_out()
        logger.info(f"Number of transformed features: {len(feature_names)}")

        # Importances reported by the Random Forest (one per transformed column)
        importances = model.named_steps['model'].feature_importances_
        logger.info(f"Length of RF importances: {len(importances)}")

        # Sum the importance of one-hot encoded columns back onto their source feature
        rf_importance_dict = dict.fromkeys(required_features, 0)
        for name, importance in zip(feature_names, importances):
            if name.startswith('num__'):
                rf_importance_dict[name.replace('num__', '')] = importance
            elif name.startswith('cat__'):
                encoded = name[len('cat__'):]
                for source in self.CATEGORICAL_FEATURES:
                    if encoded.startswith(f'{source}_'):
                        rf_importance_dict[source] += importance
                        break
        logger.info(f"Aggregated RF importance: {rf_importance_dict}")

        # Permutation importance on the filtered original columns
        result = permutation_importance(model, X_filtered, y, n_repeats=20, random_state=self.random_state, n_jobs=-1)
        logger.info(f"Length of permutation importances: {len(result.importances_mean)}")

        # Length check before building the frame
        if len(required_features) != len(result.importances_mean):
            logger.error(f"Length mismatch: original_features={len(required_features)}, permutation={len(result.importances_mean)}")
            raise ValueError("Mismatch between original features and permutation importance length")

        self.importance_df_ = pd.DataFrame({
            'feature': required_features,
            'rf_importance': [rf_importance_dict[f] for f in required_features],
            'permutation_importance': result.importances_mean,
            'permutation_std': result.importances_std
        }).sort_values('rf_importance', ascending=False)

        return self.importance_df_

    @staticmethod
    def _save_and_show(fig, filename: str) -> None:
        """Write a Plotly figure to ``filename`` as HTML and display it."""
        fig.write_html(filename)
        fig.show()

    def visualize_results(self, df: pd.DataFrame) -> None:
        """Write and show the importance, distribution and CV-metric figures.

        HTML files are written to the current working directory. Nothing is
        plotted when no importance table has been computed yet.
        """
        if self.importance_df_ is None or self.importance_df_.empty:
            logger.warning("No feature importance data to visualize")
            return

        # Feature Importances (Random Forest)
        fig = px.bar(self.importance_df_, x='feature', y='rf_importance',
                     title='Feature Importances (Random Forest)')
        self._save_and_show(fig, 'feature_importance_rf.html')

        # Feature Importance Comparison with error bars
        fig = px.scatter(self.importance_df_, x='rf_importance', y='permutation_importance',
                         text='feature', title='Feature Importance Comparison',
                         error_y='permutation_std', hover_data=['feature'])
        self._save_and_show(fig, 'feature_importance_comparison.html')

        # Distribution of FDI Intensity
        fig = px.histogram(df, x='OBS_VALUE', nbins=30, title='Distribution of FDI Intensity')
        self._save_and_show(fig, 'fdi_distribution.html')

        # Top 10 Countries by Average FDI Intensity
        top_countries = df.groupby('geo')['OBS_VALUE'].mean().nlargest(10).reset_index()
        fig = px.bar(top_countries, x='geo', y='OBS_VALUE', title='Top 10 Countries by Average FDI Intensity')
        self._save_and_show(fig, 'top_countries.html')

        # Model Performance Metrics
        if self.cv_results_:
            metrics_df = pd.DataFrame({
                # Stored MAE scores are negated, so flip the sign back.
                'MAE': -self.cv_results_['test_MAE'],
                'R2': self.cv_results_['test_R2']
            })
            fig = px.box(metrics_df, title='Model Performance Metrics Across Folds')
            self._save_and_show(fig, 'model_metrics.html')

        logger.info("Visualizations saved and displayed")

# Script entry point: run the full feature-importance workflow on the default
# CSV file and print the importance table and the mean CV metrics.
if __name__ == "__main__":
    analyzer = FeatureImportanceAnalyzer()
    df = analyzer.load_and_preprocess('estat_tec00107_filtered_en.csv')
    if not df.empty:
        pipeline = analyzer.build_pipeline()
        # The whole frame is passed as X.
        # NOTE(review): presumably the pipeline's ColumnTransformer drops every
        # column it was not configured with (including OBS_VALUE) — confirm.
        pipeline = analyzer.optimize_model(pipeline, df, df['OBS_VALUE'])
        cv_results = analyzer.evaluate_model(pipeline, df, df['OBS_VALUE'])
        importance_df = analyzer.compute_feature_importance(pipeline, df, df['OBS_VALUE'])
        analyzer.visualize_results(df)
        print("\nFeature Importance:")
        print(importance_df)
        print("\nModel Metrics:")
        # test_MAE comes from a scorer built with greater_is_better=False, so
        # the stored values are negated; flip the sign for display.
        print(f"Test MAE: {-cv_results['test_MAE'].mean():.3f}")
        print(f"Test R2: {cv_results['test_R2'].mean():.3f}")
    else:
        logger.warning("No data for feature importance analysis")

In [None]:
import pandas as pd
import plotly.express as px
import logging
from data_loading import load_data

# Send INFO-and-above records to 'investment_trends.log' and to the console.
# NOTE(review): logging.basicConfig() has no effect when the root logger already
# has handlers, so if another cell/module configured logging earlier this call
# may be ignored — confirm the intended log destination.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('investment_trends.log'), logging.StreamHandler()]
)
# Module-level logger used by plot_investment_trends().
logger = logging.getLogger(__name__)

def plot_investment_trends():
    """Plot ``OBS_VALUE`` over time for five countries from 2014 onwards.

    Loads ``estat_tec00107_filtered_en.csv``, keeps Germany, Sweden, Italy,
    Spain and France for years >= 2014, writes the line chart to
    ``investment_trends.html`` and shows it. Returns ``None``; when no data is
    loaded or nothing matches the filter only a warning is logged.
    """
    logger.info("Starting investment trends analysis")
    df = load_data('estat_tec00107_filtered_en.csv')
    if df is None or df.empty:
        logger.warning("No data loaded")
        return

    # Filter by country and by year (2014 onwards)
    countries = ['Germany', 'Sweden', 'Italy', 'Spain', 'France']
    df['TIME_PERIOD'] = pd.to_datetime(df['TIME_PERIOD'])
    selected = df[df['geo'].isin(countries) & (df['TIME_PERIOD'].dt.year >= 2014)]
    if selected.empty:
        logger.warning("No data for the selected countries and period")
        return
    # A line chart joins points in row order, so order the rows by time in case
    # the loaded data is not already chronological.
    selected = selected.sort_values('TIME_PERIOD', kind='stable')

    # Build the chart
    fig = px.line(selected, x='TIME_PERIOD', y='OBS_VALUE', color='geo',
                  title='Объемы инвестиций для Германии, Швеции, Италии, Испании, Франции (2014–2023)',
                  labels={'TIME_PERIOD': 'Год', 'OBS_VALUE': 'Значение инвестиций', 'geo': 'Страна'})
    fig.write_html('investment_trends.html')
    fig.show()
    logger.info("Investment trends plot saved and displayed")

# Produce the investment trends chart when this code is executed as a script.
if __name__ == "__main__":
    plot_investment_trends()

In [None]:
import pandas as pd
import plotly.express as px
import logging
from data_loading import load_data

# Send INFO-and-above records to 'europe_vs_special.log' and to the console.
# NOTE(review): logging.basicConfig() has no effect when the root logger already
# has handlers, so if another cell/module configured logging earlier this call
# may be ignored — confirm the intended log destination.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[logging.FileHandler('europe_vs_special.log'), logging.StreamHandler()]
)
# Module-level logger used by compare_investments().
logger = logging.getLogger(__name__)

def compare_investments():
    """Compare the mean ``OBS_VALUE`` of the EU-27 aggregate with three countries.

    Loads ``estat_tec00107_filtered_en.csv``, keeps years >= 2014, averages
    ``OBS_VALUE`` for the EU-27 aggregate row and separately for Cyprus,
    Luxembourg and Malta, writes the bar chart to ``europe_vs_special.html`` and
    shows it. Returns ``None``; when no data is loaded or neither group has
    rows only a warning is logged.
    """
    logger.info("Starting Europe vs Special countries comparison")
    df = load_data('estat_tec00107_filtered_en.csv')
    if df is None or df.empty:
        logger.warning("No data loaded")
        return

    # Keep the years from 2014 onwards
    df['TIME_PERIOD'] = pd.to_datetime(df['TIME_PERIOD'])
    recent = df[df['TIME_PERIOD'].dt.year >= 2014]

    # Country groups
    europe = recent[recent['geo'] == 'European Union - 27 countries (from 2020)']
    special = recent[recent['geo'].isin(['Cyprus', 'Luxembourg', 'Malta'])]
    if europe.empty and special.empty:
        logger.warning("No data for the EU-27 aggregate or the selected countries")
        return
    if europe.empty:
        # The mean of an empty selection is NaN, which renders as a missing bar.
        logger.warning("No EU-27 aggregate rows found; its bar will be empty")

    # Mean values
    europe_mean = europe['OBS_VALUE'].mean()
    special_mean = special.groupby('geo')['OBS_VALUE'].mean()
    data = pd.DataFrame({
        'Группа': ['Европа (EU-27)'] + special_mean.index.tolist(),
        'Средние инвестиции': [europe_mean] + special_mean.values.tolist()
    })

    # Chart
    fig = px.bar(data, x='Группа', y='Средние инвестиции',
                 title='Сравнение средних инвестиций: Европа vs Кипр, Люксембург, Мальта (2014–2023)',
                 labels={'Средние инвестиции': 'Среднее значение инвестиций'})
    fig.write_html('europe_vs_special.html')
    fig.show()
    logger.info("Comparison plot saved and displayed")

# Produce the comparison chart when this code is executed as a script.
if __name__ == "__main__":
    compare_investments()