EDA для текстовых данных

In [None]:
# notebooks/01_eda_large_dataset.ipynb
"""
EDA для больших датасетов - пошаговый план
"""

import pandas as pd
import numpy as np
import dask.dataframe as dd
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm

class LargeScaleEDA:
    """Exploratory data analysis for large datasets using samples and Dask.

    The dataset is read from ``data_path``; a path ending in ``.parquet`` is
    read as Parquet, anything else is read as CSV. Sampling helpers return
    pandas DataFrames of at most ``sample_size`` rows (the stratified sampler
    takes up to ``sample_size // 2`` rows per target class), and the
    ``step*`` methods print summaries of such a sample.
    """

    # Rows parsed per chunk when a CSV file is streamed instead of loaded whole.
    _CSV_CHUNK_ROWS = 100_000

    def __init__(self, data_path: str, sample_size: int = 100000):
        """Store the dataset location and the desired sample size.

        Args:
            data_path: Path to a ``.parquet`` file or a CSV file.
            sample_size: Upper bound on the number of rows per sample.

        Raises:
            ValueError: If ``sample_size`` is not positive.
        """
        if sample_size <= 0:
            raise ValueError(
                f"sample_size must be a positive integer, got {sample_size!r}"
            )
        # str() lets callers pass a pathlib.Path as well as a plain string.
        self.data_path = str(data_path)
        self.sample_size = sample_size

    def _is_parquet(self) -> bool:
        """Return True when ``data_path`` has the ``.parquet`` suffix."""
        return self.data_path.endswith('.parquet')

    def _count_csv_rows(self) -> int:
        """Count the data rows (header excluded) of the CSV file.

        The file is parsed by pandas in chunks rather than counted line by
        line, so a quoted field containing a newline is counted as one row.
        """
        reader = pd.read_csv(
            self.data_path, usecols=[0], chunksize=self._CSV_CHUNK_ROWS
        )
        try:
            return sum(len(chunk) for chunk in reader)
        finally:
            reader.close()

    def _read_csv_rows(self, positions) -> pd.DataFrame:
        """Read the CSV data rows at the given 0-based positions.

        Args:
            positions: Iterable of non-negative data-row positions (row 0 is
                the first row after the header). Duplicates are collapsed.

        Returns:
            The selected rows in file order; the index holds the original
            row positions. An empty frame with the file's columns is
            returned when nothing is selected.
        """
        wanted = np.unique(np.asarray(positions, dtype=np.int64))
        if wanted.size == 0:
            return pd.read_csv(self.data_path, nrows=0)

        parts = []
        offset = 0
        reader = pd.read_csv(self.data_path, chunksize=self._CSV_CHUNK_ROWS)
        try:
            for chunk in reader:
                stop = offset + len(chunk)
                # wanted is sorted, so the rows that fall inside this chunk
                # form one contiguous slice of it.
                lo, hi = np.searchsorted(wanted, [offset, stop])
                if hi > lo:
                    parts.append(chunk.iloc[wanted[lo:hi] - offset])
                offset = stop
                if hi >= wanted.size:
                    break  # every requested row has been collected
        finally:
            reader.close()

        if not parts:
            return pd.read_csv(self.data_path, nrows=0)
        return pd.concat(parts)

    def step1_basic_info(self):
        """Step 1: print basic information about the dataset.

        Returns:
            The Dask DataFrame opened on ``data_path``.
        """
        print("="*50)
        print("STEP 1: BASIC DATASET INFO")
        print("="*50)

        # Dask is used to obtain the dataset-level metadata
        if self._is_parquet():
            df = dd.read_parquet(self.data_path)
        else:
            df = dd.read_csv(self.data_path, blocksize='64MB')

        print(f"Total rows: {len(df):,}")
        print(f"Total columns: {len(df.columns)}")
        print(f"Estimated memory: {df.memory_usage(deep=True).sum().compute() / 1e9:.2f} GB")
        print(f"\nColumns: {df.columns.tolist()}")
        print(f"\nDtypes:\n{df.dtypes}")

        return df

    def step2_sampling_strategy(self):
        """Step 2: build one sample per sampling strategy.

        Returns:
            Dict mapping the strategy name (``'random'``, ``'stratified'``,
            ``'systematic'``, ``'cluster'``) to its sample DataFrame.
        """
        print("\n" + "="*50)
        print("STEP 2: SAMPLING STRATEGIES")
        print("="*50)

        strategies = {
            'random': self._random_sample,
            'stratified': self._stratified_sample,
            'systematic': self._systematic_sample,
            'cluster': self._cluster_sample
        }

        samples = {}
        for name, method in strategies.items():
            print(f"\nCreating {name} sample...")
            samples[name] = method()
            print(f"  Sample size: {len(samples[name])}")

        return samples

    def _random_sample(self) -> pd.DataFrame:
        """Draw a simple random sample of at most ``sample_size`` rows.

        Parquet input is loaded in full and sampled in memory; CSV input is
        streamed so only the selected rows are kept.
        """
        if self._is_parquet():
            df = pd.read_parquet(self.data_path)
            return df.sample(n=min(self.sample_size, len(df)))

        n_rows = self._count_csv_rows()
        n_take = min(self.sample_size, n_rows)
        if n_take == 0:
            return self._read_csv_rows([])
        picked = np.random.choice(n_rows, size=n_take, replace=False)
        return self._read_csv_rows(picked)

    def _stratified_sample(self) -> pd.DataFrame:
        """Draw a sample stratified by the ``is_counterfeit`` column.

        Up to ``sample_size // 2`` rows are taken from every distinct target
        value (rows with a missing target are not sampled), so the total
        stays within ``sample_size`` for a two-class target.
        """
        target = 'is_counterfeit'

        # Read only the target column to decide which rows to keep
        if self._is_parquet():
            labels = pd.read_parquet(self.data_path, columns=[target])[target]
        else:
            labels = pd.read_csv(self.data_path, usecols=[target])[target]
        # Work with row positions, not whatever index the file carried.
        labels = labels.reset_index(drop=True)

        per_class = self.sample_size // 2
        picked = [
            group.sample(n=min(len(group), per_class)).index.to_numpy()
            for _, group in labels.groupby(labels)
        ]
        if picked:
            positions = np.sort(np.concatenate(picked))
        else:
            positions = np.array([], dtype=np.int64)

        # Fetch the full rows for the selected positions
        if self._is_parquet():
            return pd.read_parquet(self.data_path).iloc[positions]
        return self._read_csv_rows(positions)

    def _systematic_sample(self) -> pd.DataFrame:
        """Take every k-th row, starting with the first, up to ``sample_size``.

        ``k`` is ``total_rows // sample_size`` but never less than 1, so a
        dataset smaller than ``sample_size`` is returned whole.
        """
        if self._is_parquet():
            df = pd.read_parquet(self.data_path)
            k = max(1, len(df) // self.sample_size)
            return df.iloc[::k][:self.sample_size]

        n_rows = self._count_csv_rows()
        k = max(1, n_rows // self.sample_size)
        positions = np.arange(0, n_rows, k)[:self.sample_size]
        return self._read_csv_rows(positions)

    def _cluster_sample(self) -> pd.DataFrame:
        """Sample rows from the 10 most frequent ``category`` values.

        At most ``sample_size`` rows are drawn at random from the rows that
        belong to those categories.
        """
        column = 'category'

        if self._is_parquet():
            categories = pd.read_parquet(self.data_path, columns=[column])[column]
        else:
            categories = pd.read_csv(self.data_path, usecols=[column])[column]
        selected_categories = categories.value_counts().head(10).index

        if self._is_parquet():
            df_full = pd.read_parquet(self.data_path)
            subset = df_full[df_full[column].isin(selected_categories)]
            # Bound by the filtered frame, which is what is being sampled.
            return subset.sample(n=min(self.sample_size, len(subset)))

        positions = np.flatnonzero(categories.isin(selected_categories).to_numpy())
        n_take = min(self.sample_size, len(positions))
        if n_take == 0:
            return self._read_csv_rows([])
        picked = np.random.choice(positions, size=n_take, replace=False)
        return self._read_csv_rows(picked)

    def step3_missing_analysis(self, df_sample: pd.DataFrame):
        """Step 3: report missing values per column and the common patterns.

        Args:
            df_sample: Sample to analyse.

        Returns:
            DataFrame with ``column``, ``missing_count`` and
            ``missing_percent``, sorted by ``missing_percent`` descending.
        """
        print("\n" + "="*50)
        print("STEP 3: MISSING DATA ANALYSIS")
        print("="*50)

        is_missing = df_sample.isnull()
        missing_count = is_missing.sum()
        missing_stats = pd.DataFrame({
            'column': df_sample.columns,
            'missing_count': missing_count,
            'missing_percent': missing_count / len(df_sample) * 100
        }).sort_values('missing_percent', ascending=False)

        print("\nColumns with missing values:")
        print(missing_stats[missing_stats['missing_count'] > 0])

        # Most frequent combinations of missing / present columns
        print("\nMissing patterns:")
        missing_patterns = is_missing.value_counts()
        print(missing_patterns.head(10))

        return missing_stats

    def step4_distribution_analysis(self, df_sample: pd.DataFrame):
        """Step 4: print distributions of numeric and categorical columns.

        Args:
            df_sample: Sample to analyse.
        """
        print("\n" + "="*50)
        print("STEP 4: DISTRIBUTION ANALYSIS")
        print("="*50)

        # Numeric variables
        numeric_cols = df_sample.select_dtypes(include=[np.number]).columns
        print(f"\nNumeric columns ({len(numeric_cols)}):")
        print(df_sample[numeric_cols].describe())

        # Categorical variables
        categorical_cols = df_sample.select_dtypes(include=['object', 'category']).columns
        print(f"\nCategorical columns ({len(categorical_cols)}):")
        for col in categorical_cols[:5]:  # only the first 5, as an example
            print(f"\n{col}:")
            print(df_sample[col].value_counts().head(10))

    def step5_text_analysis(self, df_sample: pd.DataFrame):
        """Step 5: print length, word-count and fill statistics for text columns.

        Only the ``title`` and ``description`` columns are inspected, and
        only when they are present in ``df_sample``.

        Args:
            df_sample: Sample to analyse.
        """
        print("\n" + "="*50)
        print("STEP 5: TEXT ANALYSIS")
        print("="*50)

        text_columns = ['title', 'description']

        for col in text_columns:
            if col not in df_sample.columns:
                continue
            print(f"\n{col} analysis:")

            # Missing values count as empty text; astype(str) keeps the
            # .str accessor usable when the column is not of string dtype.
            texts = df_sample[col].fillna('').astype(str)

            # Text length in characters
            text_lengths = texts.str.len()
            print(f"  Length stats: mean={text_lengths.mean():.0f}, "
                  f"median={text_lengths.median():.0f}, "
                  f"max={text_lengths.max():.0f}")

            # Number of whitespace-separated words
            word_counts = texts.str.split().str.len()
            print(f"  Word count: mean={word_counts.mean():.0f}, "
                  f"median={word_counts.median():.0f}")

            # Share of non-missing values
            filled_percent = (df_sample[col].notna().sum() / len(df_sample)) * 100
            print(f"  Filled: {filled_percent:.1f}%")

    def step6_target_analysis(self, df_sample: pd.DataFrame):
        """Step 6: print the target distribution and its rate per group.

        The target is the ``is_counterfeit`` column; its mean is reported
        per ``category`` and per ``brand`` when those columns exist.

        Args:
            df_sample: Sample to analyse.
        """
        print("\n" + "="*50)
        print("STEP 6: TARGET VARIABLE ANALYSIS")
        print("="*50)

        target = 'is_counterfeit'
        if target not in df_sample.columns:
            return

        print("\nTarget distribution:")
        print(df_sample[target].value_counts(normalize=True))

        # Breakdown by category
        if 'category' in df_sample.columns:
            print("\nCounterfeit rate by category:")
            counterfeit_by_category = df_sample.groupby('category')[target].mean().sort_values(ascending=False)
            print(counterfeit_by_category.head(10))

        # Breakdown by brand
        if 'brand' in df_sample.columns:
            print("\nCounterfeit rate by brand:")
            counterfeit_by_brand = df_sample.groupby('brand')[target].mean().sort_values(ascending=False)
            print(counterfeit_by_brand.head(10))

    def step7_correlation_analysis(self, df_sample: pd.DataFrame):
        """Step 7: print correlations of numeric columns with the target.

        Boolean columns are included (cast to integers) so that a boolean
        ``is_counterfeit`` target is analysed rather than silently skipped.

        Args:
            df_sample: Sample to analyse.
        """
        print("\n" + "="*50)
        print("STEP 7: CORRELATION ANALYSIS")
        print("="*50)

        # Numeric and boolean columns only
        numeric_df = df_sample.select_dtypes(include=[np.number, 'bool'])
        bool_cols = numeric_df.select_dtypes(include=['bool']).columns
        if len(bool_cols) > 0:
            numeric_df = numeric_df.astype({col: 'int8' for col in bool_cols})

        if 'is_counterfeit' in numeric_df.columns:
            correlations = numeric_df.corr()['is_counterfeit'].sort_values(ascending=False)
            print("\nTop correlations with target:")
            print(correlations.head(10))
            print("\nBottom correlations with target:")
            print(correlations.tail(10))

    def step8_anomaly_detection(self, df_sample: pd.DataFrame):
        """Step 8: report price outliers using the 1.5 * IQR rule.

        When ``is_counterfeit`` is present, the counterfeit rate among
        outliers is compared with the rate among the remaining rows.

        Args:
            df_sample: Sample to analyse.
        """
        print("\n" + "="*50)
        print("STEP 8: ANOMALY DETECTION")
        print("="*50)

        # Look for outliers in the price column
        if 'price' not in df_sample.columns:
            return

        price = df_sample['price']
        q1 = price.quantile(0.25)
        q3 = price.quantile(0.75)
        iqr = q3 - q1

        # A positional boolean mask stays correct when index labels repeat.
        is_outlier = ((price < q1 - 1.5 * iqr) | (price > q3 + 1.5 * iqr)).to_numpy()
        outliers = df_sample[is_outlier]

        total = len(df_sample)
        outlier_share = len(outliers) / total * 100 if total else 0.0
        print(f"\nPrice outliers: {len(outliers)} ({outlier_share:.2f}%)")
        print(f"Outlier price range: {outliers['price'].min():.2f} - {outliers['price'].max():.2f}")

        # Check whether outliers are counterfeit more often than other rows
        if 'is_counterfeit' in df_sample.columns:
            outlier_counterfeit_rate = outliers['is_counterfeit'].mean()
            normal_counterfeit_rate = df_sample.loc[~is_outlier, 'is_counterfeit'].mean()
            print(f"Counterfeit rate in outliers: {outlier_counterfeit_rate:.2%}")
            print(f"Counterfeit rate in normal: {normal_counterfeit_rate:.2%}")

    def run_complete_eda(self):
        """Run all EDA steps in order.

        Returns:
            The random sample that steps 3-8 were run on.
        """
        # Step 1: basic information
        self.step1_basic_info()

        # Step 2: build the samples
        samples = self.step2_sampling_strategy()

        # The random sample is used for the rest of the analysis
        df_sample = samples['random']

        # Steps 3-8: detailed analysis
        self.step3_missing_analysis(df_sample)
        self.step4_distribution_analysis(df_sample)
        self.step5_text_analysis(df_sample)
        self.step6_target_analysis(df_sample)
        self.step7_correlation_analysis(df_sample)
        self.step8_anomaly_detection(df_sample)

        print("\n" + "="*50)
        print("EDA COMPLETE!")
        print("="*50)

        return df_sample

# Usage: build the analyser for the training set and run every EDA step
eda = LargeScaleEDA(data_path='data/train.parquet', sample_size=100000)
df_sample = eda.run_complete_eda()