In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import warnings
from pathlib import Path
from datetime import datetime

# Notebook-wide configuration: warning filter, plot theme and pandas display options.

# Silence every warning category for the whole session.
# NOTE(review): this blanket filter also hides deprecation warnings that may be actionable.
warnings.filterwarnings('ignore')

# Plot appearance: whitegrid style sheet, 'husl' colour palette, fixed DPI and base font size.
plt.style.use('seaborn-v0_8-whitegrid')
sns.set_palette('husl')
plt.rcParams.update({'figure.dpi': 100, 'font.size': 10})

# pandas rendering: show every column, up to 100 rows, 4-decimal floats, 120-char width.
for option_name, option_value in (
    ('display.max_columns', None),
    ('display.max_rows', 100),
    ('display.float_format', '{:.4f}'.format),
    ('display.width', 120),
):
    pd.set_option(option_name, option_value)

# Record the library versions in the cell output for provenance.
print('Libraries imported successfully')
print(f'Pandas version: {pd.__version__}', f'NumPy version: {np.__version__}', sep='\n')

In [None]:
# Load the raw CSV into `df` and report where it came from, its size and when it was read.

# Location of the raw dataset, relative to the notebook's working directory.
DATA_PATH = Path('../data/raw/dataset.csv')

# Fail fast with an explicit message if the file is missing.
if not DATA_PATH.exists():
    raise FileNotFoundError(f'Dataset not found at: {DATA_PATH}')

df = pd.read_csv(DATA_PATH)

# Gather the figures first, then print them in one place.
row_total, column_total = df.shape
memory_mb = df.memory_usage(deep=True).sum() / 1024**2  # deep=True also measures object-column contents
loaded_at = datetime.now().strftime("%Y-%m-%d %H:%M:%S")

print(f'Dataset loaded from: {DATA_PATH}')
print(f'Dataset shape: {row_total:,} rows x {column_total} columns')
print(f'Memory usage: {memory_mb:.2f} MB')
print(f'Load timestamp: {loaded_at}')

In [None]:
# Preview the first ten rows; the bare last expression is rendered as the cell output.
print('DATASET PREVIEW - First 10 Rows', '=' * 120, sep='\n')
df.head(n=10)

In [None]:
# Preview the last ten rows; the bare last expression is rendered as the cell output.
print('DATASET PREVIEW - Last 10 Rows', '=' * 120, sep='\n')
df.tail(n=10)

In [None]:
# Preview up to ten randomly chosen rows (seeded, so the same rows appear on every run).
print('DATASET PREVIEW - Random 10 Rows')
print('=' * 120)
# Cap the sample size at the row count: sampling without replacement raises
# ValueError when more rows are requested than the frame contains.
df.sample(min(10, len(df)), random_state=42)

In [None]:
# Print the frame's structural summary: every column listed, with non-null counts.
print('DATASET STRUCTURE AND METADATA', '=' * 120, sep='\n')
df.info(show_counts=True, verbose=True)

In [None]:
# Build a per-column overview table: dtype, non-null count, null count and null percentage.
print('COLUMN NAMES AND DATA TYPES', '=' * 120, sep='\n')

# Count nulls once and reuse the result for both the count and the percentage columns.
null_counts = df.isnull().sum()
null_share = null_counts / len(df) * 100

column_info = pd.DataFrame({
    'Column_Name': df.columns,
    'Data_Type': df.dtypes.values,
    'Non_Null_Count': df.count().values,
    'Null_Count': null_counts.values,
    'Null_Percentage': null_share.values
})

# Number the rows from 1 instead of 0 for display.
column_info.index = pd.RangeIndex(start=1, stop=len(column_info) + 1)
column_info

In [None]:
# Descriptive statistics for the numerical columns, one row per feature.
print('STATISTICAL SUMMARY - NUMERICAL FEATURES')
print('=' * 120)

# describe() raises ValueError when the include filter matches no columns, so
# guard the call the same way the other cells guard their numeric-only analyses.
if df.select_dtypes(include=[np.number]).shape[1] > 0:
    display(df.describe(include=[np.number]).transpose())
else:
    print('No numerical features found')

In [None]:
# Descriptive statistics for the object-dtype (categorical) columns, one row per feature.
print('STATISTICAL SUMMARY - CATEGORICAL FEATURES')
print('=' * 120)

# describe() raises ValueError when the include filter matches no columns, so
# guard the call the same way the other cells guard their categorical analyses.
if df.select_dtypes(include=['object']).shape[1] > 0:
    display(df.describe(include=['object']).transpose())
else:
    print('No categorical features found')

In [None]:
# Tabulate the columns that contain missing values, worst offenders first.
print('MISSING VALUES ANALYSIS', '=' * 120, sep='\n')

# Count nulls once; both the count and the percentage columns derive from it.
null_counts = df.isnull().sum()

missing_data = pd.DataFrame({
    'Column': df.columns,
    'Missing_Count': null_counts.values,
    'Missing_Percentage': (null_counts / len(df) * 100).values,
    'Data_Type': df.dtypes.values
})
# Sort first, then keep only the columns that actually have gaps.
missing_data = missing_data.sort_values('Missing_Count', ascending=False)
missing_data = missing_data.loc[missing_data['Missing_Count'] > 0]

if missing_data.empty:
    print('No missing values detected in dataset')
else:
    affected_columns = len(missing_data)
    print(f'Total columns with missing values: {affected_columns}')
    print(f'Total missing values: {missing_data["Missing_Count"].sum():,}')
    print('\nDetailed breakdown:')
    # Number the rows from 1 for display.
    missing_data.index = range(1, affected_columns + 1)
    display(missing_data)

In [None]:
# Visualise where values are missing (one heatmap cell per DataFrame cell) and
# print overall missing/complete cell totals.
print('MISSING VALUES HEATMAP')
print('=' * 120)

fig, ax = plt.subplots(figsize=(14, 8))
# df.isnull() is a boolean frame; with the viridis colormap True (missing) maps to yellow.
sns.heatmap(df.isnull(), cbar=True, cmap='viridis', yticklabels=False, ax=ax)
ax.set_title('Missing Values Heatmap (Yellow = Missing)', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Columns', fontsize=12, fontweight='bold')
ax.set_ylabel('Rows', fontsize=12, fontweight='bold')
plt.xticks(rotation=45, ha='right')
plt.tight_layout()
plt.show()

# np.prod replaces np.product, which was removed in NumPy 2.0.
total_cells = np.prod(df.shape)
total_missing = df.isnull().sum().sum()
total_complete = total_cells - total_missing
print(f'\nTotal cells: {total_cells:,}')
print(f'Missing cells: {total_missing:,} ({total_missing/total_cells*100:.2f}%)')
print(f'Complete cells: {total_complete:,} ({total_complete/total_cells*100:.2f}%)')

In [None]:
# Count fully duplicated rows and, when any exist, preview them.
print('DUPLICATE RECORDS ANALYSIS', '=' * 120, sep='\n')

record_total = len(df)
total_duplicates = df.duplicated().sum()
duplicate_percentage = (total_duplicates / record_total) * 100
unique_total = record_total - total_duplicates
unique_percentage = 100 - duplicate_percentage

print(f'Total records: {record_total:,}')
print(f'Duplicate records: {total_duplicates:,} ({duplicate_percentage:.2f}%)')
print(f'Unique records: {unique_total:,} ({unique_percentage:.2f}%)')

if total_duplicates == 0:
    print('\nNo duplicate records found')
else:
    print('\nDuplicate records preview:')
    # keep=False marks every member of a duplicate group, not only the repeats;
    # sorting by the first column orders the preview before taking the first 20 rows.
    every_copy = df.loc[df.duplicated(keep=False)]
    display(every_copy.sort_values(by=df.columns[0]).head(20))

In [None]:
# Count how many columns use each dtype and show the counts as a bar chart.
print('DATA TYPES DISTRIBUTION', '=' * 120, sep='\n')

dtype_counts = df.dtypes.value_counts()
print(dtype_counts)

# Shared text styling for both axis labels.
axis_label_style = dict(fontsize=12, fontweight='bold')

fig, ax = plt.subplots(figsize=(10, 6))
dtype_counts.plot.bar(color='steelblue', edgecolor='black', ax=ax)
ax.set_title('Data Types Distribution', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Data Type', **axis_label_style)
ax.set_ylabel('Count', **axis_label_style)
ax.grid(axis='y', alpha=0.3)
plt.xticks(rotation=0)
plt.tight_layout()
plt.show()

In [None]:
# Draw one histogram per numerical column on a 3-wide grid, marking the mean
# and median of each feature.
print('NUMERICAL FEATURES DISTRIBUTION - HISTOGRAMS')
print('=' * 120)

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()
print(f'Total numerical features: {len(numerical_cols)}')
print(f'Features: {numerical_cols}\n')

if len(numerical_cols) > 0:
    n_cols = 3
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False makes subplots always return a 2-D array of Axes, so
    # flatten() yields a flat sequence of Axes for every grid shape. The
    # previous `[axes]` fallback wrapped the 1x3 array in a list and broke
    # the loop when there was exactly one numerical column.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    for idx, col in enumerate(numerical_cols):
        ax = axes[idx]
        data = df[col].dropna()  # NaNs are dropped before plotting
        mean_value = data.mean()
        median_value = data.median()

        ax.hist(data, bins=40, color='skyblue', edgecolor='black', alpha=0.7)
        ax.axvline(mean_value, color='red', linestyle='--', linewidth=2, label=f'Mean: {mean_value:.2f}')
        ax.axvline(median_value, color='orange', linestyle='--', linewidth=2, label=f'Median: {median_value:.2f}')
        ax.set_title(col, fontsize=11, fontweight='bold')
        ax.set_xlabel('Value', fontsize=9)
        ax.set_ylabel('Frequency', fontsize=9)
        ax.legend(fontsize=8)
        ax.grid(axis='y', alpha=0.3)

    # Remove the unused trailing panels of the grid.
    for idx in range(len(numerical_cols), len(axes)):
        fig.delaxes(axes[idx])

    plt.tight_layout()
    plt.show()
else:
    print('No numerical features found')

In [None]:
# Draw one box plot per numerical column on a 3-wide grid and annotate each
# panel with its count of IQR-rule outliers.
print('NUMERICAL FEATURES - BOX PLOTS (OUTLIER DETECTION)')
print('=' * 120)

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if len(numerical_cols) > 0:
    n_cols = 3
    n_rows = (len(numerical_cols) + n_cols - 1) // n_cols  # ceiling division

    # squeeze=False makes subplots always return a 2-D array of Axes, so
    # flatten() yields a flat sequence of Axes for every grid shape. The
    # previous `[axes]` fallback wrapped the 1x3 array in a list and broke
    # the loop when there was exactly one numerical column.
    fig, axes = plt.subplots(n_rows, n_cols, figsize=(18, n_rows * 4), squeeze=False)
    axes = axes.flatten()

    for idx, col in enumerate(numerical_cols):
        ax = axes[idx]
        data = df[col].dropna()  # NaNs are dropped before plotting

        # Vertical is the default box orientation, so no orientation argument is passed.
        ax.boxplot(data, patch_artist=True,
                   boxprops=dict(facecolor='lightblue', alpha=0.7),
                   whiskerprops=dict(color='black'),
                   capprops=dict(color='black'),
                   medianprops=dict(color='red', linewidth=2))

        ax.set_title(col, fontsize=11, fontweight='bold')
        ax.set_ylabel('Value', fontsize=9)
        ax.grid(axis='y', alpha=0.3)

        # Outliers by the 1.5 * IQR rule: values beyond the quartiles by more
        # than one and a half interquartile ranges.
        q1 = data.quantile(0.25)
        q3 = data.quantile(0.75)
        iqr = q3 - q1
        outliers = ((data < (q1 - 1.5 * iqr)) | (data > (q3 + 1.5 * iqr))).sum()

        ax.text(0.5, 0.95, f'Outliers: {outliers}', transform=ax.transAxes,
                fontsize=9, verticalalignment='top', bbox=dict(boxstyle='round', facecolor='wheat', alpha=0.5))

    # Remove the unused trailing panels of the grid.
    for idx in range(len(numerical_cols), len(axes)):
        fig.delaxes(axes[idx])

    plt.tight_layout()
    plt.show()
else:
    print('No numerical features found')

In [None]:
# Plot the correlation matrix of the numerical features (full and lower-triangle
# views side by side) and list the feature pairs with |r| above 0.7.
print('CORRELATION MATRIX - NUMERICAL FEATURES', '=' * 120, sep='\n')

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if len(numerical_cols) < 2:
    print('Insufficient numerical features for correlation analysis')
else:
    correlation_matrix = df[numerical_cols].corr()

    # Upper triangle (diagonal included) is hidden in the second panel.
    mask = np.triu(np.ones_like(correlation_matrix, dtype=bool))

    fig, axes = plt.subplots(1, 2, figsize=(18, 7))
    panels = (
        (axes[0], None, 'Full Correlation Matrix'),
        (axes[1], mask, 'Lower Triangle Correlation Matrix'),
    )
    for panel_ax, panel_mask, panel_title in panels:
        sns.heatmap(correlation_matrix, mask=panel_mask, annot=True, fmt='.3f', cmap='RdYlBu_r',
                    square=True, linewidths=1, cbar_kws={'shrink': 0.8},
                    vmin=-1, vmax=1, center=0, ax=panel_ax)
        panel_ax.set_title(panel_title, fontsize=14, fontweight='bold', pad=20)

    plt.tight_layout()
    plt.show()

    print('\nHigh Correlations (|r| > 0.7):')
    # Visit each unordered pair once (j > i) and keep the strongly correlated ones.
    feature_names = correlation_matrix.columns
    feature_total = len(feature_names)
    high_corr = [
        {
            'Feature_1': feature_names[i],
            'Feature_2': feature_names[j],
            'Correlation': correlation_matrix.iloc[i, j]
        }
        for i in range(feature_total)
        for j in range(i + 1, feature_total)
        if abs(correlation_matrix.iloc[i, j]) > 0.7
    ]

    if not high_corr:
        print('No high correlations found')
    else:
        # Rank by absolute strength so strong negative correlations are not buried.
        high_corr_df = pd.DataFrame(high_corr).sort_values('Correlation', key=abs, ascending=False)
        display(high_corr_df)

In [None]:
# For every object-dtype column, print the 20 most frequent values with their
# counts and percentages, plus the number of distinct values.
print('CATEGORICAL FEATURES - VALUE COUNTS', '=' * 120, sep='\n')

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()
print(f'Total categorical features: {len(categorical_cols)}')
print(f'Features: {categorical_cols}\n')

if not categorical_cols:
    print('No categorical features found')

# The loop body is skipped entirely when the list is empty.
for col in categorical_cols:
    print(f'\n{col}:')
    print('-' * 60)

    column_values = df[col]
    value_counts = column_values.value_counts()
    value_percentage = (column_values.value_counts(normalize=True) * 100).round(2)

    result = pd.DataFrame({'Count': value_counts, 'Percentage': value_percentage})
    print(result.head(20))

    distinct_total = len(value_counts)
    not_shown = distinct_total - 20
    if not_shown > 0:
        print(f'... and {not_shown} more unique values')

    print(f'Total unique values: {distinct_total}')

In [None]:
# Chart each object-dtype column: a bar + pie pair when it has at most 20
# distinct values, otherwise a single bar chart of its 20 most frequent values.
print('CATEGORICAL FEATURES - DISTRIBUTION CHARTS', '=' * 120, sep='\n')

categorical_cols = df.select_dtypes(include=['object']).columns.tolist()

if not categorical_cols:
    print('No categorical features found')

# Styling shared by every bar chart in this cell.
bar_style = dict(kind='bar', color='steelblue', edgecolor='black', alpha=0.7)

# The loop body is skipped entirely when the list is empty.
for col in categorical_cols:
    value_counts = df[col].value_counts()

    if len(value_counts) <= 20:
        fig, axes = plt.subplots(1, 2, figsize=(16, 5))
        bar_ax, pie_ax = axes[0], axes[1]
        bar_values, bar_title = value_counts, f'{col} - Bar Chart'
    else:
        fig, ax = plt.subplots(figsize=(14, 6))
        bar_ax, pie_ax = ax, None
        bar_values, bar_title = value_counts.head(20), f'{col} - Top 20 Values'

    bar_values.plot(ax=bar_ax, **bar_style)
    bar_ax.set_title(bar_title, fontsize=12, fontweight='bold')
    bar_ax.set_xlabel('Category', fontsize=10)
    bar_ax.set_ylabel('Count', fontsize=10)
    bar_ax.tick_params(axis='x', rotation=45)
    bar_ax.grid(axis='y', alpha=0.3)

    # The pie panel exists only for low-cardinality columns.
    if pie_ax is not None:
        value_counts.plot(kind='pie', autopct='%1.1f%%', startangle=90, ax=pie_ax)
        pie_ax.set_title(f'{col} - Pie Chart', fontsize=12, fontweight='bold')
        pie_ax.set_ylabel('')

    plt.tight_layout()
    plt.show()

In [None]:
# Rank every column by its number of distinct values (nunique ignores NaN by default).
print('FEATURE UNIQUENESS ANALYSIS', '=' * 120, sep='\n')

record_total = len(df)

# One record per column; the inner generator computes nunique() once per column.
uniqueness_data = [
    {
        'Feature': feature,
        'Unique_Values': distinct_total,
        'Unique_Percentage': (distinct_total / record_total) * 100,
        'Data_Type': df[feature].dtype
    }
    for feature, distinct_total in ((name, df[name].nunique()) for name in df.columns)
]

uniqueness_df = pd.DataFrame(uniqueness_data).sort_values('Unique_Values', ascending=False)
# Number the rows from 1 for display.
uniqueness_df.index = pd.RangeIndex(start=1, stop=len(uniqueness_df) + 1)
uniqueness_df

In [None]:
# Interactive Plotly scatter of the first two numerical columns. When available,
# the third numerical column drives bubble size and the first object-dtype
# column drives colour.
print('INTERACTIVE SCATTER PLOT - NUMERICAL FEATURES (PLOTLY)')
print('=' * 120)

numerical_cols = df.select_dtypes(include=[np.number]).columns.tolist()

if len(numerical_cols) >= 2:
    x_col, y_col = numerical_cols[0], numerical_cols[1]

    # Plotly rejects marker sizes that are NaN or negative, so only use the
    # third numerical column for bubble size when every value is valid.
    size_col = numerical_cols[2] if len(numerical_cols) > 2 else None
    if size_col is not None and (df[size_col].isnull().any() or (df[size_col] < 0).any()):
        print(f'Skipping bubble size: {size_col} contains missing or negative values')
        size_col = None

    object_cols = df.select_dtypes(include=['object']).columns
    color_col = object_cols[0] if len(object_cols) > 0 else None

    fig = px.scatter(
        df,
        x=x_col,
        y=y_col,
        size=size_col,
        color=color_col,
        hover_data=df.columns.tolist(),  # show every column in the hover tooltip
        title=f'Interactive Scatter: {x_col} vs {y_col}',
        opacity=0.7,
        size_max=30
    )

    fig.update_layout(
        height=700,
        font=dict(size=12),
        title_font=dict(size=16, family='Arial Black'),
        hovermode='closest'
    )

    fig.show()

    print(f'X-axis: {x_col}')
    print(f'Y-axis: {y_col}')
    # Compare against None: a column label such as 0 or '' is falsy but valid.
    if size_col is not None:
        print(f'Bubble size: {size_col}')
    if color_col is not None:
        print(f'Color: {color_col}')
else:
    print('Insufficient numerical features for scatter plot')

In [None]:
# Summarise overall data quality (size, feature types, missingness, duplicates,
# memory, completeness) as a two-column Metric/Value table.
print('DATA QUALITY ASSESSMENT')
print('=' * 120)

# Compute each expensive quantity once and reuse it below.
# np.prod replaces np.product, which was removed in NumPy 2.0.
cell_total = np.prod(df.shape)
missing_total = df.isnull().sum().sum()
duplicate_total = df.duplicated().sum()
complete_total = len(df.dropna())  # rows with no missing value in any column

quality_metrics = {
    'Total Records': len(df),
    'Total Features': len(df.columns),
    'Numerical Features': len(df.select_dtypes(include=[np.number]).columns),
    'Categorical Features': len(df.select_dtypes(include=['object']).columns),
    'DateTime Features': len(df.select_dtypes(include=['datetime64']).columns),
    'Total Cells': cell_total,
    'Missing Cells': missing_total,
    'Missing Percentage': f"{(missing_total / cell_total) * 100:.2f}%",
    'Duplicate Records': duplicate_total,
    'Duplicate Percentage': f"{(duplicate_total / len(df)) * 100:.2f}%",
    'Memory Usage (MB)': f"{df.memory_usage(deep=True).sum() / 1024**2:.2f}",
    'Complete Records': complete_total,
    'Complete Records Percentage': f"{(complete_total / len(df)) * 100:.2f}%"
}

quality_df = pd.DataFrame(list(quality_metrics.items()), columns=['Metric', 'Value'])
# Number the rows from 1 for display.
quality_df.index = range(1, len(quality_df) + 1)
quality_df

In [None]:
# Collect and print a numbered list of positive findings about the dataset,
# based on fixed thresholds for duplicates, missingness, size and completeness.
print('DATASET STRENGTHS')
print('=' * 120)

strengths = []

if df.duplicated().sum() == 0:
    strengths.append('No duplicate records detected')

# Share of all cells that are null.
# np.prod replaces np.product, which was removed in NumPy 2.0.
missing_pct = (df.isnull().sum().sum() / np.prod(df.shape)) * 100
if missing_pct < 5:
    strengths.append(f'Low missing data: {missing_pct:.2f}%')

if len(df) >= 1000:
    strengths.append(f'Large dataset: {len(df):,} records')

numerical_cols = df.select_dtypes(include=[np.number]).columns
if len(numerical_cols) > 0:
    strengths.append(f'{len(numerical_cols)} numerical features for analysis')

categorical_cols = df.select_dtypes(include=['object']).columns
if len(categorical_cols) > 0:
    strengths.append(f'{len(categorical_cols)} categorical features for segmentation')

# Share of rows with no missing value in any column.
complete_records_pct = (len(df.dropna()) / len(df)) * 100
if complete_records_pct > 80:
    strengths.append(f'High data completeness: {complete_records_pct:.2f}% complete records')

if len(strengths) > 0:
    for i, strength in enumerate(strengths, 1):
        print(f'{i}. {strength}')
else:
    print('No specific strengths identified')

In [None]:
# Collect and print a numbered list of data-quality concerns: missingness,
# duplicates, heavy outliers, near-unique text columns, small size and skew.
print('DATASET WEAKNESSES / FLAWS')
print('=' * 120)

weaknesses = []

# Share of all cells that are null.
# np.prod replaces np.product, which was removed in NumPy 2.0.
missing_pct = (df.isnull().sum().sum() / np.prod(df.shape)) * 100
if missing_pct > 5:
    weaknesses.append(f'Significant missing data: {missing_pct:.2f}%')

duplicate_total = df.duplicated().sum()
if duplicate_total > 0:
    weaknesses.append(f'{duplicate_total} duplicate records found')

# Flag numerical columns where more than 10% of rows fall outside the 1.5 * IQR fences.
for col in df.select_dtypes(include=[np.number]).columns:
    q1 = df[col].quantile(0.25)
    q3 = df[col].quantile(0.75)
    iqr = q3 - q1
    outliers = ((df[col] < (q1 - 1.5 * iqr)) | (df[col] > (q3 + 1.5 * iqr))).sum()
    outlier_pct = (outliers / len(df)) * 100
    if outlier_pct > 10:
        weaknesses.append(f'{col}: {outliers} outliers ({outlier_pct:.2f}%)')

# Flag object columns whose distinct-value count exceeds 95% of the row count.
for col in df.columns:
    unique_total = df[col].nunique()
    unique_pct = (unique_total / len(df)) * 100
    if unique_pct > 95 and df[col].dtype == 'object':
        weaknesses.append(f'{col}: Very high cardinality ({unique_total} unique values)')

if len(df) < 100:
    weaknesses.append(f'Small dataset: only {len(df)} records')

# Flag numerical columns with |skewness| above 2.
skewed_features = []
for col in df.select_dtypes(include=[np.number]).columns:
    skewness = df[col].skew()
    if abs(skewness) > 2:
        skewed_features.append(f'{col} (skewness: {skewness:.2f})')

if len(skewed_features) > 0:
    # Report which features are skewed; previously the per-feature details
    # were collected but only their count was shown.
    weaknesses.append(f'Highly skewed features: {len(skewed_features)} - {", ".join(skewed_features)}')

if len(weaknesses) > 0:
    for i, weakness in enumerate(weaknesses, 1):
        print(f'{i}. {weakness}')
else:
    print('No significant weaknesses detected')

In [None]:
# Print a closing recap of the exploration: source path, timestamp and headline figures.
print('EXPLORATION SUMMARY')
print('=' * 120)
print(f'Dataset: {DATA_PATH}')
print(f'Exploration completed: {datetime.now().strftime("%Y-%m-%d %H:%M:%S")}')
print('=' * 120)
print(f'Total Records: {len(df):,}')
print(f'Total Features: {len(df.columns)}')
print(f'Numerical Features: {len(df.select_dtypes(include=[np.number]).columns)}')
print(f'Categorical Features: {len(df.select_dtypes(include=["object"]).columns)}')
# np.prod replaces np.product, which was removed in NumPy 2.0.
print(f'Missing Data: {(df.isnull().sum().sum() / np.prod(df.shape)) * 100:.2f}%')
print(f'Duplicates: {df.duplicated().sum()}')
print(f'Memory Usage: {df.memory_usage(deep=True).sum() / 1024**2:.2f} MB')
print('=' * 120)
print('Ready for data cleaning and preprocessing')