# Risk Predictor - Data Exploration and Analysis

This notebook provides comprehensive data exploration and analysis for the risk prediction model.

## Objectives:
- Load and examine the dataset structure
- Perform exploratory data analysis (EDA)
- Identify patterns and correlations
- Visualize key insights
- Assess data quality (missing values, duplicate rows, data types)

## 1. Import Required Libraries

In [None]:
# Data manipulation and analysis
import pandas as pd
import numpy as np

# Visualization libraries
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Statistical analysis
from scipy import stats
from scipy.stats import chi2_contingency

# Machine learning preprocessing
from sklearn.preprocessing import StandardScaler, LabelEncoder

# Configuration
# The 'seaborn-v0_8' style sheet only exists on matplotlib >= 3.6; older
# releases ship the same sheet under the name 'seaborn'. Pick whichever one
# this installation provides so the cell runs on both.
plt.style.use('seaborn-v0_8' if 'seaborn-v0_8' in plt.style.available else 'seaborn')
sns.set_palette('husl')
pd.set_option('display.max_columns', None)  # never truncate columns of wide frames
pd.set_option('display.max_rows', 100)

# Suppress warnings
# Only deprecation-style chatter is silenced. Numerical warnings (for example
# RuntimeWarning from degenerate statistical inputs) stay visible, because in an
# analysis notebook they usually indicate a real problem with the data.
import warnings
warnings.filterwarnings('ignore', category=FutureWarning)
warnings.filterwarnings('ignore', category=DeprecationWarning)

print('Libraries imported successfully!')

## 2. Data Loading and Initial Inspection

In [None]:
# Load the dataset
# Update the path according to your data location
data_path = '/home/jovyan/data/raw/'  # Docker volume path

# Example: Load CSV files (update according to your data format)
try:
    # Replace with your actual data file
    df = pd.read_csv(f'{data_path}risk_data.csv')
    print('Dataset loaded successfully!')
    print(f'Shape: {df.shape}')
except FileNotFoundError:
    # Report the directory that was actually searched, so the hint matches data_path.
    print(f'Data file not found. Please ensure your data is in the {data_path} directory.')
    # Create sample data for demonstration.
    # NOTE: everything downstream then describes synthetic data, not real records.
    np.random.seed(42)  # fixed seed so the demo frame is reproducible
    df = pd.DataFrame({
        'age': np.random.randint(18, 80, 1000),
        # Normal draws can fall below zero; clip so monetary amounts stay non-negative.
        'income': np.random.normal(50000, 20000, 1000).clip(min=0),
        'credit_score': np.random.randint(300, 850, 1000),
        'employment_length': np.random.randint(0, 40, 1000),
        'loan_amount': np.random.normal(25000, 15000, 1000).clip(min=0),
        'risk_level': np.random.choice(['Low', 'Medium', 'High'], 1000, p=[0.6, 0.3, 0.1])
    })
    print('Sample data created for demonstration purposes')

# Display basic information
print('\nDataset Info:')
df.info()

In [None]:
# Peek at the top of the frame; the bare last expression gets the rich table display
print('First 5 rows:')
df.head(n=5)

In [None]:
# Descriptive statistics as produced by DataFrame.describe() with its defaults
print('Statistical Summary:')
summary_stats = df.describe()
summary_stats

## 3. Data Quality Assessment

In [None]:
# Check for missing values
# Builds a per-column table of null counts and percentages, then reports only
# the columns that actually contain gaps (largest gap first).
print('Missing Values:')
missing_data = df.isnull().sum()
missing_percentage = (missing_data / len(df)) * 100
missing_df = pd.DataFrame({
    'Column': missing_data.index,
    'Missing Count': missing_data.values,
    'Missing Percentage': missing_percentage.values
})

missing_report = (
    missing_df[missing_df['Missing Count'] > 0]
    .sort_values('Missing Count', ascending=False)
)
if missing_report.empty:
    # Printing an empty frame would show a confusing "Empty DataFrame" repr.
    print('No missing values found.')
else:
    print(missing_report)

In [None]:
# Count rows that repeat an earlier row in every column
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f'Number of duplicate rows: {duplicates}')

# Column dtypes, printed under their own heading
print('\nData Types:', df.dtypes, sep='\n')

## 4. Exploratory Data Analysis (EDA)

In [None]:
# Target variable overview: interactive histogram plus a count/percentage listing
if 'risk_level' in df.columns:
    target = df['risk_level']

    fig = px.histogram(df, x='risk_level', title='Distribution of Risk Levels')
    fig.show()

    # Absolute frequency and share (in percent) of each level
    risk_counts = target.value_counts()
    risk_percentages = target.value_counts(normalize=True).mul(100)

    print('Risk Level Distribution:')
    for level, count in risk_counts.items():
        share = risk_percentages[level]
        print(f'{level}: {count} ({share:.1f}%)')

In [None]:
# Pairwise correlations (DataFrame.corr defaults) between the numeric columns.
# numerical_cols is reused by the later plotting and testing cells.
numerical_cols = df.select_dtypes(include='number').columns
correlation_matrix = df.loc[:, numerical_cols].corr()

# Annotated heatmap with a diverging palette centred on zero
corr_fig, corr_ax = plt.subplots(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0, ax=corr_ax)
corr_ax.set_title('Correlation Matrix of Numerical Variables')
corr_fig.tight_layout()
plt.show()

In [None]:
# Distribution of numerical variables
# The subplot grid is sized from the number of numeric columns, so no column is
# silently dropped (a fixed 2x3 grid shows at most six) and no empty panel is
# left behind when the count is not a multiple of the grid width.
hist_cols = list(numerical_cols)
n_grid_cols = 3

if not hist_cols:
    print('No numerical columns to plot.')
else:
    n_grid_rows = (len(hist_cols) + n_grid_cols - 1) // n_grid_cols  # ceiling division
    fig, axes = plt.subplots(
        n_grid_rows, n_grid_cols,
        figsize=(5 * n_grid_cols, 5 * n_grid_rows),
        squeeze=False,  # always a 2-D array, even for a single row
    )
    axes = axes.ravel()

    for ax, col in zip(axes, hist_cols):
        df[col].hist(bins=30, ax=ax, alpha=0.7)
        ax.set_title(f'Distribution of {col}')
        ax.set_xlabel(col)
        ax.set_ylabel('Frequency')

    # Hide the unused trailing panels of the last row.
    for ax in axes[len(hist_cols):]:
        ax.set_visible(False)

    plt.tight_layout()
    plt.show()

## 5. Risk Analysis by Features

In [None]:
# Box plots for numerical variables by risk level
# One panel per numeric feature, grouped by the target. The grid is sized from
# the number of features so none is dropped and no blank panel is shown.
if 'risk_level' in df.columns:
    # Exclude the target itself in case it is stored as a numeric code.
    box_cols = [col for col in numerical_cols if col != 'risk_level']
    n_grid_cols = 3

    if not box_cols:
        print('No numerical columns to plot.')
    else:
        n_grid_rows = (len(box_cols) + n_grid_cols - 1) // n_grid_cols  # ceiling division
        fig, axes = plt.subplots(
            n_grid_rows, n_grid_cols,
            figsize=(5 * n_grid_cols, 5 * n_grid_rows),
            squeeze=False,  # always a 2-D array, even for a single row
        )
        axes = axes.ravel()

        for ax, col in zip(axes, box_cols):
            df.boxplot(column=col, by='risk_level', ax=ax)
            ax.set_title(f'{col} by Risk Level')
            ax.set_xlabel('risk_level')
            ax.set_ylabel(col)

        # Hide the unused trailing panels of the last row.
        for ax in axes[len(box_cols):]:
            ax.set_visible(False)

        # pandas adds an automatic "Boxplot grouped by ..." figure title when
        # `by=` is used; clear it so it does not collide with the panel titles.
        fig.suptitle('')

        plt.tight_layout()
        plt.show()

## 6. Statistical Tests and Insights

In [None]:
# Statistical tests for numerical variables vs risk level
# Runs a one-way ANOVA per numeric feature, comparing its values across the
# risk levels. Rows with a missing risk level are left out of the grouping, and
# a feature is skipped when fewer than two levels have any data for it.
if 'risk_level' in df.columns:
    print('Statistical Tests (ANOVA) for Numerical Variables:')
    print('=' * 50)

    # dropna() first: unique() would otherwise return NaN as a "level", and
    # `== NaN` matches nothing, producing an empty group and a NaN F-statistic.
    levels = df['risk_level'].dropna().unique()

    for col in numerical_cols:
        if col == 'risk_level':
            continue

        groups = [df.loc[df['risk_level'] == level, col].dropna() for level in levels]
        # A level can end up empty when the feature is entirely missing for it.
        groups = [group for group in groups if len(group) > 0]

        print(f'{col}:')
        if len(groups) < 2:
            # ANOVA needs at least two non-empty groups to compare.
            print('  Skipped: fewer than two risk levels have data for this column')
            print()
            continue

        f_stat, p_value = stats.f_oneway(*groups)
        print(f'  F-statistic: {f_stat:.4f}')
        print(f'  p-value: {p_value:.4f}')
        print(f'  Significant: {"Yes" if p_value < 0.05 else "No"}')
        print()

## 7. Key Insights and Recommendations

In [None]:
# Recap of the headline numbers gathered during the exploration
total_missing = df.isnull().sum().sum()
total_duplicates = df.duplicated().sum()
object_cols = df.select_dtypes(include=["object"]).columns

overview_lines = [
    '=== DATA EXPLORATION SUMMARY ===',
    f'Dataset shape: {df.shape}',
    f'Missing values: {total_missing}',
    f'Duplicate rows: {total_duplicates}',
    f'Numerical columns: {len(numerical_cols)}',
    f'Categorical columns: {len(object_cols)}',
]
print('\n'.join(overview_lines))

# Class balance of the target, when the column is present
if 'risk_level' in df.columns:
    n_records = len(df)
    print('\nRisk Level Distribution:')
    for level, count in df['risk_level'].value_counts().items():
        pct = (count / n_records) * 100
        print(f'  {level}: {count} ({pct:.1f}%)')

# Follow-up work planned after this notebook
print('\n=== NEXT STEPS ===')
next_steps = (
    '1. Data cleaning and preprocessing',
    '2. Feature engineering',
    '3. Model development and training',
    '4. Model evaluation and validation',
)
for step in next_steps:
    print(step)