# 04 – Data Validation

This notebook performs comprehensive data validation including:
- Basic quality checks (missing values, data types)
- Data quality scoring
- Outlier detection
- Custom validation rules
- Missing value analysis and visualization

In [27]:
import sys
import os
sys.path.insert(0, os.path.abspath('..'))

In [28]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy import stats
from scripts.validation import generate_quality_report
from datetime import datetime
import os

In [29]:

# Paths
date_partition = datetime.now().strftime('%Y%m%d')
raw_root = os.path.join('..', 'data', 'raw')
validated_dir = os.path.join('..', 'data', 'validated', date_partition)
# Generate report
generate_quality_report(raw_root, validated_dir)
print('Validation complete. Report saved in:', validated_dir)

2025-08-24 15:02:02 - validation - INFO - Validated customers - issues: {'missing_customer_id': np.int64(0), 'duplicate_customer_id': np.int64(0), 'negative_tenure_months': np.int64(0), 'missing_monthly_charges': np.int64(0), 'missing_total_charges': np.int64(0)}


2025-08-24 15:02:02 - validation - INFO - Validated transactions - issues: {'missing_transaction_id': np.int64(0), 'duplicate_transaction_id': np.int64(0), 'missing_customer_id': np.int64(0), 'negative_amount': np.int64(0), 'invalid_dates': 0}
2025-08-24 15:02:02 - validation - INFO - Validated web_logs - issues: {'invalid_json': 0}
2025-08-24 15:02:02 - validation - INFO - Quality report generated at ..\data\validated\20250824\quality_report_20250824_150202.csv
2025-08-24 15:02:02 - validation - INFO - Validated web_logs - issues: {'invalid_json': 0}
2025-08-24 15:02:02 - validation - INFO - Quality report generated at ..\data\validated\20250824\quality_report_20250824_150202.csv


Validation complete. Report saved in: ..\data\validated\20250824


In [30]:
def analyze_missing_values(df):
    """Analyze and visualize missing values in the dataset."""
    
    # Calculate missing value statistics
    missing_stats = pd.DataFrame({
        'missing_count': df.isnull().sum(),
        'missing_percentage': (df.isnull().sum() / len(df) * 100).round(2)
    }).sort_values('missing_percentage', ascending=False)
    
    # Create missing value heatmap
    plt.figure(figsize=(12, 6))
    sns.heatmap(df.isnull(), yticklabels=False, cbar=False, cmap='viridis')
    plt.title('Missing Value Pattern')
    plt.show()
    
    return missing_stats

In [31]:
def detect_outliers(df, columns=None):
    """Detect outliers using the IQR method and Z-score."""
    if columns is None:
        columns = df.select_dtypes(include=[np.number]).columns
    
    outlier_report = {}
    for col in columns:
        # IQR method
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        iqr_outliers = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))]
        
        # Z-score method
        z_scores = np.abs(stats.zscore(df[col]))
        z_outliers = df[z_scores > 3]
        
        outlier_report[col] = {
            'iqr_outliers_count': len(iqr_outliers),
            'iqr_outliers_percentage': len(iqr_outliers) / len(df) * 100,
            'z_score_outliers_count': len(z_outliers),
            'z_score_outliers_percentage': len(z_outliers) / len(df) * 100
        }
    
    return pd.DataFrame(outlier_report).transpose()

In [32]:
def calculate_data_quality_score(df):
    """Calculate a data quality score based on multiple metrics."""
    # Initialize scoring components
    completeness_score = 1 - df.isnull().mean().mean()
    
    # Check for duplicates
    duplicate_score = 1 - len(df[df.duplicated()]) / len(df)
    
    # Check numerical columns for outliers using IQR
    numerical_cols = df.select_dtypes(include=[np.number]).columns
    outlier_scores = []
    for col in numerical_cols:
        Q1 = df[col].quantile(0.25)
        Q3 = df[col].quantile(0.75)
        IQR = Q3 - Q1
        outliers = df[(df[col] < (Q1 - 1.5 * IQR)) | (df[col] > (Q3 + 1.5 * IQR))]
        outlier_score = 1 - len(outliers) / len(df)
        outlier_scores.append(outlier_score)
    
    outlier_score = np.mean(outlier_scores) if outlier_scores else 1.0
    
    # Final weighted score
    final_score = (0.4 * completeness_score + 
                  0.3 * duplicate_score + 
                  0.3 * outlier_score)
    
    return {
        'overall_score': final_score,
        'completeness_score': completeness_score,
        'duplicate_score': duplicate_score,
        'outlier_score': outlier_score
    }

In [33]:
def validate_value_distributions(df, columns=None):
    """Validate data distributions using statistical tests and visualizations."""
    if columns is None:
        numerical_cols = df.select_dtypes(include=[np.number]).columns
        categorical_cols = df.select_dtypes(include=['object']).columns
    
    results = {}
    
    # Analyze numerical columns
    for col in numerical_cols:
        # Test for normality
        stat, p_value = stats.normaltest(df[col].dropna())
        
        # Create distribution plot
        plt.figure(figsize=(10, 4))
        plt.subplot(121)
        sns.histplot(data=df, x=col, kde=True)
        plt.title(f'{col} Distribution')
        
        # Create Q-Q plot
        plt.subplot(122)
        stats.probplot(df[col].dropna(), dist="norm", plot=plt)
        plt.title(f'{col} Q-Q Plot')
        plt.tight_layout()
        plt.show()
        
        results[col] = {
            'normality_test_statistic': stat,
            'normality_p_value': p_value,
            'is_normal': p_value > 0.05,
            'skewness': stats.skew(df[col].dropna()),
            'kurtosis': stats.kurtosis(df[col].dropna())
        }
    
    # Analyze categorical columns
    for col in categorical_cols:
        # Calculate entropy for categorical variables
        value_counts = df[col].value_counts(normalize=True)
        entropy = stats.entropy(value_counts)
        
        # Create bar plot
        plt.figure(figsize=(10, 4))
        sns.countplot(data=df, x=col)
        plt.xticks(rotation=45)
        plt.title(f'{col} Distribution')
        plt.tight_layout()
        plt.show()
        
        results[col] = {
            'unique_values': df[col].nunique(),
            'entropy': entropy,
            'mode': df[col].mode()[0],
            'mode_frequency': value_counts.iloc[0]
        }
    
    return pd.DataFrame(results).transpose()

In [34]:
def validate_data_consistency(df):
    """Validate data consistency across related fields."""
    consistency_checks = {
        'numeric_range': {
            'tenure_months': (0, float('inf')),
            'monthly_charges': (0, float('inf')),
            'total_charges': (0, float('inf'))
        },
        'categorical_values': {
            'churn': ['Yes', 'No'],
            'contract': ['Month-to-month', 'One year', 'Two year'],
            'internet_service': ['DSL', 'Fiber optic', 'No']
        },
        'logical_rules': [
            ('total_charges >= monthly_charges', 
             lambda x: x['total_charges'] >= x['monthly_charges']),
            ('tenure_months correlation with total_charges', 
             lambda x: x['tenure_months'].corr(x['total_charges']) > 0)
        ]
    }
    
    results = []
    
    # Check numeric ranges
    for field, (min_val, max_val) in consistency_checks['numeric_range'].items():
        if field in df.columns:
            invalid_count = df[~df[field].between(min_val, max_val)].shape[0]
            results.append({
                'check_type': 'numeric_range',
                'field': field,
                'valid': invalid_count == 0,
                'invalid_count': invalid_count,
                'details': f'Values should be between {min_val} and {max_val}'
            })
    
    # Check categorical values
    for field, valid_values in consistency_checks['categorical_values'].items():
        if field in df.columns:
            invalid_values = df[~df[field].isin(valid_values)][field].unique()
            results.append({
                'check_type': 'categorical_values',
                'field': field,
                'valid': len(invalid_values) == 0,
                'invalid_values': list(invalid_values),
                'details': f'Values should be in {valid_values}'
            })
    
    # Check logical rules
    for rule_name, rule_func in consistency_checks['logical_rules']:
        try:
            is_valid = rule_func(df)
            results.append({
                'check_type': 'logical_rule',
                'rule': rule_name,
                'valid': bool(is_valid),
                'details': 'Logical relationship validated'
            })
        except Exception as e:
            results.append({
                'check_type': 'logical_rule',
                'rule': rule_name,
                'valid': False,
                'details': f'Error in validation: {str(e)}'
            })
    
    return pd.DataFrame(results)

In [35]:
# Load the most recent customer data
files = os.listdir(raw_root)
customer_files = [f for f in files if 'customers' in f]
latest_file = sorted(customer_files)[-1]
customers_df = pd.read_csv(os.path.join(raw_root, latest_file))

# Calculate data quality score
quality_scores = calculate_data_quality_score(customers_df)
print("\nData Quality Scores:")
for metric, score in quality_scores.items():
    print(f"{metric}: {score:.2f}")

# Detect outliers
print("\nOutlier Analysis:")
outlier_report = detect_outliers(customers_df)
print(outlier_report)

# Analyze missing values
print("\nMissing Value Analysis:")
missing_analysis = analyze_missing_values(customers_df)
print(missing_analysis)

  plt.show()



Data Quality Scores:
overall_score: 0.99
completeness_score: 1.00
duplicate_score: 1.00
outlier_score: 0.96

Outlier Analysis:
                 iqr_outliers_count  iqr_outliers_percentage  \
senior_citizen                712.0                    14.24   
tenure_months                   0.0                     0.00   
monthly_charges                 0.0                     0.00   
total_charges                  31.0                     0.62   

                 z_score_outliers_count  z_score_outliers_percentage  
senior_citizen                      0.0                         0.00  
tenure_months                       0.0                         0.00  
monthly_charges                     0.0                         0.00  
total_charges                       6.0                         0.12  

Missing Value Analysis:
                  missing_count  missing_percentage
customer_id                   0                 0.0
gender                        0                 0.0
senior_citizen 

## Advanced Data Validation

We'll now perform more sophisticated validation checks:

1. **Distribution Analysis**
   - Normality tests for numerical features
   - Distribution visualization with histograms and Q-Q plots
   - Entropy calculation for categorical variables
   
2. **Data Consistency Validation**
   - Range checks for numerical fields
   - Valid value checks for categorical fields
   - Logical relationship validation
   - Cross-field consistency checks

These validations will help ensure:
- Data quality and reliability
- Proper statistical properties
- Business rule compliance
- Consistency across related fields

In [36]:
# Perform distribution validation
print("Analyzing feature distributions...")
distribution_results = validate_value_distributions(customers_df)
display(distribution_results)

# Perform consistency validation
print("\nValidating data consistency...")
consistency_results = validate_data_consistency(customers_df)
display(consistency_results)

# Generate summary report
validation_summary = {
    'total_records': len(customers_df),
    'fields_analyzed': len(customers_df.columns),
    'distribution_issues': len(distribution_results[distribution_results['is_normal'] == False]) if 'is_normal' in distribution_results.columns else 0,
    'consistency_issues': len(consistency_results[consistency_results['valid'] == False]),
    'overall_quality_score': quality_scores['overall_score']
}

print("\nValidation Summary:")
for metric, value in validation_summary.items():
    print(f"{metric}: {value}")

Analyzing feature distributions...


  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()
  plt.show()


Unnamed: 0,normality_test_statistic,normality_p_value,is_normal,skewness,kurtosis,unique_values,entropy,mode,mode_frequency
senior_citizen,1778.263183,0.0,False,2.046587,2.188517,,,,
tenure_months,4370.86178,0.0,False,0.002762,-1.202852,,,,
monthly_charges,4608.410882,0.0,False,-0.014803,-1.208551,,,,
total_charges,428.45223,0.0,False,0.805804,-0.13685,,,,
customer_id,,,,,,5000.0,8.517193,C000001,0.0002
gender,,,,,,2.0,0.693075,Male,0.506
partner,,,,,,2.0,0.687603,No,0.5526
dependents,,,,,,2.0,0.609503,No,0.7016
contract,,,,,,3.0,0.850393,Month-to-month,0.6542
internet_service,,,,,,3.0,1.019399,Fiber optic,0.4488



Validating data consistency...


Unnamed: 0,check_type,field,valid,invalid_count,details,invalid_values,rule
0,numeric_range,tenure_months,True,0.0,Values should be between 0 and inf,,
1,numeric_range,monthly_charges,True,0.0,Values should be between 0 and inf,,
2,numeric_range,total_charges,False,57.0,Values should be between 0 and inf,,
3,categorical_values,churn,True,,"Values should be in ['Yes', 'No']",[],
4,categorical_values,contract,True,,"Values should be in ['Month-to-month', 'One ye...",[],
5,categorical_values,internet_service,True,,"Values should be in ['DSL', 'Fiber optic', 'No']",[],
6,logical_rule,,False,,Error in validation: The truth value of a Seri...,,total_charges >= monthly_charges
7,logical_rule,,True,,Logical relationship validated,,tenure_months correlation with total_charges



Validation Summary:
total_records: 5000
fields_analyzed: 12
distribution_issues: 4
consistency_issues: 2
overall_quality_score: 0.9888549999999999
