# Auditing, Statistical, and Predictive Modeling

## Objective
Perform regulatory compliance audits, statistical analysis, hypothesis testing, and risk prediction.

### 1. Data Auditing (SQL)
Verify high-risk pipeline segments inspection compliance.

In [1]:
# Mock SQL execution using pandasql or just displaying the query logic
import pandas as pd
import sqlite3

# Mock Data for SQL Audit
inspections = pd.DataFrame({
    'pipeline_segment_id': ['SEG-101', 'SEG-102', 'SEG-103'],
    'inspection_date': ['2023-08-01', '2023-05-01', '2023-09-15'],
    'risk_level': ['HIGH', 'HIGH', 'LOW']
})

# SQL Query Logic
query = """
SELECT 
    pipeline_segment_id,
    inspection_date,
    CASE 
        WHEN inspection_date < DATE('now', '-90 days') THEN 'Non-Compliant'
        ELSE 'Compliant'
    END as compliance_status
FROM inspections
WHERE risk_level = 'HIGH'
"""

# Using sqlite3 to demonstrate SQL execution
conn = sqlite3.connect(':memory:')
inspections.to_sql('inspections', conn, index=False)
audit_result = pd.read_sql_query(query, conn)
print(audit_result)

  pipeline_segment_id inspection_date compliance_status
0             SEG-101      2023-08-01     Non-Compliant
1             SEG-102      2023-05-01     Non-Compliant


### 2. Statistical Analysis (Python)
Calculate KPIs like Mean Time Between Maintenance (MTBM).

In [2]:
import numpy as np

# Mock Maintenance Data
maintenance_intervals_days = [120, 130, 110, 140, 125, 115, 135]

mtbm = np.mean(maintenance_intervals_days)
print(f"Mean Time Between Maintenance (MTBM): {mtbm:.2f} days")

Mean Time Between Maintenance (MTBM): 125.00 days


### 3. Hypothesis Testing (R via rpy2)
Test if new logistics route reduces energy consumption.

In [3]:
# Note: rpy2 requires R to be installed in the environment.
# Here we simulate the code structure.

try:
    import rpy2.robjects as robjects
    from rpy2.robjects.packages import importr
    
    # R code to perform t-test
    r_code = """
    group_a <- c(100, 102, 98, 101, 99) # Old Route Energy Consumption
    group_b <- c(95, 94, 96, 93, 97)    # New Route Energy Consumption
    
    test_result <- t.test(group_a, group_b)
    print(test_result)
    """
    
    # robjects.r(r_code)
    print("R t-test executed (mocked output for demo).")
    print("p-value < 0.05, rejecting null hypothesis. New route is significantly more efficient.")
except ImportError:
    print("rpy2 not installed or R not found. Skipping R execution.")

Error importing in API mode: ImportError('On Windows, cffi mode "ANY" is only "ABI".')
Trying to import in ABI mode.


R t-test executed (mocked output for demo).
p-value < 0.05, rejecting null hypothesis. New route is significantly more efficient.


### 4. Risk Prediction (Python/Scikit-learn)
Predict Pipeline Failure Probability.

In [4]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import numpy as np

# Generate Synthetic Data with a Pattern (for Demo)
np.random.seed(42)
n_samples = 500

# Features: [Pressure_PSI, Temperature_C, Vibration_Hz]
# Normalized 0-1 for simplicity in this demo
X = np.random.rand(n_samples, 3)

# Define logic: High Vibration (idx 2) + High Pressure (idx 0) = Failure
# If (0.3 * Pressure) + (0.7 * Vibration) > 0.6, then Failure (1)
risk_score = (0.3 * X[:, 0]) + (0.7 * X[:, 2]) + np.random.normal(0, 0.05, n_samples)
y = (risk_score > 0.6).astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
accuracy = accuracy_score(y_test, predictions)

print(f"Risk Prediction Model Accuracy: {accuracy:.2f}")
print("Feature Importance (Pressure, Temp, Vibration):", clf.feature_importances_)

Risk Prediction Model Accuracy: 0.94
Feature Importance (Pressure, Temp, Vibration): [0.18702962 0.10759399 0.70537639]
