In [None]:

import import_ipynb
from src.dataloader import DataLoader
from src.dataclean import DataCleaner
from src.visualizer import Visualizer
from src.utils import flag_outliers
from src import dataunderstanding
from src.eda import EDAAnalyzer
from src.preprocess import PreprocessAndVisualize
from src.hypothesis import HypothesisTestingAndVisualization
from src.preprocess_model import ModelPreprocessing
from src.modeling import ModelBuilder
from src.modelinter import ModelInterpret



In [None]:

# Load the raw dataset through the project's DataLoader.
# The path is relative, so it resolves against the kernel's working directory.
# NOTE(review): the file has a .txt extension but is read with load_csv() —
# delimiter/encoding handling is inside DataLoader; confirm it matches the file.
loader = DataLoader("data/MachineLearningRating_v3.txt")
df = loader.load_csv()
    
# Clean: run the three DataCleaner steps in order, rebinding `df` to the
# value each step returns.
# NOTE(review): the steps take no arguments, so presumably the cleaner keeps
# its own working frame between calls — verify before reordering them.
cleaner = DataCleaner(df)
df = cleaner.fix_dtypes()
df = cleaner.handle_missing()
df = cleaner.add_metrics()
    
    

In [None]:
# Data Understanding: wrap the current `df` in the project's DataUnderstanding
# helper and emit its overview (output is whatever overview() produces).
du = dataunderstanding.DataUnderstanding(df)
du.overview()

# EDA: wrap the current `df` in the project's EDAAnalyzer and run its reports.
eda = EDAAnalyzer(df)
eda.overview()
# Only the six statistic columns selected below are printed from var_stats.
var_stats = eda.variability_stats()
print("[INFO] Variability Stats:")
print(var_stats[['mean','std','var','IQR','min','max']])
eda.correlation_matrix()
# TotalPremium against TotalClaims, with 'Province' passed as the hue column.
eda.scatter_plot('TotalPremium','TotalClaims', hue_col='Province')
eda.monthly_trends('TotalClaims')
eda.group_trends('VehicleType','TotalClaims')
# boxplot_outliers returns a 3-tuple; its first element rebinds `df`, so every
# cell below works on whatever frame this call returns.
# NOTE(review): not visible here whether that frame is filtered or only
# annotated — confirm, since it changes what later cells analyse.
df, lower, upper = eda.boxplot_outliers('TotalClaims')
print(f"[INFO] TotalClaims outlier threshold: lower={lower}, upper={upper}")
eda.creative_plots()
    
    

In [None]:
# Visuals: one plot per column of interest via the project's Visualizer.
viz = Visualizer(df)
# log_scale=True is requested for the TotalClaims histogram only.
viz.histogram("TotalClaims", log_scale=True)
viz.boxplot("CustomValueEstimate")
viz.bar_chart("VehicleType")

# Outlier detection: flag_outliers hands back the frame plus the cut-off it used;
# the flags are read from the 'TotalClaims_is_outlier' column of the returned frame.
df, threshold = flag_outliers(df, "TotalClaims")
flagged_count = df['TotalClaims_is_outlier'].sum()
print(f"[INFO] Flagged {flagged_count} extreme TotalClaims (>{threshold})")
    
    

In [None]:
# Preprocess & create metrics
# Build the preprocessing/plotting helper on the current frame and rebind `df`
# to the frame returned by create_metrics().
prep_viz = PreprocessAndVisualize(df)
df = prep_viz.create_metrics()

# Task 3 specific visuals: claim frequency for each grouping column in turn,
# followed by the margin and severity plots.
for group_col in ('Province', 'Gender', 'PostalCode'):
    prep_viz.plot_claim_frequency(group_col)
prep_viz.plot_margin_distribution('PostalCode')
prep_viz.plot_claim_severity('Province')

# Hypothesis testing + visualization on the current frame.
hypo_viz = HypothesisTestingAndVisualization(df)
task3_results = hypo_viz.run_all_hypotheses()

# Print summary. Each item of task3_results is unpacked as
# label -> (p-value, result), and the p-value is shown to four decimals.
# The separator is a real "→": the previous text was the UTF-8 arrow
# mis-decoded as cp1252, which printed as garbage characters.
print("\nTask 3 Hypotheses Summary:")
for h, (p, res) in task3_results.items():
    print(f"{h}: p-value={p:.4f} → {res}")
    
    

In [None]:
# ----------------- Task 4: Data preparation -----------------
# Every step goes through the same ModelPreprocessing instance. Each return
# value rebinds df_claims, but the split at the end is requested from `prep`,
# not from df_claims.
# NOTE(review): presumably `prep` carries the working frame between calls —
# verify before reordering or dropping a step.
prep = ModelPreprocessing(df)
df_claims = prep.filter_claims()  # Only policies with claims
df_claims = prep.handle_missing(strategy='median')
df_claims = prep.encode_features(categorical_features=['Province','Gender','VehicleType'])

# 70/30 split with TotalClaims as the target.
# NOTE(review): no seed is passed here — confirm the split is made
# reproducible inside ModelPreprocessing.train_test_split.
X_train, X_test, y_train, y_test = prep.train_test_split(target='TotalClaims', test_size=0.3)

# ----------------- Task 4: Model Building -----------------
# One ModelBuilder holds the split and trains three models in sequence.
builder = ModelBuilder(X_train, X_test, y_train, y_test)
builder.train_linear_regression()
# Hyperparameters are fixed inline for the two ensemble models.
# NOTE(review): no random seed is passed to either trainer — confirm
# ModelBuilder sets one, otherwise results will vary between runs.
builder.train_random_forest(n_estimators=100, max_depth=6)
builder.train_xgboost(n_estimators=100, learning_rate=0.1)

# evaluate_models() is called once after all three trainers have run and its
# return value is printed as-is.
results = builder.evaluate_models()
print("Model Evaluation Results:")
print(results)

