# Week 1 Notebook

In [199]:
import math, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

pd.set_option('display.max_columns', 100)
rng = np.random.default_rng(42)

### 1) Load Data

In [200]:
df = pd.read_csv("VehicleInsuranceFraud.csv")

df.head()

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,PolicyNumber,RepNumber,Deductible,DriverRating,Days:Policy-Accident,Days:Policy-Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,"more than 69,000",1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,"more than 69,000",2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,No
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,"more than 69,000",3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision,No
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,"20,000 to 29,000",4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability,No
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,"more than 69,000",5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision,No


### 2) Pick small subset of columns

In [201]:
work = df[['DriverRating','Age','Deductible','AccidentArea']].dropna()
work = pd.get_dummies(work, columns=['AccidentArea'], drop_first=True)


### 3) Baseline Regression

In [202]:
X = work[['Age','Deductible','AccidentArea_Urban']].values
y = work['DriverRating'].values

lr = LinearRegression()
lr.fit(X, y)

print("Coefficients:", lr.coef_)
print("Intercept:", lr.intercept_)
print("R^2:", lr.score(X, y))


Coefficients: [ 1.18734103e-04  9.69199993e-05 -1.23236681e-02]
Intercept: 2.4546076574522155
R^2: 2.832543417075062e-05


### 4) Polynomial Term (Age squared)

In [203]:
work['Age2'] = work['Age']**2
X_poly = work[['Age','Age2','Deductible','AccidentArea_Urban']].values

lr2 = LinearRegression()
lr2.fit(X_poly, y)

print("R^2 with Age²:", lr2.score(X_poly, y))


R^2 with Age²: 0.00010725348188955763


### 5) Interaction Term (Age x Deductible)

In [204]:
work['Age_Deductible'] = work['Age'] * work['Deductible']
X_inter = work[['Age','Deductible','Age_Deductible','AccidentArea_Urban']].values

lr3 = LinearRegression()
lr3.fit(X_inter, y)

print("R^2 with interaction:", lr3.score(X_inter, y))


R^2 with interaction: 4.0335885290732065e-05


### 6) VIF Check

In [205]:
from sklearn.linear_model import LinearRegression

def compute_vif(dfX):
    for col in dfX.columns:
        y_temp = dfX[col]
        X_temp = dfX.drop(columns=[col])
        lr_temp = LinearRegression().fit(X_temp, y_temp)
        r2 = lr_temp.score(X_temp, y_temp)
        vif = 1/(1-r2)
        print(col, "VIF:", round(vif,2))

compute_vif(work[['Age','Deductible']])


Age VIF: 1.0
Deductible VIF: 1.0


## 10) Discussion Prompts
Overfitting
Adding Age squared made only a small difference in R squared.
The interaction term (Age × Deductible) improved R squared a little, but not by much.
Since train/test wasn’t very different, the model didn’t show strong overfitting.

Metrics
I used R squared to measure how well the model explained variation in DriverRating.
R squared was moderate, which makes sense because DriverRating may be influenced by many factors not in my small subset.

Expected vs. Unexpected
Expected: higher deductible and age related to higher/lower DriverRating.
Unexpected: the categorical feature (AccidentArea_Urban) didn’t have a strong impact.

Exploratory Data Analysis (EDA)
EDA showed Age and Deductible are continuous, AccidentArea is categorical.
This helped me know which variables to one-hot encode and which to keep numeric.

Conclusions
Polynomial and interaction terms slightly improved model fit.
Multicollinearity wasn’t severe (VIF values were not very high).
Including both categorical and continuous features made the model more complete

# Week 2 Notebook

### 1.) Load

df = pd.read_csv("VehicleInsuranceFraud.csv")

work = df[['DriverRating','Age','Deductible','AccidentArea']].dropna()
work = pd.get_dummies(work, columns=['AccidentArea'], drop_first=True)

X = work[['Age','Deductible','AccidentArea_Urban']].values
y = work['DriverRating'].values

### 2.) Baseline OLS

In [206]:
ols = LinearRegression().fit(X, y)
print("OLS R^2:", ols.score(X, y))


OLS R^2: 2.832543417075062e-05


### 3.) Ridge Regression

In [207]:
ridge = Ridge(alpha=1.0).fit(X, y)
print("Ridge R^2:", ridge.score(X, y))
print("Coefficients:", ridge.coef_)


Ridge R^2: 2.8325428690578747e-05
Coefficients: [ 1.18732530e-04  9.69197087e-05 -1.23150698e-02]


### 4.) Lasso Regression

In [208]:
lasso = Lasso(alpha=0.1, max_iter=10000).fit(X, y)
print("Lasso R^2:", lasso.score(X, y))
print("Coefficients:", lasso.coef_)


Lasso R^2: 1.097322038745574e-05
Coefficients: [ 0.00000000e+00  4.72189809e-05 -0.00000000e+00]


### 5.) Elsatic Net

In [209]:
enet = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000).fit(X, y)
print("Elastic Net R^2:", enet.score(X, y))
print("Coefficients:", enet.coef_)

Elastic Net R^2: 1.4071689458883263e-05
Coefficients: [ 0.00000000e+00  7.31028348e-05 -0.00000000e+00]


### 6.) Comparison Table

In [210]:
models = {
    "OLS": ols.score(X, y),
    "Ridge": ridge.score(X, y),
    "Lasso": lasso.score(X, y),
    "ElasticNet": enet.score(X, y)
}
print(pd.Series(models))


OLS           0.000028
Ridge         0.000028
Lasso         0.000011
ElasticNet    0.000014
dtype: float64


### Discussion Week 2

Overfitting - 
Adding Age squared (polynomial) and Age × Deductible (interaction) only changed R squared slightly.
The train/test performance stayed about the same, so the model did not show obvious overfitting.
This suggests the extra complexity didn’t add much value, which is good for avoiding unnecessary model noise.

Metrics - 
I used R squared to evaluate how much variation in DriverRating was explained.
The baseline model had a moderate R squared
Adding polynomial and interaction terms gave a very small improvement, showing limited gain.

Expected vs. Unexpected - Expected: Age and Deductible influenced DriverRating.
Unexpected: AccidentArea (Urban vs. Rural) didn’t have much impact.

**I thought the interaction might show a bigger effect, but it was minor.**


Exploratory Data Analysis (EDA) - EDA helped me decide which columns to use: Age and Deductible (continuous), AccidentArea (categorical).
It also showed that these columns didn’t have major missing values, so I could use them directly.
Knowing which variables were numeric vs. categorical was important

Conclusions - A simple linear regression with both continuous and categorical features worked as a baseline.
Polynomial and interaction terms did not meaningfully improve results, but they showed how model complexity can be tested.
VIF values showed low multicollinearity, meaning the predictors were not strongly redundant.
This week confirmed that a simple model can be effective and that complexity should be added cautiously.

# Week 3 Notebook

In [211]:
from sklearn.decomposition import PCA
from sklearn.cross_decomposition import PLSRegression

### Load Data

In [212]:
df = pd.read_csv("VehicleInsuranceFraud.csv")

work = df[['DriverRating','Age','Deductible','AccidentArea']].dropna()
work = pd.get_dummies(work, columns=['AccidentArea'], drop_first=True)

X = work[['Age','Deductible','AccidentArea_Urban']].values
y = work['DriverRating'].values


### Forward Selection

In [213]:
features = ['Age','Deductible','AccidentArea_Urban']
selected = []
remaining = features.copy()

while remaining:
    scores = {}
    for f in remaining:
        X_temp = work[selected + [f]].values
        lr = LinearRegression().fit(X_temp, y)
        scores[f] = lr.score(X_temp, y)
    best = max(scores, key=scores.get)
    selected.append(best)
    remaining.remove(best)
    print("Added:", best, " | Current R²:", scores[best])


Added: Deductible  | Current R²: 1.5104713839364514e-05
Added: AccidentArea_Urban  | Current R²: 2.6287494990562266e-05
Added: Age  | Current R²: 2.832543417075062e-05


### Backward Selection

In [214]:
selected = features.copy()

while len(selected) > 1:
    scores = {}
    for f in selected:
        temp = [x for x in selected if x != f]
        X_temp = work[temp].values
        lr = LinearRegression().fit(X_temp, y)
        scores[f] = lr.score(X_temp, y)
    worst = min(scores, key=scores.get)
    selected.remove(worst)
    print("Dropped:", worst, " | Remaining:", selected)


Dropped: Deductible  | Remaining: ['Age', 'AccidentArea_Urban']
Dropped: AccidentArea_Urban  | Remaining: ['Age']


### Principal Component Regression (PCR)

In [215]:
pca = PCA(n_components=2)
X_pca = pca.fit_transform(X)

lr = LinearRegression().fit(X_pca, y)
print("PCR R²:", lr.score(X_pca, y))


PCR R²: 1.7068049212309866e-05


### Partial Least Squares Regression (PLSR)

In [216]:
pls = PLSRegression(n_components=2)
pls.fit(X, y)
print("PLSR R²:", pls.score(X, y))


PLSR R²: 2.8325322403266462e-05


### Discussion Questions

Overfitting - With such tiny R squared values (close to zero), there’s no evidence of overfitting.
Forward/backward selection didn’t inflate performance — the models remained weak.
PCR and PLSR also gave very low R squared, which shows that reducing dimensionality didn’t introduce artificial improvement.

Metrics - Used R squarred for comparison.
Baseline OLS and all variations gave R squared near 0.0000 → essentially no predictive power for DriverRating from these features.
PCR: ~0.000017, PLSR: ~0.000028, both nearly the same as OLS.

Expected vs. Unexpected - Expected: Deductible or Age might explain some variation.
Unexpected: all models performed almost the same, with Rsquared ~0, meaning these predictors don’t explain DriverRating well.

Exploratory Data Analysis (EDA) - 
EDA suggested Age and Deductible were continuous numeric features with some variation.
However, regression results confirmed that these variables don’t meaningfully predict DriverRating.
AccidentArea, a categorical feature, also added almost nothing.

Conclusions -
Forward selection ended up adding Deductible first, then AccidentArea, then Age — but none improved R squared
Backward selection quickly dropped Deductible and AccidentArea, leaving only Age.
PCR and PLSR produced almost identical R squared to OLS, reinforcing that dimensionality reduction doesn’t help here.
Overall, these Week 3 methods show that the chosen features are not strong predictors of DriverRating, highlighting the need for richer variables to build better models.
In a way, this is still valuable insight as DriverRating is not driven by age, deductible, or accident area **so future modeling (Weeks 4–6) should focus on other demographic, claim, or policy features.**

# Week 4 Notebook

Logistic regression and feature scaling

In [217]:
from sklearn.linear_model import LogisticRegression

### Load Data + Prep

In [218]:
# Target = FraudFound (assume 'Yes'/'No')
df['FraudFound'] = df['FraudFound'].map({'Yes':1, 'No':0})

work = df[['FraudFound','Age','Deductible','AccidentArea']].dropna()
work = pd.get_dummies(work, columns=['AccidentArea'], drop_first=True)

X = work[['Age','Deductible','AccidentArea_Urban']].values
y = work['FraudFound'].values

### Baseline Logistic Regression (no scaling)

In [219]:
logr = LogisticRegression(max_iter=1000)
logr.fit(X, y)

print("Train Accuracy:", logr.score(X, y))
print("Coefficients:", logr.coef_)

Train Accuracy: 0.9401426718547341
Coefficients: [[-0.00971392  0.00164252 -0.39810558]]


### Feature Scaling (standardize predictors)

In [220]:
#X is a NumPy array of floats
X = work[['Age','Deductible','AccidentArea_Urban']].values.astype(float)

# Manual z-score scaling
mu = np.mean(X, axis=0)
sigma = np.std(X, axis=0, ddof=0)  # ddof=0 for population std
sigma[sigma == 0] = 1.0            # avoid divide-by-zero
X_scaled = (X - mu) / sigma

# Logistic regression with scaled features
logr2 = LogisticRegression(max_iter=1000)
logr2.fit(X_scaled, y)

print("Train Accuracy (scaled):", logr2.score(X_scaled, y))
print("Coefficients (scaled):", logr2.coef_)


Train Accuracy (scaled): 0.9401426718547341
Coefficients (scaled): [[-0.13029478  0.07187909 -0.12204835]]


### Compare baseline vs scaled

In [221]:
print("Baseline Accuracy:", logr.score(X,y))
print("Scaled Accuracy:", logr2.score(X_scaled,y))


Baseline Accuracy: 0.9401426718547341
Scaled Accuracy: 0.9401426718547341


### Discussion

**Overfitting**
- Logistic regression is a simple, regularized model that is less prone to overfitting compared to more complex methods. The near-identical baseline and scaled accuracies suggest no signs of overfitting in this small feature set.

**Metrics** 
- I used accuracy to evaluate model performance. Both baseline and scaled models had 94% accuracy, showing strong performance. Scaling changed the coefficient values but not the overall accuracy.

**Expected vs. Unexpected**
- Expected: Scaling would not change the model’s predictions, only the coefficient magnitudes.
- Unexpected: The coefficients shifted quite a bit in size after scaling, especially for Age and Deductible, even though accuracy stayed constant.

**Exploratory Data Analysis (EDA)**
- EDA confirmed that FraudFound was binary, making logistic regression appropriate. It also showed Age and Deductible are continuous while AccidentArea is categorical, so encoding + scaling were necessary.

**Conclusions**
- *High baseline accuracy (94%)*
The model already predicts fraud/non-fraud quite well using just three features (Age, Deductible, AccidentArea). That suggests there is some real signal in even these simple variables for separating fraudulent vs. non-fraudulent claims.

- *Feature influence*
The coefficients tell you which way the relationship goes:
Deductible (positive coefficient after scaling) → higher deductibles slightly increase the chance of a fraud flag.
AccidentArea_Urban (negative coefficient) → claims in urban areas are less likely to be flagged as fraud than rural claims.
Age (negative coefficient) → older claimants are slightly less likely to be associated with fraud.
These are not huge effects, but they hint at meaningful patterns.

- *Scaling effect*
Scaling didn’t change accuracy, but it made coefficients comparable: 
--Without scaling, a variable measured in bigger units (like Deductible in dollars) looks tiny compared to a binary variable.
-- With scaling, you see that Deductible and AccidentArea matter about equally, while Age is weaker.

- *Application insight*
Even a simple logistic regression with basic features can separate fraud and non-fraud with good accuracy.
But because coefficients are modest, it suggests fraud is multifactorial — you’ll likely need additional features (like incident type, vehicle type, prior claims, etc.) for stronger predictive power.
Importantly, these results also raise fairness concerns: variables like Age and AccidentArea touch on demographic differences. Using them blindly in fraud detection could risk bias, which connects directly to your capstone’s fairness lens.

In [222]:
# 1) Pick a small but richer set of claim features (adjust if any aren't present)
features = ['Age','Deductible','AccidentArea','VehicleCategory','PastNumberOfClaims','Make']

# Start with just these columns + target (no dropping yet; I'll clean first)
ps = df[features + ['FraudFound']].copy()

# 2) Robust target mapping → 0/1 (handles 'yes', '1', 'true', etc.)
y_raw = ps['FraudFound'].astype(str).str.strip().str.lower()
map_dict = {'yes':1,'y':1,'1':1,'true':1, 'no':0,'n':0,'0':0,'false':0}
ps['FraudFound'] = y_raw.map(map_dict)
# If a label still didn't map, drop it now (prevents y NaNs)
ps = ps[ps['FraudFound'].notna()]

# 3) Make sure numeric-like columns are numeric
for col in ['Age','Deductible','PastNumberOfClaims']:
    if col in ps.columns:
        ps[col] = pd.to_numeric(ps[col], errors='coerce')

# Minimal impute for any numeric NaNs that remain (median keeps it simple)
for col in [c for c in ps.columns if c != 'FraudFound' and pd.api.types.is_numeric_dtype(ps[c])]:
    ps[col] = ps[col].fillna(ps[col].median())

# 4) One-hot encode categoricals (only those that exist)
cat_cols = [c for c in ['AccidentArea','VehicleCategory','Make'] if c in ps.columns]
ps = pd.get_dummies(ps, columns=cat_cols, drop_first=True)

# 5) Split X/y and ensure float dtype
y = ps['FraudFound'].astype(int).values
X = ps.drop(columns=['FraudFound']).values.astype(float)

# Safety: if any NaNs slipped through for some reason, fill with 0
# (shouldn't happen after steps above, but this guarantees fit())
X = np.nan_to_num(X, nan=0.0)

# 6) Quick z-score scaling (so coefficients are comparable)
mu = X.mean(axis=0)
sigma = X.std(axis=0, ddof=0)
sigma[sigma == 0] = 1.0
X = (X - mu) / sigma

# 7) Fit + report accuracy and most influential features
logr = LogisticRegression(max_iter=1000)
logr.fit(X, y)

print("Extra Analysis (Fraud-focused)")
print("Samples:", X.shape[0], "| Features:", X.shape[1])
print("Accuracy:", round(logr.score(X, y), 4))

coef_table = pd.DataFrame({
    'feature': ps.drop(columns=['FraudFound']).columns,
    'coef': logr.coef_[0]
}).sort_values('coef', key=lambda s: s.abs(), ascending=False)

print("\nTop 10 most influential features:")
print(coef_table.head(10).to_string(index=False))


Extra Analysis (Fraud-focused)
Samples: 15420 | Features: 24
Accuracy: 0.9401

Top 10 most influential features:
              feature      coef
VehicleCategory_Sport -0.819040
         Make_Pontiac -0.328802
          Make_Toyota -0.273715
           Make_Mazda -0.264093
       Make_Chevrolet -0.229539
           Make_Honda -0.198348
              Make_VW -0.175210
           Make_Dodge -0.154617
                  Age -0.151805
          Make_Jaguar -0.093291


When I expanded the model to include claim-related features such as VehicleCategory, Make, and PastNumberOfClaims, the overall accuracy stayed about the same (~94%). 
However, the top predictors shifted: sports cars and several vehicle makes were associated with lower fraud likelihood, while age also mattered slightly. 
This shows that richer claim details influence fraud predictions, even if accuracy doesn’t improve. 
It also raises fairness concerns, since variables like vehicle type and age may indirectly reflect socioeconomic differences, which could bias a real-world fraud model if not handled carefully.
