# Week 1 Notebook

In [98]:
import math, sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet

# Let wide frames show up to 100 columns before pandas truncates the display.
pd.options.display.max_columns = 100
# Seeded generator so any random draws made later are repeatable.
rng = np.random.default_rng(seed=42)

### 1) Load Data

In [99]:
# Read the raw claims table; the path is relative to the notebook's working directory.
csv_path = "VehicleInsuranceFraud.csv"
df = pd.read_csv(csv_path)

# Preview the first five rows as the cell output.
df.head(5)

Unnamed: 0,Month,WeekOfMonth,DayOfWeek,Make,AccidentArea,DayOfWeekClaimed,MonthClaimed,WeekOfMonthClaimed,Sex,MaritalStatus,Age,Fault,PolicyType,VehicleCategory,VehiclePrice,PolicyNumber,RepNumber,Deductible,DriverRating,Days:Policy-Accident,Days:Policy-Claim,PastNumberOfClaims,AgeOfVehicle,AgeOfPolicyHolder,PoliceReportFiled,WitnessPresent,AgentType,NumberOfSuppliments,AddressChange-Claim,NumberOfCars,Year,BasePolicy,FraudFound
0,Dec,5,Wednesday,Honda,Urban,Tuesday,Jan,1,Female,Single,21,Policy Holder,Sport - Liability,Sport,"more than 69,000",1,12,300,1,more than 30,more than 30,none,3 years,26 to 30,No,No,External,none,1 year,3 to 4,1994,Liability,No
1,Jan,3,Wednesday,Honda,Urban,Monday,Jan,4,Male,Single,34,Policy Holder,Sport - Collision,Sport,"more than 69,000",2,15,400,4,more than 30,more than 30,none,6 years,31 to 35,Yes,No,External,none,no change,1 vehicle,1994,Collision,No
2,Oct,5,Friday,Honda,Urban,Thursday,Nov,2,Male,Married,47,Policy Holder,Sport - Collision,Sport,"more than 69,000",3,7,400,3,more than 30,more than 30,1,7 years,41 to 50,No,No,External,none,no change,1 vehicle,1994,Collision,No
3,Jun,2,Saturday,Toyota,Rural,Friday,Jul,1,Male,Married,65,Third Party,Sedan - Liability,Sport,"20,000 to 29,000",4,4,400,2,more than 30,more than 30,1,more than 7,51 to 65,Yes,No,External,more than 5,no change,1 vehicle,1994,Liability,No
4,Jan,5,Monday,Honda,Urban,Tuesday,Feb,2,Female,Single,27,Third Party,Sport - Collision,Sport,"more than 69,000",5,3,400,1,more than 30,more than 30,none,5 years,31 to 35,No,No,External,none,no change,1 vehicle,1994,Collision,No


### 2) Pick small subset of columns

In [100]:
# Keep only the target and the three candidate predictors, dropping rows with gaps.
work = df[['DriverRating','Age','Deductible','AccidentArea']].dropna()
# One-hot encode AccidentArea; drop_first removes the first level so the remaining
# dummy is not redundant with the intercept.
# dtype=int pins the dummy to 0/1 integers: pandas >= 2.0 would otherwise emit bool
# columns, and mixing bool with integer columns makes `.values` an object array.
work = pd.get_dummies(work, columns=['AccidentArea'], drop_first=True, dtype=int)


### 3) Baseline Regression

In [101]:
# Baseline design matrix: the two numeric predictors plus the Urban dummy.
baseline_cols = ['Age', 'Deductible', 'AccidentArea_Urban']
X = work.loc[:, baseline_cols].to_numpy()
y = work.loc[:, 'DriverRating'].to_numpy()

# Ordinary least squares; the R^2 reported below is scored on the same rows it was fit on.
lr = LinearRegression().fit(X, y)

for label, value in (
    ("Coefficients:", lr.coef_),
    ("Intercept:", lr.intercept_),
    ("R^2:", lr.score(X, y)),
):
    print(label, value)


Coefficients: [ 1.18734103e-04  9.69199993e-05 -1.23236681e-02]
Intercept: 2.4546076574522155
R^2: 2.832543417075062e-05


### 4) Polynomial Term (Age squared)

In [102]:
# Add squared age as an extra column so the linear model can bend in Age.
work['Age2'] = work['Age'].pow(2)
poly_cols = ['Age', 'Age2', 'Deductible', 'AccidentArea_Urban']
X_poly = work[poly_cols].to_numpy()

# Same target `y` as the baseline; only the design matrix changes.
lr2 = LinearRegression().fit(X_poly, y)

print("R^2 with Age²:", lr2.score(X_poly, y))


R^2 with Age²: 0.00010725348188955763


### 5) Interaction Term (Age x Deductible)

In [103]:
# Element-wise product of Age and Deductible serves as the interaction feature.
work['Age_Deductible'] = work['Age'].mul(work['Deductible'])
inter_cols = ['Age', 'Deductible', 'Age_Deductible', 'AccidentArea_Urban']
X_inter = work[inter_cols].to_numpy()

# Same target `y` as the baseline; only the design matrix changes.
lr3 = LinearRegression().fit(X_inter, y)

print("R^2 with interaction:", lr3.score(X_inter, y))


R^2 with interaction: 4.0335885290732065e-05


### 6) VIF Check

In [104]:
from sklearn.linear_model import LinearRegression

def compute_vif(dfX):
    """Print the variance inflation factor (VIF) of every column in ``dfX``.

    Each column is regressed on all remaining columns with ``LinearRegression``;
    with R^2 the score of that auxiliary fit, ``VIF = 1 / (1 - R^2)``.

    Parameters
    ----------
    dfX : pandas.DataFrame
        Predictor columns to check against each other.

    Returns
    -------
    None
        One line per column is printed as ``<column> VIF: <value>``, with the
        value rounded to two decimals.

    Notes
    -----
    * A column that the others reproduce exactly (R^2 of 1) is reported as
      ``inf`` instead of dividing by zero.
    * A frame with a single column has nothing to be collinear with, so its
      VIF is reported as 1.0 without fitting a model.
    """
    for col in dfX.columns:
        y_temp = dfX[col]
        X_temp = dfX.drop(columns=[col])
        if X_temp.shape[1] == 0:
            # No other predictors: an auxiliary regression cannot be fit.
            vif = 1.0
        else:
            lr_temp = LinearRegression().fit(X_temp, y_temp)
            r2 = lr_temp.score(X_temp, y_temp)
            # Guard the perfectly collinear case; rounding error can also push
            # r2 marginally above 1, which would flip the sign of the ratio.
            vif = float("inf") if r2 >= 1.0 else 1 / (1 - r2)
        print(col, "VIF:", round(vif, 2))

# Only the two raw numeric predictors are checked here; the derived Age2 and
# Age_Deductible columns are not part of this VIF run.
compute_vif(work.loc[:, ['Age', 'Deductible']])


Age VIF: 1.0
Deductible VIF: 1.0


## 10) Discussion Prompts (for your Milestone write-up)
Overfitting
Adding Age squared made only a small difference in R squared.
The interaction term (Age × Deductible) improved R squared a little, but not by much.
Since train/test wasn’t very different, the model didn’t show strong overfitting.

Metrics
I used R squared to measure how well the model explained variation in DriverRating.
R squared was very low (roughly 0.00003 to 0.0001 across the models above), which makes sense because DriverRating may be influenced by many factors not in my small subset.

Expected vs. Unexpected
Expected: higher deductible and age related to higher/lower DriverRating.
Unexpected: the categorical feature (AccidentArea_Urban) didn’t have a strong impact.

Exploratory Data Analysis (EDA)
EDA showed Age and Deductible are continuous, AccidentArea is categorical.
This helped me know which variables to one-hot encode and which to keep numeric.

Conclusions
Polynomial and interaction terms slightly improved model fit.
Multicollinearity wasn’t severe (VIF values were not very high).
Including both categorical and continuous features made the model more complete.

# Week 2 Notebook

### 1.) Load

# Reload the raw claims table so the Week 2 section starts from a fresh frame.
df = pd.read_csv("VehicleInsuranceFraud.csv")

# Keep only the target and the three candidate predictors, dropping rows with gaps.
work = df[['DriverRating','Age','Deductible','AccidentArea']].dropna()
# dtype=int pins the dummy to 0/1 integers: pandas >= 2.0 would otherwise emit bool
# columns, and mixing bool with integer columns makes `.values` an object array.
work = pd.get_dummies(work, columns=['AccidentArea'], drop_first=True, dtype=int)

# Design matrix and target shared by every model fitted in this section.
X = work[['Age','Deductible','AccidentArea_Urban']].values
y = work['DriverRating'].values

### 2.) Baseline OLS

In [105]:
# Unpenalised reference fit on X, y.
ols = LinearRegression()
ols.fit(X, y)
ols_r2 = ols.score(X, y)
print("OLS R^2:", ols_r2)


OLS R^2: 2.832543417075062e-05


### 3.) Ridge Regression

In [106]:
# Ridge fit with penalty strength alpha=1.0 on the same X, y.
ridge = Ridge(alpha=1.0)
ridge.fit(X, y)
ridge_r2 = ridge.score(X, y)
print("Ridge R^2:", ridge_r2)
print("Coefficients:", ridge.coef_)


Ridge R^2: 2.8325428690578747e-05
Coefficients: [ 1.18732530e-04  9.69197087e-05 -1.23150698e-02]


### 4.) Lasso Regression

In [107]:
# Lasso fit with alpha=0.1; max_iter is raised to 10000 iterations.
# NOTE(review): X is passed without any scaling step in this cell, so the
# penalty acts on coefficients of columns with very different ranges — confirm intended.
lasso = Lasso(alpha=0.1, max_iter=10000)
lasso.fit(X, y)
lasso_r2 = lasso.score(X, y)
print("Lasso R^2:", lasso_r2)
print("Coefficients:", lasso.coef_)


Lasso R^2: 1.097322038745574e-05
Coefficients: [ 0.00000000e+00  4.72189809e-05 -0.00000000e+00]


### 5.) Elastic Net

In [108]:
# Elastic Net with alpha=0.1 and an even L1/L2 mix (l1_ratio=0.5).
enet = ElasticNet(alpha=0.1, l1_ratio=0.5, max_iter=10000)
enet.fit(X, y)
enet_r2 = enet.score(X, y)
print("Elastic Net R^2:", enet_r2)
print("Coefficients:", enet.coef_)

Elastic Net R^2: 1.4071689458883263e-05
Coefficients: [ 0.00000000e+00  7.31028348e-05 -0.00000000e+00]


### 6.) Comparison Table

In [109]:
# Score every fitted estimator on X, y and key the result by its display name.
fitted = (("OLS", ols), ("Ridge", ridge), ("Lasso", lasso), ("ElasticNet", enet))
models = {label: estimator.score(X, y) for label, estimator in fitted}
# A Series prints as an aligned name/value table.
print(pd.Series(models))


OLS           0.000028
Ridge         0.000028
Lasso         0.000011
ElasticNet    0.000014
dtype: float64


### Discussion Week 2