In [41]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

# Import dataset
from sklearn.datasets import load_breast_cancer

# For precision, recall, etc.
from sklearn.metrics import classification_report
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Import the selector module, and the accuracy_score module to computer performance
from sklearn.metrics import f1_score
from mlxtend.feature_selection import SequentialFeatureSelector as sfs
import matplotlib.pyplot as plt


In [42]:
df = pd.read_csv('Advanced Features Claims Data.csv')

In [43]:
df.head()

Unnamed: 0.1,Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_state,policy_csl,policy_deductable,policy_annual_premium,umbrella_limit,...,auto_model,auto_year,fraud_reported,fraud_reported_numeric,ages_category,customer_category,Contract Years,total_premiums_paid,net_value_of_customer,Positive_Net_Value
0,0,328,48,521585,41929.0,OH,250/500,1000,1406.91,0,...,92x,2004,Y,1,Middle Aged,Long-Term Client,27.333333,38455.54,-33154.46,N
1,1,228,42,342868,38895.0,IN,250/500,2000,1197.22,5000000,...,E400,2007,Y,1,Middle Aged,Long-Term Client,19.0,22747.18,17677.18,Y
2,2,134,29,687698,36775.0,OH,100/300,2000,1413.14,5000000,...,RAM,2007,N,0,Young Adult,Established Client,11.166667,15780.063333,-18869.936667,N
3,3,256,41,227811,33018.0,IL,250/500,2000,1415.74,6000000,...,Tahoe,2014,Y,1,Middle Aged,Long-Term Client,21.333333,30202.453333,-33197.546667,N
4,4,228,44,367455,41796.0,IL,500/1000,1000,1583.91,6000000,...,RSX,2009,N,0,Middle Aged,Long-Term Client,19.0,30094.29,23594.29,Y


In [82]:
# labels
y = df['Positive_Net_Value']

# features
X = df.drop(['Positive_Net_Value', 'net_value_of_customer', 'total_premiums_paid', 'total_claim_amount', 'ages_category','Unnamed: 0'], axis=1)

In [83]:
# Transforming the Features
X_transformed = pd.get_dummies(X, drop_first=True)

In [84]:
X_train, X_test, y_train, y_test = train_test_split(X_transformed, y, test_size=0.2, random_state=50)

In [85]:
X_train.head()

Unnamed: 0,months_as_customer,age,policy_number,policy_bind_date,policy_deductable,policy_annual_premium,umbrella_limit,insured_zip,capital-gains,capital-loss,...,auto_model_Silverado,auto_model_TL,auto_model_Tahoe,auto_model_Ultima,auto_model_Wrangler,auto_model_X5,auto_model_X6,fraud_reported_Y,customer_category_Long-Term Client,customer_category_New Client
886,141,30,556538,36722.0,1000,1851.78,0,465248,78800,0,...,0,0,0,0,0,0,0,0,0,0
488,294,46,735307,40331.0,500,1532.8,0,473935,0,0,...,0,0,0,0,0,0,0,0,1,0
265,394,57,395572,36249.0,500,1401.2,0,619892,51500,0,...,0,0,0,0,0,0,0,0,1,0
112,210,41,395269,41215.0,500,1222.75,0,432781,0,-41000,...,0,0,0,0,0,1,0,0,1,0
650,128,31,526296,34184.0,500,1045.12,0,615311,0,-28300,...,0,0,0,0,0,0,0,1,0,0


In [86]:
lm_full = LogisticRegression(max_iter=100000, C=0.5, penalty='l2', solver='lbfgs')

In [87]:
# No transformations necessary
lm_full.fit(X_train, y_train)

STOP: TOTAL NO. of f AND g EVALUATIONS EXCEEDS LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [90]:
# Generate predictions from full model
pred_lm_full = lm_full.predict(X_test)

In [91]:
print('Full Model')
print(classification_report(y_test, pred_lm_full, target_names=['Fraud', 'No Fraud']))



Full Model
              precision    recall  f1-score   support

       Fraud       0.98      1.00      0.99       160
    No Fraud       1.00      0.93      0.96        40

    accuracy                           0.98       200
   macro avg       0.99      0.96      0.98       200
weighted avg       0.99      0.98      0.98       200



In [92]:
# Get coefficients from the logistic regression model
coefficients = lm_full.coef_[0]

# Get feature names
feature_names = X_train.columns

# Combine feature names and coefficients into a DataFrame
coefficients_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': coefficients
})

# Sort coefficients by absolute value to identify the most influential features
coefficients_df['Absolute_Coefficient'] = np.abs(coefficients_df['Coefficient'])
coefficients_df = coefficients_df.sort_values(by='Absolute_Coefficient', ascending=False)

# Display the top features contributing to the model
top_features = coefficients_df.head(20)
print("Top 20 Features Contributing to the Logistic Regression Model:")
print(top_features)

Top 20 Features Contributing to the Logistic Regression Model:
                           Feature  Coefficient  Absolute_Coefficient
0               months_as_customer     0.048664              0.048664
1                              age    -0.028411              0.028411
18                       auto_year     0.009198              0.009198
5            policy_annual_premium     0.008160              0.008160
20                  Contract Years     0.004055              0.004055
11        incident_hour_of_the_day    -0.003221              0.003221
14                       witnesses     0.002849              0.002849
12     number_of_vehicles_involved    -0.002038              0.002038
13                 bodily_injuries     0.002020              0.002020
90         incident_city_Hillsdale    -0.001649              0.001649
23              policy_csl_250/500    -0.001396              0.001396
82    authorities_contacted_Police     0.001271              0.001271
1104                auto_ma