In [3]:
import pandas as pd

# Load the dataset from the provided link
url = "https://raw.githubusercontent.com/FlipRoboTechnologies/ML_-Datasets/main/Insurance%20Claim%20Fraud%20Detection/Automobile_insurance_fraud.csv"
df = pd.read_csv(url)

# Display the first few rows of the dataset to understand its structure
print(df.head())

# Check the dimensions of the dataset
print("Dataset dimensions:", df.shape)

# Check for missing values
print(df.isnull().sum())

# Summary statistics of numerical columns
print(df.describe())

# Check data types of columns
print(df.dtypes)


   328  48  521585  17-10-2014  OH   250/500  1000  1406.91        0  466132  \
0  228  42  342868  27-06-2006  IN   250/500  2000  1197.22  5000000  468176   
1  134  29  687698  06-09-2000  OH   100/300  2000  1413.14  5000000  430632   
2  256  41  227811  25-05-1990  IL   250/500  2000  1415.74  6000000  608117   
3  228  44  367455  06-06-2014  IL  500/1000  1000  1583.91  6000000  610706   
4  256  39  104594  12-10-2006  OH   250/500  1000  1351.10        0  478456   

   ...  2 YES.1  71610  6510 13020  52080       Saab    92x  2004  Y  
0  ...  0     ?   5070   780   780   3510   Mercedes   E400  2007  Y  
1  ...  3    NO  34650  7700  3850  23100      Dodge    RAM  2007  N  
2  ...  2    NO  63400  6340  6340  50720  Chevrolet  Tahoe  2014  Y  
3  ...  1    NO   6500  1300   650   4550     Accura    RSX  2009  N  
4  ...  2    NO  64100  6410  6410  51280       Saab     95  2003  Y  

[5 rows x 39 columns]
Dataset dimensions: (999, 39)
328                          0
48       

In [5]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder

# Assuming df is already loaded from the previous step

# Check the columns present in the dataset
print("Columns in the dataset:", df.columns)

# Drop unnecessary columns if they exist
columns_to_drop = ['policy_number', '_c39']
df = df.drop(columns_to_drop, axis=1, errors='ignore')  # Use errors='ignore' to avoid KeyError

# Handle missing values
# Check which columns have missing values
print("Missing values before imputation:\n", df.isnull().sum())

# Impute missing values for numerical columns with mean or median
numerical_cols = df.select_dtypes(include='number').columns
df[numerical_cols] = df[numerical_cols].fillna(df[numerical_cols].mean())

# Impute missing values for categorical columns with mode
categorical_cols = df.select_dtypes(include='object').columns
df[categorical_cols] = df[categorical_cols].fillna(df[categorical_cols].mode().iloc[0])

# Convert categorical variables into numerical representations using LabelEncoder
label_encoder = LabelEncoder()
for col in categorical_cols:
    if df[col].dtype == 'object':
        df[col] = label_encoder.fit_transform(df[col])

# Verify if there are any remaining missing values
print("Missing values after imputation:\n", df.isnull().sum())

# Verify changes and final dataset
print("Final dataset:\n", df.head())


Columns in the dataset: Index(['328', '48', '521585', '17-10-2014', 'OH', '250/500', '1000', '1406.91',
       '0', '466132', 'MALE', 'MD', 'craft-repair', 'sleeping', 'husband',
       '53300', '0.1', '25-01-2015', 'Single Vehicle Collision',
       'Side Collision', 'Major Damage', 'Police', 'SC', 'Columbus',
       '9935 4th Drive', '5', '1', 'YES', '1.1', '2', 'YES.1', '71610', '6510',
       '13020', '52080', 'Saab', '92x', '2004', 'Y'],
      dtype='object')
Missing values before imputation:
 328                          0
48                           0
521585                       0
17-10-2014                   0
OH                           0
250/500                      0
1000                         0
1406.91                      0
0                            0
466132                       0
MALE                         0
MD                           0
craft-repair                 0
sleeping                     0
husband                      0
53300                        0


In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming df is already loaded and cleaned from earlier steps
# Print columns in the dataset to verify 'fraud_reported' existence
print("Columns in the dataset:", df.columns)

# Ensure the target variable 'fraud_reported' exists in the dataset
if 'fraud_reported' in df.columns:
    # Define features (independent variables) and target variable (dependent variable)
    X = df.drop(['fraud_reported'], axis=1)  # Features
    y = df['fraud_reported']  # Target variable

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Initialize the model (Logistic Regression as an example)
    model = LogisticRegression(random_state=42)

    # Train the model on the training data
    model.fit(X_train, y_train)

    # Predictions on the test set
    y_pred = model.predict(X_test)

    # Evaluate the model
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)

    # Classification report
    print("\nClassification Report:")
    print(classification_report(y_test, y_pred))

    # Confusion matrix
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_test, y_pred))
else:
    print("Target variable 'fraud_reported' not found in the dataset.")


Columns in the dataset: Index(['328', '48', '521585', '17-10-2014', 'OH', '250/500', '1000', '1406.91',
       '0', '466132', 'MALE', 'MD', 'craft-repair', 'sleeping', 'husband',
       '53300', '0.1', '25-01-2015', 'Single Vehicle Collision',
       'Side Collision', 'Major Damage', 'Police', 'SC', 'Columbus',
       '9935 4th Drive', '5', '1', 'YES', '1.1', '2', 'YES.1', '71610', '6510',
       '13020', '52080', 'Saab', '92x', '2004', 'Y'],
      dtype='object')
Target variable 'fraud_reported' not found in the dataset.


In [15]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Assuming df is already loaded and cleaned from earlier steps
# Print columns in the dataset to verify 'fraud_reported' existence
print("Columns in the dataset:", df.columns)

# Ensure the target variable 'fraud_reported' exists in the dataset
if 'fraud_reported' in df.columns:
    # Define features (independent variables) and target variable (dependent variable)
    X = df.drop(['fraud_reported'], axis=1)  # Features
    y = df['fraud_reported']  # Target variable

    # Split the data into training and testing sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Define the parameter grid for GridSearchCV
    param_grid = {'C': [0.1, 1, 10, 100]}  # Example of regularization parameter C

    # Initialize GridSearchCV
    grid_search = GridSearchCV(LogisticRegression(random_state=42), param_grid, cv=5)

    # Fit GridSearchCV on the training data
    grid_search.fit(X_train, y_train)

    # Get the best estimator and its parameters
    best_model = grid_search.best_estimator_
    print("Best Parameters:", grid_search.best_params_)

    # Evaluate the best model on the test set
    y_pred_best = best_model.predict(X_test)
    accuracy_best = accuracy_score(y_test, y_pred_best)
    print("Accuracy with Best Model:", accuracy_best)

    # Classification report and confusion matrix for the best model
    print("\nClassification Report (Best Model):")
    print(classification_report(y_test, y_pred_best))

    print("\nConfusion Matrix (Best Model):")
    print(confusion_matrix(y_test, y_pred_best))
else:
    print("Target variable 'fraud_reported' not found in the dataset.")


Columns in the dataset: Index(['328', '48', '521585', '17-10-2014', 'OH', '250/500', '1000', '1406.91',
       '0', '466132', 'MALE', 'MD', 'craft-repair', 'sleeping', 'husband',
       '53300', '0.1', '25-01-2015', 'Single Vehicle Collision',
       'Side Collision', 'Major Damage', 'Police', 'SC', 'Columbus',
       '9935 4th Drive', '5', '1', 'YES', '1.1', '2', 'YES.1', '71610', '6510',
       '13020', '52080', 'Saab', '92x', '2004', 'Y'],
      dtype='object')
Target variable 'fraud_reported' not found in the dataset.
