### **Automated Fraud Detection in Vehicle Insurance Claims using Computer Vision**

In [157]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import ResNet50
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.resnet50 import preprocess_input
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [158]:
# Define directory paths
image_dir = '/Users/trilokesh/Desktop/WNS HAckathon/train/images'
csv_file = '/Users/trilokesh/Desktop/WNS HAckathon/train/train.csv'

# Read the CSV file
df = pd.read_csv(csv_file)

# Print the DataFrame
print(df)


      image_id  filename  label
0            1     1.jpg      0
1            2     2.jpg      0
2            3     3.jpg      0
3            4     4.jpg      0
4            5     5.jpg      0
...        ...       ...    ...
8074      8075  8075.jpg      0
8075      8076  8076.jpg      1
8076      8077  8077.jpg      0
8077      8078  8078.jpg      0
8078      8079  8079.jpg      0

[8079 rows x 3 columns]


In [159]:
# Read the CSV file
labels_df = pd.read_csv(csv_file)

# Create an empty DataFrame to store image data
image_data = []

# Iterate through the files in the image directory
for filename in os.listdir(image_dir):
    # Check if the file is an image file
    if filename.endswith('.jpg'):
        # Extract the label from the CSV file based on the filename
        label_row = labels_df[labels_df['filename'] == filename]
        if not label_row.empty:
            label = label_row.iloc[0]['label']
            
            # Append the filename and label to the image data list
            image_data.append({'filename': filename, 'label': label})

# Create a DataFrame from the image data list
image_df = pd.DataFrame(image_data)

# Display the DataFrame
print(image_df.head())


   filename  label
0    63.jpg      1
1  6400.jpg      0
2   823.jpg      0
3  4217.jpg      0
4  3578.jpg      0


In [160]:
labels_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8079 entries, 0 to 8078
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   image_id  8079 non-null   int64 
 1   filename  8079 non-null   object
 2   label     8079 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 189.5+ KB


In [161]:
labels_df['label'].value_counts()

0    7614
1     465
Name: label, dtype: int64

In [162]:
labels_df.iloc[-1]

image_id        8079
filename    8079.jpg
label              0
Name: 8078, dtype: object

In [163]:
labels_df['label'].value_counts()

0    7614
1     465
Name: label, dtype: int64

In [165]:
import pandas as pd
# Assuming labels_df is your DataFrame containing the data
class_1 = labels_df[labels_df['label'] == 1]
# Concatenate class 1 with itself until it reaches 2000 samples, then take the first 2000 samples
labels_df_balanced = pd.concat([labels_df[labels_df['label'] == 0], pd.concat([class_1] * ((2000 - len(class_1)) // len(class_1) + 1), ignore_index=True)[:2000]])
# Shuffle the concatenated data frame
labels_df_balanced = labels_df_balanced.sample(frac=1, random_state=42)
# Check the class distribution in the balanced data frame
labels_df_balanced['label'].value_counts()


0    7614
1    1860
Name: label, dtype: int64

In [166]:
import pandas as pd

# Assuming labels_df_balanced is your DataFrame containing the balanced data
labels_df_balanced_sampled = labels_df_balanced.groupby('label').apply(lambda x: x.sample(n=1860, random_state=42)).reset_index(drop=True)

# Check the class distribution in the sampled data frame
labels_df_balanced_sampled['label'].value_counts()


0    1860
1    1860
Name: label, dtype: int64

In [175]:
labels_df_balanced=labels_df_balanced_sampled

In [176]:
labels_df_balanced['label'].value_counts()

0    1860
1    1860
Name: label, dtype: int64

In [177]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.applications import VGG16
from tensorflow.keras.preprocessing.image import load_img, img_to_array
from tensorflow.keras.applications.vgg16 import preprocess_input
from sklearn.model_selection import train_test_split

# Load pre-trained VGG16 model
base_model = VGG16(weights='imagenet', include_top=False, input_shape=(224, 224, 3))

# Extract features using VGG16
def extract_features(image_path):
    img = load_img(image_path, target_size=(224, 224))
    img_array = img_to_array(img)
    img_array = np.expand_dims(img_array, axis=0)
    img_array = preprocess_input(img_array)
    features = base_model.predict(img_array)
    features = np.squeeze(features)
    return features

# Extract features for all images
image_features = []
labels = []

for filename in labels_df_balanced['filename']:
    image_path = os.path.join(image_dir, filename)
    label = labels_df_balanced.loc[labels_df_balanced['filename'] == filename, 'label'].values[0]
    features = extract_features(image_path)
    image_features.append(features)
    labels.append(label)

# Convert lists to arrays
X = np.array(image_features)
y = np.array(labels)





In [178]:
# Check the shape of extracted features
print("Shape of extracted features:", X.shape)

Shape of extracted features: (3720, 7, 7, 512)


In [179]:
# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Further split training data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

In [180]:
# Define your classifier model
classifier = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=X_train.shape[1:]),
    tf.keras.layers.Dense(256, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dropout(0.5),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
classifier.compile(optimizer='adam',
                   loss='binary_crossentropy',
                   metrics=['accuracy'])

# Train the classifier model
history = classifier.fit(X_train, y_train,
                         epochs=10,
                         batch_size=8,
                         validation_data=(X_test, y_test))


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [181]:
# Evaluate the model on the validation set
val_loss, val_accuracy = classifier.evaluate(X_val, y_val)
# Print the validation loss and accuracy
print("Validation Loss:", val_loss)
print("Validation Accuracy:", val_accuracy)

Validation Loss: 0.1723548024892807
Validation Accuracy: 0.9177852272987366


In [182]:
from sklearn.metrics import classification_report

# Predict probabilities for the test set
y_pred_prob = classifier.predict(X_test)

# Threshold probabilities to get predicted classes
y_pred = (y_pred_prob > 0.5).astype(int)

# Print classification report
print(classification_report(y_test, y_pred))


              precision    recall  f1-score   support

           0       0.90      0.97      0.94       385
           1       0.97      0.89      0.93       359

    accuracy                           0.93       744
   macro avg       0.94      0.93      0.93       744
weighted avg       0.93      0.93      0.93       744



In [183]:
# Define the path to save the model
model_path = 'best_model_WNS_image classification.h5'

# Save the model
classifier.save(model_path)

print("Model saved successfully.")

Model saved successfully.


  saving_api.save_model(


In [184]:
# Directory containing test images
test_image_dir = '/Users/trilokesh/Desktop/WNS HAckathon/test/images'

# Extract features for test images
test_image_features = []
test_image_names = []

for filename in os.listdir(test_image_dir):
    if filename.endswith('.jpg'):
        image_path = os.path.join(test_image_dir, filename)
        features = extract_features(image_path)
        test_image_features.append(features)
        test_image_names.append(filename)

# Convert test features to array
X_test = np.array(test_image_features)

# Make predictions on test data
y_pred_proba = classifier.predict(X_test)
y_pred = (y_pred_proba > 0.5).astype(int)




In [147]:
# Create a DataFrame with image names and predicted labels
test_results_df = pd.DataFrame({'image_id': test_image_names, 'label': y_pred.flatten()})

In [148]:
# Sort the DataFrame by 'image_id'
test_results_df = test_results_df.sort_values(by='image_id',ascending=False)

# Strip the 'image_id' to remove the file extension '.jpg'
test_results_df['image_id'] = test_results_df['image_id'].str.split('.').str[0]

# Print the sorted and stripped DataFrame
print(test_results_df)


     image_id  label
1533     9999      0
1721     9998      0
2158     9997      1
1973     9996      1
1749     9995      0
...       ...    ...
2469    10004      0
2115    10003      1
2016    10002      0
1803    10001      0
1893    10000      0

[3462 rows x 2 columns]


In [149]:
# Save predictions to CSV file
test_results_df.to_csv('test_predictions.csv', index=False)

print("Test predictions saved to test_predictions.csv")

Test predictions saved to test_predictions.csv


In [25]:
t=pd.read_csv('test_predictions.csv')

In [26]:
t.head()

Unnamed: 0,image_id,label
0,9999,0
1,9998,0
2,9997,0
3,9996,0
4,9995,0


In [28]:
t['label'].value_counts()

0    3388
1      74
Name: label, dtype: int64

In [115]:
t.head()

Unnamed: 0,image_id,label
0,9999,0
1,9998,0
2,9997,0
3,9996,0
4,9995,0


In [116]:
from tensorflow.keras.models import load_model

# Provide the file path of the saved model
model_path = 'resnet_model.h5'
# Load the saved model
resnet_model = load_model(model_path)
# Display model summary
resnet_model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 flatten_3 (Flatten)         (None, 62720)             0         
                                                                 
 dense_9 (Dense)             (None, 256)               16056576  
                                                                 
 dropout_6 (Dropout)         (None, 256)               0         
                                                                 
 dense_10 (Dense)            (None, 128)               32896     
                                                                 
 dropout_7 (Dropout)         (None, 128)               0         
                                                                 
 dense_11 (Dense)            (None, 1)                 129       
                                                                 
Total params: 16089601 (61.38 MB)
Trainable params: 16