In [1]:
import pandas as pd
import cv2
import os
import numpy as np
from numpy import nan
import tensorflow as tf
from sklearn.metrics import f1_score, r2_score
from sklearn.model_selection import KFold
from tensorflow.keras.callbacks import EarlyStopping

In [2]:
base_folder = "./dataset/"

sample = pd.read_csv(base_folder+"sample_submission.csv")

#CSV
csv_train = pd.read_csv(base_folder+"train.csv")
csv_test = pd.read_csv(base_folder+"test.csv")

In [3]:
csv_train

Unnamed: 0,Image_path,Insurance_company,Cost_of_vehicle,Min_coverage,Expiry_date,Max_coverage,Condition,Amount
0,img_4513976.jpg,BQ,41500.0,1037.5,2026-12-03,36142.68,0,0.0
1,img_7764995.jpg,BQ,50700.0,1267.5,2025-07-10,12753.00,1,6194.0
2,img_451308.jpg,A,49500.0,1237.5,2022-08-11,43102.68,0,0.0
3,img_7768372.jpg,A,33500.0,837.5,2022-08-02,8453.00,1,7699.0
4,img_7765274.jpg,AC,27600.0,690.0,2026-05-01,6978.00,1,8849.0
...,...,...,...,...,...,...,...,...
1394,img_4637237.jpg,DA,52300.0,1307.5,2025-02-17,13153.00,1,4565.0
1395,img_4637000.jpg,BQ,41500.0,1037.5,2023-12-30,10453.00,1,3363.0
1396,img_4637503.jpg,AA,31400.0,785.0,2022-11-25,7928.00,1,5336.0
1397,img_4515101.jpg,A,33200.0,830.0,2022-10-10,8378.00,1,8734.0


###### Data Preprocessing

In [4]:
#Deleting Train data with missing target
csv_train = csv_train.loc[~csv_train["Amount"].isnull(),:]

In [5]:
#Combining Train and Test data
csv_train["train"] = 1
csv_test["train"] = 0

dataset = pd.concat([csv_train,csv_test])

#Data Imputation with Average
for col in ['Image_path', 'Insurance_company', 'Cost_of_vehicle', 'Min_coverage','Expiry_date', 'Max_coverage']:
    if dataset[col].isnull().sum()>0:
        dataset[col].fillna(dataset[col].mean(), inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [6]:
dataset.isnull().sum()

Image_path             0
Insurance_company      0
Cost_of_vehicle        0
Min_coverage           0
Expiry_date            0
Max_coverage           0
Condition            600
Amount               600
train                  0
dtype: int64

In [7]:
dataset = pd.get_dummies(dataset, columns=["Insurance_company"])

In [8]:
dataset["Expiry_date"] = pd.to_datetime(dataset["Expiry_date"] )

dataset["day"] = dataset["Expiry_date"].dt.day
dataset["month"] = dataset["Expiry_date"].dt.month
dataset["year"] = dataset["Expiry_date"].dt.year

In [9]:
del dataset["Expiry_date"]

In [10]:
dataset.dtypes

Image_path               object
Cost_of_vehicle         float64
Min_coverage            float64
Max_coverage            float64
Condition               float64
Amount                  float64
train                     int64
Insurance_company_A       uint8
Insurance_company_AA      uint8
Insurance_company_AC      uint8
Insurance_company_B       uint8
Insurance_company_BB      uint8
Insurance_company_BC      uint8
Insurance_company_BQ      uint8
Insurance_company_C       uint8
Insurance_company_DA      uint8
Insurance_company_O       uint8
Insurance_company_RE      uint8
day                       int64
month                     int64
year                      int64
dtype: object

In [11]:
csv_train["train"] = 1
csv_test["train"] = 0

csv_train = dataset[dataset["train"]==1]
csv_test = dataset[dataset["train"]==0]

del csv_train["train"]
del csv_test["train"]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [12]:
csv_train.shape, csv_test.shape

((1388, 20), (600, 20))

In [13]:
def load_images_from_folder(folder,path_list):
    images = []
    for filename in path_list:
        img = cv2.imread(os.path.join(folder,filename))
        img = cv2.resize(img, (255,255), interpolation = cv2.INTER_AREA)
        images.append(img)
    return images

imgs_train = load_images_from_folder("./dataset/trainImages/",csv_train["Image_path"])
imgs_test = load_images_from_folder("./dataset/testImages/",csv_test["Image_path"])

In [14]:
del csv_train["Image_path"]
del csv_test["Image_path"]

In [15]:
csv_train.shape, csv_test.shape

((1388, 19), (600, 19))

###### Modelling

In [16]:
##########Pretrained Model for Image##########
pretrained = tf.keras.applications.VGG16(include_top = False,
                           input_shape = (255,255,3),
                           pooling = None)

#Freezing all layers
for layer in pretrained.layers:
    layer.trainable = False
    
    
model1_in = pretrained.input
tmp = tf.keras.layers.Flatten()(pretrained.output)
model1_out = tf.keras.layers.Dense(tmp.shape[1], activation='relu', name='layer_2_m1')(tmp)
model1 = tf.keras.models.Model(model1_in, model1_out)

##########Dense Layer for Tabular Data##########
model2_in = tf.keras.layers.Input(shape=csv_train.shape[1]-2)
model2_out = tf.keras.layers.Dense(csv_train.shape[1]-2, activation='relu', name='layer_2_m2')(model2_in)
model2 = tf.keras.models.Model(model2_in, model2_out)


##########Concatinating Tabular and Image Networks##########
concatenated = tf.keras.layers.concatenate([model1_out, model2_out])
temp1 = tf.keras.layers.Dense(70)(concatenated)
temp2 = tf.keras.layers.Dropout(0.3)(temp1)
temp3 = tf.keras.layers.Dense(30)(temp2)
temp4 = tf.keras.layers.Dropout(0.3)(temp3)
#out = tf.keras.layers.Dense(2)(temp4)

out1_in = temp4
out1_out = tf.keras.layers.Dense(1, activation='softmax', name='Classification')(out1_in)
#out1 = tf.keras.models.Model(out1_in, out1_out)

out2_in = temp4
out2_out = tf.keras.layers.Dense(1, name='Regression')(out2_in)
#out2 = tf.keras.models.Model(out2_in, out2_out)

#out = tf.keras.layers.concatenate([out1_out, out2_out])

merged_model = tf.keras.models.Model([model1_in, model2_in], [out1_out,out2_out])

In [17]:
merged_model.summary()

Model: "model_2"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, 255, 255, 3) 0                                            
__________________________________________________________________________________________________
block1_conv1 (Conv2D)           (None, 255, 255, 64) 1792        input_1[0][0]                    
__________________________________________________________________________________________________
block1_conv2 (Conv2D)           (None, 255, 255, 64) 36928       block1_conv1[0][0]               
__________________________________________________________________________________________________
block1_pool (MaxPooling2D)      (None, 127, 127, 64) 0           block1_conv2[0][0]               
____________________________________________________________________________________________

In [18]:
true , pred = -1, -1
def custom_metric(y_true, y_pred):
    global true , pred
    true , pred = y_true, y_pred
    print(y_true, y_pred)
    
    y_true_img, y_true_csv = y_true
    y_pred_img, y_pred_csv = y_pred
    classification_error = max(0,100*f1_score(y_true_img, y_pred_img, average='macro'))
    regression_error = max(0,100*r2_score(y_true_csv, y_pred_csv))    
    return (classification_error+regression_error)/2

In [19]:
from keras import backend as K

def recall_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    possible_positives = K.sum(K.round(K.clip(y_true, 0, 1)))
    recall = true_positives / (possible_positives + K.epsilon())
    return recall

def precision_m(y_true, y_pred):
    true_positives = K.sum(K.round(K.clip(y_true * y_pred, 0, 1)))
    predicted_positives = K.sum(K.round(K.clip(y_pred, 0, 1)))
    precision = true_positives / (predicted_positives + K.epsilon())
    return precision

def f1_m(y_true, y_pred):
    precision = precision_m(y_true, y_pred)
    recall = recall_m(y_true, y_pred)
    return 2*((precision*recall)/(precision+recall+K.epsilon()))

In [20]:
from keras import backend as K

def R2(y_true, y_pred):
    from keras import backend as K
    SS_res =  K.sum(K.square( y_true-y_pred ))
    SS_tot = K.sum(K.square( y_true - K.mean(y_true) ) )
    return ( 1 - SS_res/(SS_tot + K.epsilon()) )

In [21]:
def custom_metric(y_true, y_pred):
    classification_err = max(0,100*f1_m(y_true, y_pred))
    regression_err = max(0,100*R2(y_true, y_pred))
    return (classification_err+regression_err)/2

In [22]:
def root_mean_squared_error(y_true, y_pred):
        return K.sqrt(K.mean(K.square(y_pred - y_true)))

In [23]:
"""
merged_model.compile(optimizer=TF.keras.optimizers.Adam(lr=0.001), 
                           loss=custom_loss)
"""
merged_model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001),
                     loss={'Regression': root_mean_squared_error, "Classification":f1_m},
                    )


In [24]:
X_img =  np.asarray([t_img for t_img in imgs_train])

csv_train = csv_train.reset_index(drop=True)
X_csv =  csv_train.loc[:, ((csv_train.columns!="Condition")&(csv_train.columns!="Amount"))]

y = csv_train.loc[:, ["Condition","Amount"]]

In [25]:
kf = KFold(n_splits=4)

split_predictions = pd.DataFrame()
count = 1

early_stopping = EarlyStopping(monitor='val_loss',
                              patience=5)

for train_index, val_index in kf.split(X_csv, y):
    x_train_img = X_img[train_index]
    x_val_img = X_img[val_index]
    
    x_train_csv = X_csv.loc[train_index, :]
    x_val_csv = X_csv.loc[val_index,:]
    
    y_train = y.loc[train_index, :]
    y_val = y.loc[val_index,:]
    
    merged_model.fit(
        [
            np.asarray([t_img for t_img in x_train_img]),
            x_train_csv.loc[:, ((x_train_csv.columns!="Condition")&(x_train_csv.columns!="Amount"))]
        ],
        y_train,
        epochs=30,
        batch_size=50,
        validation_data=(
            [
                x_val_img,
                x_val_csv
            ],
            y_val),
        callbacks=[early_stopping]
    )
    
    predictions = merged_model.predict(
        [
            np.asarray([t_img for t_img in imgs_test]),
            csv_test.loc[:, ((csv_test.columns!="Condition")&(csv_test.columns!="Amount"))]
        ])

    split_predictions["Condition"+str(count)] = pd.Series([i[0] for i in predictions[0]])
    split_predictions["Amount"+str(count)] = pd.Series([i[0] for i in predictions[1]])
    
    count+=1

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30

KeyboardInterrupt: 

In [72]:
merged_model.fit(
    [
        np.asarray([t_img for t_img in imgs_train]),
        csv_train.loc[:, ((csv_train.columns!="Condition")&(csv_train.columns!="Amount"))]
    ],
    csv_train.loc[:, ["Condition","Amount"]],
    epochs=3,
    batch_size=50) #batch_size, validation_data=(val_x, val_y)

Epoch 1/3
Epoch 2/3
Epoch 3/3


<tensorflow.python.keras.callbacks.History at 0x1cf841e62c8>

In [73]:
predictions = merged_model.predict(
    [
        np.asarray([t_img for t_img in imgs_test]),
        csv_test.loc[:, ((csv_test.columns!="Condition")&(csv_test.columns!="Amount"))]
    ])


#predictions = pd.DataFrame(predictions,columns=["Condition","Amount"])
predictions

[array([[1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
        [1.],
      

In [75]:
tmp = pd.DataFrame()

tmp["Condition"] = pd.Series([i[0] for i in predictions[0]])
tmp["Amount"] = pd.Series([i[0] for i in predictions[1]])

predictions = tmp

t_csv_test = pd.read_csv(base_folder+"test.csv")

predictions["Image_path"] = t_csv_test["Image_path"]
predictions[["Image_path","Condition","Amount"]].to_csv("rmse_f1_prediction.csv")

KeyError: 0

In [29]:
predictions = pd.read_csv("first_prediction_treated_thresh_100.csv")

In [61]:
csv_train.loc[csv_train["Condition"]==1,"Amount"].min(), csv_train.loc[csv_train["Condition"]==1,"Amount"].mean(), csv_train.loc[csv_train["Condition"]==1,"Amount"].max()

(-999.0, 4433.356089992242, 59844.0)

In [62]:
csv_train.loc[csv_train["Condition"]==0,"Amount"].min(), csv_train.loc[csv_train["Condition"]==0,"Amount"].mean(), csv_train.loc[csv_train["Condition"]==0,"Amount"].max()

(0.0, 0.0, 0.0)

In [64]:
csv_train["Condition"].value_counts()*100/len(csv_train)

1.0    92.867435
0.0     7.132565
Name: Condition, dtype: float64

In [54]:
predictions.loc[predictions["Condition"]<=0, ["Condition","Amount"]] = 0
predictions.loc[predictions["Condition"]>0, "Condition"] = 1

predictions["Condition"].value_counts()*100/len(predictions)

0.0    92.333333
1.0     7.666667
Name: Condition, dtype: float64

In [79]:
predictions[["Image_path","Condition","Amount"]].to_csv("C.csv")

In [77]:
predictions.loc[predictions["Amount"]<=100,["Condition","Amount"]] = 0

In [78]:
predictions.loc[predictions["Amount"]>100,["Condition",]] = 1

In [94]:
t_csv_test = pd.read_csv(base_folder+"test.csv")

In [95]:
predictions["Image_path"] = t_csv_test["Image_path"]

In [96]:
predictions[["Image_path","Condition","Amount"]].to_csv("first_prediction_treated_thresh_100.csv")

In [30]:
predictions["Condition"].value_counts()*100/len(predictions)

1.0    95.166667
0.0     4.833333
Name: Condition, dtype: float64