In [62]:
import os
import pickle
import pandas as pd
import json

# Directories containing the pickle files; both live under the same parent folder
_preds_root = '/ceph/hpc/data/st2207-pgp-users/ldragar/Marijaproject/SelfBlendedImages'
pickle_path_codeformer = f'{_preds_root}/preds_codeformer'
pickle_path_original = f'{_preds_root}/preds_original_vids2'

# Function to load pickle files from a directory into a list of dataframes
def load_pickle_files(pickle_path):
    """Load every pickle file found directly inside ``pickle_path``.

    Entries are visited in sorted filename order so the returned list (and
    anything concatenated from it) is identical on every run; ``os.listdir``
    on its own yields entries in arbitrary order. Sub-directories are skipped
    rather than being handed to ``open``.

    Args:
        pickle_path: Directory whose files are unpickled.

    Returns:
        list: One unpickled object per regular file, in filename order.
        Empty when the directory holds no regular files.

    Raises:
        OSError: If ``pickle_path`` cannot be listed or a file cannot be read.
    """
    all_preds = []
    for file_name in sorted(os.listdir(pickle_path)):
        file_path = os.path.join(pickle_path, file_name)
        if not os.path.isfile(file_path):
            continue
        # NOTE(security): pickle.load can execute arbitrary code embedded in
        # the file, so only point this at directories you trust.
        with open(file_path, 'rb') as f:
            all_preds.append(pickle.load(f))
    return all_preds

# Unpickle each prediction directory and stack its contents into a single frame
all_preds_codeformer = load_pickle_files(pickle_path_codeformer)
df_codeformer = pd.concat(all_preds_codeformer)

all_preds_original = load_pickle_files(pickle_path_original)
df_original = pd.concat(all_preds_original)

# Quick look at the first rows and the shape of both frames
print(df_codeformer.head(), df_original.head())
print(df_codeformer.shape, df_original.shape)

# Keep the bare file name of every video path in a 'vid_basename' column
for frame in (df_codeformer, df_original):
    frame["vid_basename"] = frame["video"].apply(os.path.basename)


def _load_json(json_path):
    """Parse the JSON file at ``json_path`` and return the decoded object."""
    with open(json_path) as fh:
        return json.load(fh)


# Split definition files (train / val / test)
train = _load_json('/ceph/hpc/data/st2207-pgp-users/ldragar/Marijaproject/train.json')
val = _load_json('/ceph/hpc/data/st2207-pgp-users/ldragar/Marijaproject/val.json')
test = _load_json('/ceph/hpc/data/st2207-pgp-users/ldragar/Marijaproject/test.json')
# Create lists of file names by combining elements of each pair and adding the .mp4 extension
def create_file_list(pairs):
    """Build '<a>_<b>.mp4' names for every pair, followed by the swapped '<b>_<a>.mp4' names.

    Args:
        pairs: Sequence of pairs of strings.

    Returns:
        list[str]: All forward-order names first, then all reversed-order names.
    """
    def _as_mp4(parts):
        return "_".join(parts) + '.mp4'

    forward = list(map(_as_mp4, pairs))
    backward = [_as_mp4(pair[::-1]) for pair in pairs]
    return forward + backward

def _real_names(pairs):
    """Return '<id>.mp4' for the first element of every pair, then for the second element of every pair."""
    return [pair[pos] + '.mp4' for pos in (0, 1) for pair in pairs]


def _assign_split(frame, val_names, test_names):
    """Write a 'split' column into ``frame`` in place.

    Every row starts as 'train'; rows whose 'vid_basename' is in ``val_names``
    become 'val', and rows whose 'vid_basename' is in ``test_names`` become
    'test' (test is applied last, so it wins if a name is in both lists).
    """
    frame['split'] = 'train'
    basenames = frame['vid_basename']
    frame.loc[basenames.isin(val_names), 'split'] = 'val'
    frame.loc[basenames.isin(test_names), 'split'] = 'test'


# File names built from each pair in both orders ('a_b.mp4' and 'b_a.mp4')
train_files = create_file_list(train)
val_files = create_file_list(val)
test_files = create_file_list(test)

# File names of BOTH members of every pair ('a.mp4' and 'b.mp4')
train_real = _real_names(train)
val_real = _real_names(val)
test_real = _real_names(test)

# NOTE: train_files / train_real are not consulted when labelling; 'train' is
# the fallback for any basename that is absent from the val and test lists.

# Label the original videos and show the result
_assign_split(df_original, val_real, test_real)
print(df_original.head())

# Label the codeformer videos and show the result
_assign_split(df_codeformer, val_files, test_files)
print(df_codeformer.head())

# Index both frames by vid_basename and order them by that index
df_codeformer = df_codeformer.set_index('vid_basename').sort_index()
df_original = df_original.set_index('vid_basename').sort_index()


                                               video  \
0  /ceph/hpc/data/st2207-pgp-users/ldragar/Marija...   
1  /ceph/hpc/data/st2207-pgp-users/ldragar/Marija...   
2  /ceph/hpc/data/st2207-pgp-users/ldragar/Marija...   
3  /ceph/hpc/data/st2207-pgp-users/ldragar/Marija...   
4  /ceph/hpc/data/st2207-pgp-users/ldragar/Marija...   

                                               preds      mean     scale  \
0  [0.0025429681, 0.012217958, 0.010517527, 0.010...  0.058416  2.000000   
1  [0.8009218, 0.25600317, 0.12189435, 0.12106072...  0.072188  2.134375   
2  [0.0005287641, 0.000975369, 0.00066453347, 0.0...  0.003599  2.000000   
3  [0.52071476, 0.41383687, 0.64311093, 0.4810853...  0.621740  2.134375   
4  [0.02289241, 0.020640114, 0.013815321, 0.00520...  0.007637  2.134146   

   sanitypred  
0    0.999630  
1    0.392375  
2    0.998784  
3    0.998589  
4    0.956836                                                  video  \
0  /ceph/hpc/data/st2207-pgp-users/ldragar/Marija...  

In [63]:
# Bare expression: renders df_codeformer (indexed by vid_basename, with the 'split' column) as the cell output
df_codeformer

Unnamed: 0_level_0,video,preds,mean,scale,sanitypred,split
vid_basename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000_003.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.21521401, 0.6541736, 0.6582172, 0.64888835,...",0.748342,2.134375,0.995975,test
001_870.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.04179751, 0.15466411, 0.035751067, 0.035914...",0.069863,2.000000,0.999192,train
002_006.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0026411365, 0.008044865, 0.014432109, 0.004...",0.002163,2.000000,0.959078,train
003_000.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.071747094, 0.13728707, 0.053607468, 0.04295...",0.118011,2.134375,0.970985,test
004_982.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.32791942, 0.65048635, 0.9097166, 0.889481, ...",0.836489,2.000000,0.999707,val
...,...,...,...,...,...,...
995_233.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.8751523, 0.1019319, 0.028662963, 0.01785706...",0.137306,2.134375,0.997905,test
996_056.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.061223052, 0.8049624, 0.8318892, 0.9826325,...",0.847456,2.133333,0.999819,train
997_040.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.03697372, 0.0031647605, 0.004751568, 0.0123...",0.008861,2.133333,0.998872,train
998_561.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0076610083, 0.007005981, 0.021117808, 0.010...",0.027228,2.000000,0.987437,train


In [65]:
# Bare expression: renders df_original (indexed by vid_basename, with the 'split' column) as the cell output
df_original

Unnamed: 0_level_0,video,preds,mean,size,frame_count,split
vid_basename,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
000.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.032480344, 0.032480344, 0.032480344, 0.0324...",0.045246,"(480, 640, 3)",396,test
001.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0056635644, 0.0013595405, 0.0026788537, 0.0...",0.003251,"(720, 1280, 3)",460,train
002.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.001661605, 0.0032865955, 0.0009039449, 0.00...",0.004779,"(720, 1280, 3)",693,train
003.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.02016747, 0.026011985, 0.06545948, 0.092410...",0.014508,"(480, 640, 3)",303,test
004.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.045055524, 0.06715119, 0.62459284, 0.594894...",0.161992,"(720, 1280, 3)",309,val
...,...,...,...,...,...,...
995.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.08514435, 0.5780664, 0.8194503, 0.78110474,...",0.044756,"(480, 640, 3)",548,test
996.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0029959523, 0.0030336978, 0.0014628585, 0.0...",0.006804,"(480, 720, 3)",312,train
997.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0007065389, 0.00075586146, 0.0008623703, 0....",0.005697,"(480, 600, 3)",438,train
998.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.00041957808, 0.0005281228, 0.00075592234, 0...",0.004479,"(720, 1280, 3)",344,train


In [66]:
# Number of rows per split, first for the codeformer frame and then for the original frame
for frame in (df_codeformer, df_original):
    print(frame['split'].value_counts())

# Positional comparison: row i of one frame must carry the same split label as
# row i of the other, so this depends on the row order of both frames.
assert df_codeformer['split'].tolist() == df_original['split'].tolist()

split
train    720
test     140
val      140
Name: count, dtype: int64
split
train    720
test     140
val      140
Name: count, dtype: int64


In [67]:
# Stack both frames into one; the new 'type' column records which frame a row
# came from ('fake' = df_codeformer, 'real' = df_original)
frames_by_type = {'fake': df_codeformer, 'real': df_original}
df = pd.concat(frames_by_type, names=['type']).reset_index()
df


Unnamed: 0,type,vid_basename,video,preds,mean,scale,sanitypred,split,size,frame_count
0,fake,000_003.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.21521401, 0.6541736, 0.6582172, 0.64888835,...",0.748342,2.134375,0.995975,test,,
1,fake,001_870.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.04179751, 0.15466411, 0.035751067, 0.035914...",0.069863,2.000000,0.999192,train,,
2,fake,002_006.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0026411365, 0.008044865, 0.014432109, 0.004...",0.002163,2.000000,0.959078,train,,
3,fake,003_000.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.071747094, 0.13728707, 0.053607468, 0.04295...",0.118011,2.134375,0.970985,test,,
4,fake,004_982.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.32791942, 0.65048635, 0.9097166, 0.889481, ...",0.836489,2.000000,0.999707,val,,
...,...,...,...,...,...,...,...,...,...,...
1995,real,995.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.08514435, 0.5780664, 0.8194503, 0.78110474,...",0.044756,,,test,"(480, 640, 3)",548.0
1996,real,996.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0029959523, 0.0030336978, 0.0014628585, 0.0...",0.006804,,,train,"(480, 720, 3)",312.0
1997,real,997.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0007065389, 0.00075586146, 0.0008623703, 0....",0.005697,,,train,"(480, 600, 3)",438.0
1998,real,998.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.00041957808, 0.0005281228, 0.00075592234, 0...",0.004479,,,train,"(720, 1280, 3)",344.0


In [68]:
# Ground truth label: 1 for rows of type 'fake', 0 for everything else
df['ground_truth'] = df['type'].eq('fake').astype('int64')

df



Unnamed: 0,type,vid_basename,video,preds,mean,scale,sanitypred,split,size,frame_count,ground_truth
0,fake,000_003.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.21521401, 0.6541736, 0.6582172, 0.64888835,...",0.748342,2.134375,0.995975,test,,,1
1,fake,001_870.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.04179751, 0.15466411, 0.035751067, 0.035914...",0.069863,2.000000,0.999192,train,,,1
2,fake,002_006.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0026411365, 0.008044865, 0.014432109, 0.004...",0.002163,2.000000,0.959078,train,,,1
3,fake,003_000.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.071747094, 0.13728707, 0.053607468, 0.04295...",0.118011,2.134375,0.970985,test,,,1
4,fake,004_982.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.32791942, 0.65048635, 0.9097166, 0.889481, ...",0.836489,2.000000,0.999707,val,,,1
...,...,...,...,...,...,...,...,...,...,...,...
1995,real,995.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.08514435, 0.5780664, 0.8194503, 0.78110474,...",0.044756,,,test,"(480, 640, 3)",548.0,0
1996,real,996.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0029959523, 0.0030336978, 0.0014628585, 0.0...",0.006804,,,train,"(480, 720, 3)",312.0,0
1997,real,997.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0007065389, 0.00075586146, 0.0008623703, 0....",0.005697,,,train,"(480, 600, 3)",438.0,0
1998,real,998.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.00041957808, 0.0005281228, 0.00075592234, 0...",0.004479,,,train,"(720, 1280, 3)",344.0,0


In [69]:
# Where sanitypred is NaN, take the value of the 'mean' column from the same row
sanity_present = df['sanitypred'].notna()
df['sanitypred'] = df['sanitypred'].where(sanity_present, df['mean'])




In [70]:
# Bare expression: renders the combined frame so the filled 'sanitypred' column can be inspected
df

Unnamed: 0,type,vid_basename,video,preds,mean,scale,sanitypred,split,size,frame_count,ground_truth
0,fake,000_003.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.21521401, 0.6541736, 0.6582172, 0.64888835,...",0.748342,2.134375,0.995975,test,,,1
1,fake,001_870.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.04179751, 0.15466411, 0.035751067, 0.035914...",0.069863,2.000000,0.999192,train,,,1
2,fake,002_006.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0026411365, 0.008044865, 0.014432109, 0.004...",0.002163,2.000000,0.959078,train,,,1
3,fake,003_000.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.071747094, 0.13728707, 0.053607468, 0.04295...",0.118011,2.134375,0.970985,test,,,1
4,fake,004_982.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.32791942, 0.65048635, 0.9097166, 0.889481, ...",0.836489,2.000000,0.999707,val,,,1
...,...,...,...,...,...,...,...,...,...,...,...
1995,real,995.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.08514435, 0.5780664, 0.8194503, 0.78110474,...",0.044756,,0.044756,test,"(480, 640, 3)",548.0,0
1996,real,996.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0029959523, 0.0030336978, 0.0014628585, 0.0...",0.006804,,0.006804,train,"(480, 720, 3)",312.0,0
1997,real,997.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.0007065389, 0.00075586146, 0.0008623703, 0....",0.005697,,0.005697,train,"(480, 600, 3)",438.0,0
1998,real,998.mp4,/ceph/hpc/data/st2207-pgp-users/ldragar/Marija...,"[0.00041957808, 0.0005281228, 0.00075592234, 0...",0.004479,,0.004479,train,"(720, 1280, 3)",344.0,0


In [71]:
import numpy as np
from sklearn.metrics import roc_curve, auc
import plotly.graph_objects as go

# ROC computed over every row of df (no split filter is applied here),
# scoring each video with its 'mean' column against 'ground_truth'
y_true = df['ground_truth']
y_score = df['mean']
fpr, tpr, thresholds = roc_curve(y_true, y_score)
roc_auc = auc(fpr, tpr)

# Operating point that maximises TPR - FPR (Youden's J); located once and reused
best_idx = np.argmax(tpr - fpr)
best_threshold = thresholds[best_idx]
best_fpr = fpr[best_idx]
best_tpr = tpr[best_idx]

# ROC curve plus the dashed diagonal of a chance-level classifier
fig = go.Figure()
fig.add_trace(
    go.Scatter(x=fpr, y=tpr, mode='lines', name='ROC curve (area = %0.2f)' % roc_auc)
)
fig.add_trace(
    go.Scatter(x=[0, 1], y=[0, 1], line=dict(color='black', dash='dash'))
)

fig.update_layout(
    title='Receiver Operating Characteristic',
    xaxis=dict(title='False Positive Rate'),
    yaxis=dict(title='True Positive Rate'),
    showlegend=True,
)

print(f'Best threshold: {best_threshold}')

# Highlight the selected operating point with a red marker
fig.add_trace(
    go.Scatter(
        x=[best_fpr],
        y=[best_tpr],
        mode='markers',
        name='Best threshold',
        marker=dict(color='red', size=10),
    )
)

fig.show()

Best threshold: 0.042880285531282425


In [79]:
# Persist the combined frame as a pickle
results_path = '/ceph/hpc/data/st2207-pgp-users/ldragar/Marijaproject/codeformer_results.pkl'
df.to_pickle(results_path)




In [81]:
# Read the saved pickle back and show its column labels
results_path = '/ceph/hpc/data/st2207-pgp-users/ldragar/Marijaproject/codeformer_results.pkl'
loaded = pd.read_pickle(results_path)
loaded.columns

Index(['type', 'vid_basename', 'video', 'preds', 'mean', 'scale', 'sanitypred',
       'split', 'size', 'frame_count', 'ground_truth'],
      dtype='object')