In [None]:
from glob import glob
from shutil import copyfile
from tqdm import tqdm
import numpy as np
import pandas as pd
import random
import os
from concurrent.futures import ThreadPoolExecutor

In [None]:
def copyfiles(origin_paths, img_paths):
    """Copy each source file to its paired destination, skipping existing targets.

    Args:
        origin_paths: Iterable of source file paths.
        img_paths: Iterable of destination file paths, paired positionally with
            ``origin_paths`` (``zip`` stops at the shorter of the two).

    Returns:
        None. Progress is reported on stdout: one line per failed copy and a
        final ``Done``.

    Only ``OSError`` (missing source, permission problem, same-file copy, ...)
    is treated as a recoverable per-file failure. Anything else, such as a
    wrongly typed path, propagates instead of being silently swallowed.
    """
    for origin_path, img_path in zip(origin_paths, img_paths):
        # An existing destination is never overwritten.
        if os.path.exists(img_path):
            continue
        try:
            copyfile(origin_path, img_path)
        except OSError as exc:
            # Best effort: report the failure together with its cause and move on.
            print('Error copying file: ', origin_path, exc)
    print('Done')

## Forgery Type

In [None]:
# Seed shared by every random sampling step in this notebook.
SEED = 2000
# Root directory that receives the copied images and the metadata CSV files.
DATA_ROOT = '/Datasets/DeepfakeAttribution/release'

# Forgery type name -> integer id.
FORGERY_TYPES = dict(
    RealFace=0,
    IdentitySwap=1,
    ExpressionTransfer=2,
    AttributeManipulation=3,
    EntireFaceSyncthesis=4,
)

# Inverse mapping: integer id -> forgery type name.
FORGERYID_TO_TYPE = dict(zip(FORGERY_TYPES.values(), FORGERY_TYPES.keys()))

# Attack method name -> integer class label. The label is the position of the
# name in the tuple below, so "Real" is 0 and "StyleGAN2" is 24.
ATTACK_METHOD = {
    method_name: class_id
    for class_id, method_name in enumerate((
        "Real",
        "FaceSwap",
        "Deepfakes",
        "FaceShifter",
        "DeepFaceLab",
        "FSGAN",
        "FaceNet",
        "Face2Face",
        "NeuralTextures",
        "Talking-Head-Video",
        "ATVG-Net",
        "FOMM",
        "Wav2Lip",
        "ATFHP",
        "MakeItTalk",
        "MaskGAN",
        "StarGAN2",
        "SC-FEGAN",
        "DiscoFaceGAN",
        "FaceAPP",
        "StarGAN",
        "PGGAN",
        "CycleGAN",
        "StyleGAN",
        "StyleGAN2",
    ))
}

# Link recorded for each attack method (a code repository or product page),
# keyed by the same method names as ATTACK_METHOD. Methods without a link hold
# the literal string "None", not the None object.
ATTACK_CODE = {
    "Real": "None",
    "FaceSwap": "https://github.com/MarekKowalski/FaceSwap/",
    "Deepfakes": "https://github.com/deepfakes/faceswap",
    "FaceShifter": "https://github.com/mindslab-ai/faceshifter",
    "DeepFaceLab": "https://github.com/iperov/DeepFaceLab",
    "FSGAN": "https://github.com/YuvalNirkin/fsgan",
    "FaceNet": "https://github.com/davidsandberg/facenet",
    "Face2Face": "None",
    "NeuralTextures": "https://github.com/SSRSGJYD/NeuralTexture",
    "Talking-Head-Video": "https://github.com/sibozhang/Text2Video",
    "ATVG-Net": "https://github.com/lelechen63/ATVGnet",
    "FOMM": "https://github.com/AliaksandrSiarohin/first-order-model",
    "Wav2Lip": "https://github.com/Rudrabha/Wav2Lip",
    "ATFHP": "https://github.com/yiranran/Audio-driven-TalkingFace-HeadPose",
    "MakeItTalk": "https://github.com/adobe-research/MakeItTalk",
    "MaskGAN": "https://github.com/switchablenorms/CelebAMask-HQ",
    "StarGAN2": "https://github.com/clovaai/stargan-v2",
    "SC-FEGAN": "https://github.com/run-youngjoo/SC-FEGAN",
    "DiscoFaceGAN": "https://github.com/microsoft/DiscoFaceGAN",
    "FaceAPP": "https://faceapp.com/app",
    "StarGAN": "https://github.com/yunjey/stargan",
    "PGGAN": "https://github.com/tkarras/progressive_growing_of_gans",
    "CycleGAN": "https://github.com/junyanz/CycleGAN/",
    "StyleGAN": "https://github.com/NVlabs/stylegan",
    "StyleGAN2": "https://github.com/NVlabs/stylegan2",
}

# Attack methods grouped by forgery type. The keys are the FORGERY_TYPES names
# and the list entries are ATTACK_METHOD names.
FORGERY_ATTACK = {
    "RealFace": [
        "Real",
    ],
    "IdentitySwap": [
        "FaceSwap",
        "Deepfakes",
        "FaceShifter",
        "DeepFaceLab",
        "FSGAN",
        "FaceNet",
    ],
    "ExpressionTransfer": [
        "Face2Face",
        "NeuralTextures",
        "Talking-Head-Video",
        "ATVG-Net",
        "FOMM",
        "Wav2Lip",
        "ATFHP",
        "MakeItTalk",
    ],
    "AttributeManipulation": [
        "MaskGAN",
        "StarGAN2",
        "SC-FEGAN",
        "DiscoFaceGAN",
        "FaceAPP",
        "StarGAN",
    ],
    "EntireFaceSyncthesis": [
        "PGGAN",
        "CycleGAN",
        "StyleGAN",
        "StyleGAN2",
    ],
}

# Reverse lookup: attack method -> forgery type. Every method starts out mapped
# to "" and is then overwritten by the forgery type that lists it.
ATTACK_TO_FORGERY = dict.fromkeys(ATTACK_METHOD, "")
for forgery, attacks in FORGERY_ATTACK.items():
    for att in attacks:
        ATTACK_TO_FORGERY[att] = forgery

# ForgeryNet 16-way class label -> method name. Label 0 has no entry here.
forgerynet_label2method = {
    1: 'FaceShifter',
    2: 'FSGAN',
    3: 'DeepFaceLab',
    4: 'BlendFace',
    5: 'MMReplacement',
    6: 'DeepFakes-StarGAN-Stack',
    7: 'Talking-Head-Video',
    8: 'ATVG-Net',
    9: 'StarGAN-BlendFace-Stack',
    10: 'FOMM',
    11: 'StyleGAN2',
    12: 'MaskGAN',
    13: 'StarGAN2',
    14: 'SC-FEGAN',
    15: 'DiscoFaceGAN',
}

# Show the method -> label mapping as the cell output.
ATTACK_METHOD

## FaceForensics++

In [None]:
# Root of the extracted FaceForensics++ frames.
data_dir = '/Datasets/deepfakes_detection_datasets/faceforensics/ffpp_video'

# Compression levels to process ('c40' is currently disabled).
compressions = ['c23']  # , 'c40'

# Manipulation methods whose frames are collected.
attack_types = ['Deepfakes', 'Face2Face', 'FaceSwap', 'NeuralTextures']

In [None]:
# Number of evenly spaced frames taken from every real video.
num_frames = 100

workers = []
with ThreadPoolExecutor(max_workers=10) as executor:
    for compression in compressions:
        origin_paths = []
        img_paths = []
        attack_type = 'Real'
        # Read frames from the compression level being processed. The path used
        # to hard-code 'c23', which would have copied c23 frames under a
        # 'c40-...' name as soon as 'c40' was enabled in `compressions`.
        video_dirs = glob(f'{data_dir}/original_sequences/youtube/{compression}/images_v3/*')
        prefix = f'{DATA_ROOT}/{ATTACK_TO_FORGERY[attack_type]}/{attack_type}/faceforensics'
        os.makedirs(prefix, exist_ok=True)
        for video_dir in video_dirs:
            # Frame files are sorted by the integer before the first '.'.
            sorted_images_names = np.array(sorted(os.listdir(video_dir), key=lambda x: int(x.split('.')[0])))
            # Evenly spaced frame indices from the first to the last frame.
            ind = np.linspace(0, len(sorted_images_names) - 1, num_frames, endpoint=True, dtype=int)
            sub_img_paths = [os.path.join(video_dir, x) for x in sorted_images_names[ind]]
            for src in sub_img_paths:
                origin_paths.append(src)
                # Destination name: <compression>-<video dir name>-<frame file name>.
                dst = f'{prefix}/{compression}-{"-".join(src.split("/")[-2:])}'
                img_paths.append(dst)
        # Copy this compression level's frames on a background thread.
        workers.append(executor.submit(copyfiles, origin_paths, img_paths))
        print(compression, attack_type, len(img_paths))

# Leaving the `with` block has already waited for the copies; asking for each
# result re-raises any exception a copy job hit, as the manipulated-frames
# cell does.
for worker in workers:
    worker.result()

In [None]:
# Column name -> list of values, one entry per copied image.
data = {
    'attack': [],
    'img_name': [],
    'label': [],
    'method': [],
    'code': [],
    'image_source': [],
    'img_path': [],
    'origin_img_path': [],
    'compression': [],
    'frame_idx': [],
}

for i in tqdm(range(len(img_paths))):
    img_path = img_paths[i]
    origin_img_path = origin_paths[i]
    # Destination layout: .../<forgery type>/<method>/<dataset>/<file name>.
    attack, method, _, img_full_name = img_path.split('/')[-4:]
    # File name layout: <compression>-<video dir name>-<frame file name>.
    compression, img_name, frame_idx = img_full_name.split('-')
    frame_idx = frame_idx.split('.')[0]
    label = ATTACK_METHOD[method]
    code = ATTACK_CODE[method]
    image_source = 'FaceForensics++'
    # Explicit record instead of eval(key): the columns no longer depend on the
    # loop variables happening to carry the same names as the dictionary keys.
    row = {
        'attack': attack,
        'img_name': img_name,
        'label': label,
        'method': method,
        'code': code,
        'image_source': image_source,
        'img_path': img_path,
        'origin_img_path': origin_img_path,
        'compression': compression,
        'frame_idx': frame_idx,
    }
    for key in data:
        data[key].append(row[key])
ffpp_df = pd.DataFrame(data)

In [None]:
# Keep a single row per destination image path.
# NOTE(review): duplicates presumably appear when a video has fewer frames than
# num_frames, so the evenly spaced indices repeat — confirm.
ffpp_df = ffpp_df.drop_duplicates(subset=['img_path'])
# Per-compression subsets of the metadata.
ffpp_c23_df = ffpp_df[ffpp_df.compression == 'c23']
ffpp_c40_df = ffpp_df[ffpp_df.compression == 'c40']
# Preview the first rows as the cell output.
ffpp_df.head()

In [None]:
# DataFrame.to_csv does not create missing parent directories, and no other
# cell creates meta_data/, so make sure it exists before the first write.
os.makedirs(f'{DATA_ROOT}/meta_data', exist_ok=True)

# Metadata of the real FaceForensics++ frames: all rows, then one file per
# compression level.
ffpp_df.to_csv(f'{DATA_ROOT}/meta_data/ffpp_large_real_meta.csv', index=False)
ffpp_c23_df.to_csv(f'{DATA_ROOT}/meta_data/ffpp_large_c23_real_meta.csv', index=False)
ffpp_c40_df.to_csv(f'{DATA_ROOT}/meta_data/ffpp_large_c40_real_meta.csv', index=False)

In [None]:
# Number of evenly spaced frames taken from every manipulated video.
num_frames = 10

# Source and destination paths accumulated over every compression/method pair.
all_origin_paths = []
all_img_paths = []

workers = []
with ThreadPoolExecutor(max_workers=10) as executor:
    for compression in compressions:
        for attack_type in attack_types:
            # Paths for this compression/method pair only.
            origin_paths = []
            img_paths = []
            video_dirs = glob(f'{data_dir}/manipulated_sequences/{attack_type}/{compression}/images_v3/*')
            prefix = f'{DATA_ROOT}/{ATTACK_TO_FORGERY[attack_type]}/{attack_type}/faceforensics'
            print(prefix)
            os.makedirs(prefix, exist_ok=True)
            for video_dir in video_dirs:
                # Frame files are sorted by the integer before the first '.'.
                sorted_images_names = np.array(sorted(os.listdir(video_dir), key=lambda x: int(x.split('.')[0])))
                # Evenly spaced frame indices from the first to the last frame.
                ind = np.linspace(0, len(sorted_images_names) - 1, num_frames, endpoint=True, dtype=int)
                sub_img_paths = [os.path.join(video_dir, x) for x in sorted_images_names[ind]]
                for src in sub_img_paths:
                    origin_paths.append(src)
                    # Destination name: <compression>-<video dir name>-<frame file name>.
                    dst = f'{prefix}/{compression}-{"-".join(src.split("/")[-2:])}'
                    img_paths.append(dst)
            # Copy this pair's frames on a background thread.
            workers.append(executor.submit(copyfiles, origin_paths, img_paths))
            all_origin_paths.extend(origin_paths)
            all_img_paths.extend(img_paths)
            # The printed count is cumulative over all pairs processed so far.
            print(compression, attack_type, len(all_origin_paths))

# Re-raise any exception hit by a copy job.
for worker in workers:
    worker.result()

In [None]:
# Column name -> list of values, one entry per copied image.
data = {
    'attack': [],
    'img_name': [],
    'label': [],
    'method': [],
    'code': [],
    'image_source': [],
    'img_path': [],
    'origin_img_path': [],
    'compression': [],
    'frame_idx': [],
}

for i in tqdm(range(len(all_origin_paths))):
    img_path = all_img_paths[i]
    origin_img_path = all_origin_paths[i]
    # Destination layout: .../<forgery type>/<method>/<dataset>/<file name>.
    attack, method, _, img_full_name = img_path.split('/')[-4:]
    # File name layout: <compression>-<video dir name>-<frame file name>.
    compression, img_name, frame_idx = img_full_name.split('-')
    frame_idx = frame_idx.split('.')[0]
    label = ATTACK_METHOD[method]
    code = ATTACK_CODE[method]
    image_source = 'FaceForensics++'
    # Explicit record instead of eval(key): the columns no longer depend on the
    # loop variables happening to carry the same names as the dictionary keys.
    row = {
        'attack': attack,
        'img_name': img_name,
        'label': label,
        'method': method,
        'code': code,
        'image_source': image_source,
        'img_path': img_path,
        'origin_img_path': origin_img_path,
        'compression': compression,
        'frame_idx': frame_idx,
    }
    for key in data:
        data[key].append(row[key])

ffpp_df = pd.DataFrame(data)
# Per-compression subsets of the metadata.
ffpp_c23_df = ffpp_df[ffpp_df.compression == 'c23']
ffpp_c40_df = ffpp_df[ffpp_df.compression == 'c40']
# Preview the first rows as the cell output.
ffpp_df.head()

In [None]:
# Metadata of the manipulated FaceForensics++ frames: all rows, then one file
# per compression level.
ffpp_df.to_csv(f'{DATA_ROOT}/meta_data/ffpp_large_meta.csv', index=False)
ffpp_c23_df.to_csv(f'{DATA_ROOT}/meta_data/ffpp_large_c23_meta.csv', index=False)
ffpp_c40_df.to_csv(f'{DATA_ROOT}/meta_data/ffpp_large_c40_meta.csv', index=False)

## Celeb-DF

In [None]:
# Root of the extracted Celeb-DF frames (rebinds data_dir for this section).
data_dir = '/Datasets/deepfakes_detection_datasets/celebdfv2'
# Real-video subsets to collect; 'YouTube-real' is currently disabled.
real_types = [
    'Celeb-real',
    # 'YouTube-real',
]

In [None]:
# Number of evenly spaced frames taken from every real Celeb-DF video.
num_frames = 50
workers = []

with ThreadPoolExecutor(max_workers=10) as executor:
    for real_type in real_types:
        # These lists are reset for every real_type, so after the loop they
        # hold the paths of the last subset only.
        origin_paths = []
        img_paths = []
        attack_type = 'Real'
        video_dirs = glob(f'{data_dir}/images_v1/{real_type}/*')
        prefix = f'{DATA_ROOT}/{ATTACK_TO_FORGERY[attack_type]}/{attack_type}/CelebDF'
        os.makedirs(prefix, exist_ok=True)
        for video_dir in video_dirs:
            # Frame files are sorted by the integer before the first '.'.
            sorted_images_names = np.array(sorted(os.listdir(video_dir), key=lambda x: int(x.split('.')[0])))
            # Evenly spaced frame indices from the first to the last frame.
            ind = np.linspace(0, len(sorted_images_names) - 1, num_frames, endpoint=True, dtype=int)
            sub_img_paths = [os.path.join(video_dir, x) for x in sorted_images_names[ind]]
            for src in sub_img_paths:
                origin_paths.append(src)
                # Destination name: <real_type>-<video dir name>-<frame file name>.
                dst = f'{prefix}/{real_type}-{"-".join(src.split("/")[-2:])}'
                img_paths.append(dst)
        # Copy this subset's frames on a background thread; leaving the `with`
        # block waits for the job to finish.
        workers.append(executor.submit(copyfiles, origin_paths, img_paths))
        print(real_type, attack_type, len(img_paths))

In [None]:
# Column name -> list of values, one entry per copied image.
data = {
    'attack': [],
    'img_name': [],
    'label': [],
    'method': [],
    'code': [],
    'image_source': [],
    'img_path': [],
    'origin_img_path': [],
    'source': [],
    'frame_idx': [],
}

for i in tqdm(range(len(img_paths))):
    img_path = img_paths[i]
    origin_img_path = origin_paths[i]
    # Destination layout: .../<forgery type>/<method>/<dataset>/<file name>.
    attack, method, _, img_full_name = img_path.split('/')[-4:]
    # File name layout: <real_type>-<video dir name>-<frame file name>. The
    # real_type itself contains a '-' ('Celeb-real'), so the split yields four
    # parts and `source` keeps only the text before that dash ('Celeb').
    source, _, img_name, frame_idx = img_full_name.split('-')
    frame_idx = frame_idx.split('.')[0]
    label = ATTACK_METHOD[method]
    code = ATTACK_CODE[method]
    image_source = 'CelebDF'
    # Explicit record instead of eval(key): the columns no longer depend on the
    # loop variables happening to carry the same names as the dictionary keys.
    row = {
        'attack': attack,
        'img_name': img_name,
        'label': label,
        'method': method,
        'code': code,
        'image_source': image_source,
        'img_path': img_path,
        'origin_img_path': origin_img_path,
        'source': source,
        'frame_idx': frame_idx,
    }
    for key in data:
        data[key].append(row[key])
celeb_df = pd.DataFrame(data)
# Preview the first rows as the cell output.
celeb_df.head()

In [None]:
# Keep a single row per destination image path.
celeb_df = celeb_df.drop_duplicates(subset=['img_path'])

In [None]:
# Write the Celeb-DF real-frame metadata without the index column.
celeb_df.to_csv(f'{DATA_ROOT}/meta_data/celebdf_large_real_meta.csv', index=False)

## ForgeryNet

In [None]:
# Root of the ForgeryNet training split (rebinds data_dir for this section).
data_dir = '/Datasets/deepfakes_detection_datasets/ForgeryNet/unzip_files/Training'
# Annotation list; the parsing cell reads one image per line from it.
image_list_path = os.path.join(data_dir, 'image_list.txt')

In [None]:
import collections

# Read the annotation list through a context manager so the file handle is
# closed as soon as the lines are in memory.
with open(image_list_path) as f:
    lines = f.read().splitlines()

# 16-way class label -> list of absolute image paths.
class2paths = {}
for line in tqdm(lines):
    # The first space-separated field is the image path relative to images/.
    img_path = os.path.join(data_dir, 'images', line.split(' ')[0])
    # The last three fields are the binary, 3-way and 16-way class labels.
    binary_cls_label, triple_cls_label, cls16_label = map(int, line.split()[-3:])
    class2paths.setdefault(cls16_label, []).append(img_path)
# Order the classes by ascending label.
class2paths = collections.OrderedDict(sorted(class2paths.items(), key=lambda t: t[0]))

In [None]:
# Print how many image paths were collected for each 16-way class label.
for label in class2paths.keys():
    print(label, len(class2paths[label]))

In [None]:
# Column name -> list of values, one entry per selected image.
data = {
    'attack': [],
    'img_name': [],
    'label': [],
    'method': [],
    'code': [],
    'image_source': [],
    'img_path': [],
    'origin_img_path': [],
    'frame_idx': [],
}

# Source and destination paths accumulated over every processed class.
all_origin_paths = []
all_img_paths = []

workers = []
with ThreadPoolExecutor(max_workers=10) as executor:
    for ori_label in class2paths.keys():
        # Classes excluded from the release.
        # NOTE(review): 0 has no entry in forgerynet_label2method and 4, 5, 6, 9
        # map to names missing from ATTACK_METHOD; why 15 (DiscoFaceGAN) is
        # skipped cannot be told from this file — confirm.
        if ori_label in [0, 4, 5, 6, 9, 15]:
            continue

        origin_paths = []
        img_paths = []

        method = forgerynet_label2method[ori_label]
        attack = ATTACK_TO_FORGERY[method]
        label = ATTACK_METHOD[method]
        code = ATTACK_CODE[method]
        image_source = 'ForgeryNet'

        prefix = f'{DATA_ROOT}/{attack}/{method}/{image_source}'
        # Every destination of this class sits directly inside `prefix`, so the
        # directory is created once here instead of once per image.
        os.makedirs(prefix, exist_ok=True)
        chosen_img_paths = class2paths[ori_label]
        # Re-seeding per class makes each class's sample independent of the
        # other classes. np.random.choice raises ValueError when a class holds
        # fewer than 10000 images.
        np.random.seed(SEED)
        chosen_img_paths = np.random.choice(chosen_img_paths, 10000, replace=False)

        for img_path in tqdm(chosen_img_paths):
            img_name = img_path.split('/')[-1]
            origin_img_path = img_path
            origin_paths.append(origin_img_path)
            # The frame index is the file stem with the text 'frame' removed.
            frame_idx = int(img_name.split('.')[0].replace('frame', ''))
            # Destination name: the last four source path components joined by
            # '-'. From here on img_path is the destination path.
            img_path = f'{prefix}/{"-".join(img_path.split("/")[-4:])}'
            img_paths.append(img_path)
            # Explicit record instead of eval(key): the columns no longer depend
            # on the loop variables sharing the dictionary keys' names.
            row = {
                'attack': attack,
                'img_name': img_name,
                'label': label,
                'method': method,
                'code': code,
                'image_source': image_source,
                'img_path': img_path,
                'origin_img_path': origin_img_path,
                'frame_idx': frame_idx,
            }
            for key in data:
                data[key].append(row[key])
        # Copy this class's images on a background thread.
        workers.append(executor.submit(copyfiles, origin_paths, img_paths))
        all_origin_paths.extend(origin_paths)
        all_img_paths.extend(img_paths)

In [None]:
# Print, for each submitted copy job, whether it has finished.
for worker in workers:
    print(worker.done())

In [None]:
# Build the ForgeryNet metadata table from the collected columns and preview it.
forgerynet_df = pd.DataFrame(data)
forgerynet_df.head()

In [None]:
# Write the ForgeryNet metadata without the index column.
forgerynet_df.to_csv(f'{DATA_ROOT}/meta_data/forgerynet_large_meta.csv', index=False)

## DiverseFakeFaceDataset

In [None]:
# Root of the DFFD images (rebinds data_dir for this section).
data_dir = '/Datasets/deepfakes_detection_datasets/DiverseFakeFaceDataset'
# DFFD folder name -> attack method name as used in ATTACK_METHOD.
# Note: this rebinds attack_types, which the FaceForensics++ cells use as a list.
attack_types = {
    'faceapp': 'FaceAPP',
    'pggan_v2': 'PGGAN',
    'stylegan_ffhq': 'StyleGAN',
    'stargan': 'StarGAN',
}

In [None]:
# Load the list of DFFD image paths, one path per line.
with open(f'{data_dir}/all_img_paths.txt') as f:
    origin_img_paths = f.read().splitlines()

In [None]:
# Images whose parent directory name contains 'train', grouped by method.
img_group = {
    'FaceAPP': [],
    'PGGAN': [],
    'StyleGAN': [],
    'StarGAN': [],
}

# All other (non-'train') images, grouped by method.
rest_img_group = {
    'FaceAPP': [],
    'PGGAN': [],
    'StyleGAN': [],
    'StarGAN': [],
}

for img_path in tqdm(origin_img_paths):
    # Skip anything stored in a directory whose name contains 'mask'.
    if 'mask' in img_path.split('/')[-2]:
        continue
    # The grandparent directory names the generator; 'ffhq' entries are skipped.
    attack_name = img_path.split('/')[-3]
    if attack_name == 'ffhq':
        continue
    # Raises KeyError for any folder name missing from attack_types.
    attack = attack_types[attack_name]
    if 'train' not in img_path.split('/')[-2]:
        rest_img_group[attack].append(img_path)
        continue
    img_group[attack].append(img_path)

In [None]:
# For every method, print the size of the 'train' group and then of the rest group.
for attack in img_group.keys():
    print(attack, len(img_group[attack]))
    print(attack, len(rest_img_group[attack]))

In [None]:
random.seed(SEED)
# Top every group up to 10000 images with a random draw from its rest group.
# The draws share one RNG stream, so the processing order below is part of the
# result and must stay FaceAPP, PGGAN, StarGAN, StyleGAN.
for group_name in ('FaceAPP', 'PGGAN', 'StarGAN', 'StyleGAN'):
    shortfall = 10000 - len(img_group[group_name])
    img_group[group_name] = random.sample(rest_img_group[group_name], shortfall) + img_group[group_name]

In [None]:
# Print the size of every group after the top-up.
for attack in img_group.keys():
    print(attack, len(img_group[attack]))

In [None]:
# Source and destination paths accumulated over every method.
all_origin_paths = []
all_img_paths = []

workers = []
with ThreadPoolExecutor(max_workers=8) as executor:
    for attack in img_group:
        origin_paths = []
        img_paths = []
        # The destination directory depends only on the method, so it is built
        # and created once per method instead of once per image.
        prefix = f'{DATA_ROOT}/{ATTACK_TO_FORGERY[attack]}/{attack}/DFFD'
        os.makedirs(prefix, exist_ok=True)
        for img_path in tqdm(img_group[attack]):
            # Destination name: <parent dir name>-<file name>.
            dst = f'{prefix}/{"-".join(img_path.split("/")[-2:])}'
            origin_paths.append(img_path)
            img_paths.append(dst)
        # Copy this method's images on a background thread; leaving the `with`
        # block waits for all jobs to finish.
        workers.append(executor.submit(copyfiles, origin_paths, img_paths))
        all_origin_paths.extend(origin_paths)
        all_img_paths.extend(img_paths)

In [None]:
# Column name -> list of values, one entry per copied image.
data = {
    'attack': [],
    'img_name': [],
    'label': [],
    'method': [],
    'code': [],
    'image_source': [],
    'img_path': [],
    'origin_img_path': [],
}

for i in tqdm(range(len(all_origin_paths))):
    img_path = all_img_paths[i]
    origin_img_path = all_origin_paths[i]
    # Destination layout: .../<forgery type>/<method>/<dataset>/<file name>.
    attack, method, _, img_name = img_path.split('/')[-4:]
    label = ATTACK_METHOD[method]
    code = ATTACK_CODE[method]
    image_source = 'DFFD'
    # Explicit record instead of eval(key): the columns no longer depend on the
    # loop variables happening to carry the same names as the dictionary keys.
    row = {
        'attack': attack,
        'img_name': img_name,
        'label': label,
        'method': method,
        'code': code,
        'image_source': image_source,
        'img_path': img_path,
        'origin_img_path': origin_img_path,
    }
    for key in data:
        data[key].append(row[key])
dffd_df = pd.DataFrame(data)
# Preview the first rows as the cell output.
dffd_df.head()

In [None]:
# Write the DFFD metadata without the index column.
dffd_df.to_csv(f'{DATA_ROOT}/meta_data/dffd_large_meta.csv', index=False)

## ForgeryNIR+

In [None]:
# Root of the ForgeryNIR data (rebinds data_dir for this section).
data_dir = '/Datasets/deepfakes_detection_datasets/ForgeryNIR'
# ForgeryNIR folder name -> attack method name as used in ATTACK_METHOD
# (rebinds attack_types again).
attack_types = {
    'cyclegan': 'CycleGAN',
    'progan': 'PGGAN',
    'stylegan': 'StyleGAN',
    'stylegan2': 'StyleGAN2',
}

In [None]:
# Load the ForgeryNIR image list, one entry per line. The copy cell later
# prefixes each entry with '{data_dir}/unzip_files/'.
with open(f'{data_dir}/unzip_files/lists/ForgeryNIR-mix_multi.txt') as f:
    origin_img_paths = f.read().splitlines()

In [None]:
# Image list entries grouped by attack method.
img_group = {
    'CycleGAN': [],
    'PGGAN': [],
    'StyleGAN': [],
    'StyleGAN2': [],
}

for img_path in tqdm(origin_img_paths):
    # The parent directory names the generator; a folder name missing from
    # attack_types raises KeyError.
    attack_name = img_path.split('/')[-2]
    attack = attack_types[attack_name]
    img_group[attack].append(img_path)

In [None]:
random.seed(SEED)
# Keep a random 10000-image subset of every group. The draws share one RNG
# stream, so the processing order below is part of the result.
for group_name in ('CycleGAN', 'PGGAN', 'StyleGAN', 'StyleGAN2'):
    img_group[group_name] = random.sample(img_group[group_name], 10000)

In [None]:
# Source and destination paths accumulated over every method.
all_origin_paths = []
all_img_paths = []

workers = []
with ThreadPoolExecutor(max_workers=8) as executor:
    for attack in img_group:
        origin_paths = []
        img_paths = []
        # The destination directory depends only on the method, so it is built
        # and created once per method instead of once per image.
        prefix = f'{DATA_ROOT}/{ATTACK_TO_FORGERY[attack]}/{attack}/ForgeryNIR'
        os.makedirs(prefix, exist_ok=True)
        for img_path in tqdm(img_group[attack]):
            # List entries are relative to the unzip_files directory.
            src = f'{data_dir}/unzip_files/{img_path}'
            # Only the file name is kept for the destination.
            # NOTE(review): two entries of one method that share a file name
            # would map to the same destination — confirm names are unique.
            dst = f'{prefix}/{img_path.split("/")[-1]}'
            origin_paths.append(src)
            img_paths.append(dst)
        # Copy this method's images on a background thread; leaving the `with`
        # block waits for all jobs to finish.
        workers.append(executor.submit(copyfiles, origin_paths, img_paths))
        all_origin_paths.extend(origin_paths)
        all_img_paths.extend(img_paths)

In [None]:
# Column name -> list of values, one entry per copied image.
data = {
    'attack': [],
    'img_name': [],
    'label': [],
    'method': [],
    'code': [],
    'image_source': [],
    'img_path': [],
    'origin_img_path': [],
}

for i in tqdm(range(len(all_origin_paths))):
    img_path = all_img_paths[i]
    origin_img_path = all_origin_paths[i]
    # Destination layout: .../<forgery type>/<method>/<dataset>/<file name>.
    attack, method, _, img_name = img_path.split('/')[-4:]
    label = ATTACK_METHOD[method]
    code = ATTACK_CODE[method]
    image_source = 'ForgeryNIR+'
    # Explicit record instead of eval(key): the columns no longer depend on the
    # loop variables happening to carry the same names as the dictionary keys.
    row = {
        'attack': attack,
        'img_name': img_name,
        'label': label,
        'method': method,
        'code': code,
        'image_source': image_source,
        'img_path': img_path,
        'origin_img_path': origin_img_path,
    }
    for key in data:
        data[key].append(row[key])
forgeryNIR_df = pd.DataFrame(data)
# Preview the first rows as the cell output.
forgeryNIR_df.head()

In [None]:
# Write the ForgeryNIR metadata without the index column.
forgeryNIR_df.to_csv(f'{DATA_ROOT}/meta_data/forgerynir_large_meta.csv', index=False)

## Merge meta info

In [None]:
# Per-dataset metadata files to merge; commented entries are excluded.
# NOTE(review): the cells above only write the '*_large_*' files, so these
# non-'large' files presumably come from another run or notebook — confirm
# they exist before running this cell.
meta_files = [
    'ffpp_c23_meta.csv',
    # 'ffpp_c40_meta.csv',
    # 'ffpp_meta.csv',
    'ffpp_c23_real_meta.csv',
    # 'ffpp_c40_real_meta.csv',
    # 'ffpp_meta_real.csv',
    'celebdf_real_meta.csv',
    'forgerynet_meta.csv',
    'dffd_meta.csv',
    'forgerynir_meta.csv',
]

# Load every listed file from the meta_data directory.
all_dfs = []
for meta_file in meta_files:
    meta_path = os.path.join(DATA_ROOT, 'meta_data', meta_file)
    df = pd.read_csv(meta_path)
    all_dfs.append(df)

In [None]:
# Stack the per-dataset tables into one frame with a fresh 0..n-1 index.
merge_df = pd.concat(all_dfs, ignore_index=True)
# Integer id of the forgery type; raises KeyError for a type missing from FORGERY_TYPES.
merge_df['forgery_label'] = merge_df['attack'].apply(lambda x: FORGERY_TYPES[x])
merge_df['forgery_type'] = merge_df['attack']
# Every row starts with tag 1.
merge_df['tag'] = 1
# face_type is 1 for rows whose method is 'Real' and 0 for every other method.
merge_df['face_type'] = merge_df['method'].apply(lambda x: 1 if x == 'Real' else 0)

In [None]:
# Show the number of rows per method label.
merge_df['label'].value_counts()

In [None]:
# Keep only these columns, in this order; columns not listed here (such as
# 'attack' and 'origin_img_path') are dropped.
order = ["img_name", "label", "method", "forgery_type", "forgery_label", "face_type", "tag",
        "code", "image_source", "img_path", "compression", "frame_idx"]
merge_df = merge_df[order]

In [None]:
# Display the merged table as the cell output.
merge_df

In [None]:
# Write the merged metadata without the index column; the alternative output
# name below is disabled.
# merge_df.to_csv(f'{DATA_ROOT}/meta_data/fake_val_merge_meta.csv', index=False)
merge_df.to_csv(f'{DATA_ROOT}/meta_data/real_fake_val_merge_meta.csv', index=False)

### Large

In [None]:
# '*_large_*' metadata files written by the dataset sections above; commented
# entries are excluded from the merge.
meta_files = [
    'ffpp_large_c23_meta.csv',
    # 'ffpp_c40_large_meta.csv',
    # 'ffpp_large_meta.csv',
    'ffpp_large_c23_real_meta.csv',
    # 'ffpp_c40_large_real_meta.csv',
    # 'ffpp_meta_large_real.csv',
    'celebdf_large_real_meta.csv',
    'forgerynet_large_meta.csv',
    # 'fakeavceleb_meta.csv',
    'dffd_large_meta.csv',
    'forgerynir_large_meta.csv',
    # 'aisc_meta.csv',
]

# Load every listed file from the meta_data directory.
all_dfs = []
for meta_file in meta_files:
    meta_path = os.path.join(DATA_ROOT, 'meta_data', meta_file)
    df = pd.read_csv(meta_path)
    all_dfs.append(df)

# Stack the per-dataset tables into one frame with a fresh 0..n-1 index.
merge_df = pd.concat(all_dfs, ignore_index=True)
# Integer id of the forgery type; raises KeyError for a type missing from FORGERY_TYPES.
merge_df['forgery_label'] = merge_df['attack'].apply(lambda x: FORGERY_TYPES[x])
merge_df['forgery_type'] = merge_df['attack']
# Every row starts with tag 1.
merge_df['tag'] = 1
# face_type is 1 for rows whose method is 'Real' and 0 for every other method.
merge_df['face_type'] = merge_df['method'].apply(lambda x: 1 if x == 'Real' else 0)

In [None]:
# Show the number of rows per method label.
merge_df['label'].value_counts()

In [None]:
# Keep only these columns, in this order; columns not listed here (such as
# 'attack' and 'origin_img_path') are dropped.
order = ["img_name", "label", "method", "forgery_type", "forgery_label", "face_type", "tag",
        "code", "image_source", "img_path", "compression", "frame_idx"]
merge_df = merge_df[order]
# Display the merged table as the cell output.
merge_df

In [None]:
# Write the merged 'large' metadata without the index column; the alternative
# output name below is disabled.
# merge_df.to_csv(f'{DATA_ROOT}/meta_data/fake_large_merge_meta.csv', index=False)
merge_df.to_csv(f'{DATA_ROOT}/meta_data/real_fake_large_merge_meta.csv', index=False)

## Semi-supervised setting

In [None]:
# Reload the merged 'large' metadata written by the previous section.
meta_path = f'{DATA_ROOT}/meta_data/real_fake_large_merge_meta.csv'
semi_df = pd.read_csv(meta_path)

In [None]:
def semi_sup(df):
    """Tell whether a metadata row is kept for the semi-supervised setting.

    Args:
        df: A row-like mapping with an 'image_source' entry and, for the
            'ForgeryNet' and 'ForgeryNIR+' sources, a 'method' entry.

    Returns:
        True when the row is kept, False when it is dropped.

    Raises:
        AssertionError: If the image source is not one of the five known ones.
    """
    source = df['image_source']

    # Every row of these sources is kept, whatever its method.
    if source in ('FaceForensics++', 'CelebDF', 'DFFD'):
        return True

    # The remaining sources keep only an explicit list of methods.
    if source == 'ForgeryNet':
        kept_methods = [
            'FaceShifter',
            'DeepFaceLab',
            'FSGAN',
            'Talking-Head-Video',
            'ATVG-Net',
            'FOMM',
            'MaskGAN',
            'StarGAN2',
            'SC-FEGAN',
            'StyleGAN2',
        ]
        return df['method'] in kept_methods

    if source == 'ForgeryNIR+':
        kept_methods = ['CycleGAN', 'StyleGAN2']
        return df['method'] in kept_methods

    assert False

In [None]:
# Keep only the rows selected by semi_sup. The explicit copy makes semi_df an
# independent frame, so the column assignments in later cells do not trigger
# pandas' SettingWithCopyWarning.
semi_df = semi_df[semi_df.apply(semi_sup, axis=1)].copy()

In [None]:
def semi_sup_label(df):
    """Tell whether a (source, method) pair belongs to the labeled classes.

    Args:
        df: A row-like mapping with 'image_source' and 'method' entries.

    Returns:
        1 when the method is listed as labeled for its image source, else 0.

    Raises:
        AssertionError: If the image source is not one of the five known ones.
    """
    # Image source -> methods treated as labeled. CelebDF lists none, so every
    # CelebDF row yields 0.
    labeled_methods = {
        'FaceForensics++': ("Deepfakes", "Face2Face", 'Real'),
        'CelebDF': (),
        'ForgeryNet': ('DeepFaceLab', 'FOMM', 'MaskGAN'),
        'DFFD': ("FaceAPP", "StyleGAN"),
        'ForgeryNIR+': ("CycleGAN",),
    }

    methods = labeled_methods.get(df['image_source'])
    if methods is None:
        assert False
    else:
        return 1 if df['method'] in methods else 0

In [None]:
# Default unlabeled
semi_df['tag'] = 2

random.seed(SEED)

# Visit every (image_source, method) pair present in the frame and re-tag a
# random subset of its rows; rows that are not sampled keep the default tag 2.
for i, k in semi_df.method.groupby(semi_df.image_source).value_counts().index:
    # Index labels of the rows belonging to this (source, method) pair.
    idx = semi_df[lambda x: x['method'] == k][lambda x: x['image_source'] == i].index
    num = len(idx)
    if semi_sup_label({ 'image_source': i, 'method': k }):
        # Labeled
        # int(num * 0.75 + 0.5) is 75% of the rows, rounded half up; those rows
        # get tag 1.
        semi_df.loc[random.sample(list(idx), int(num * 0.75 + 0.5)), 'tag'] = 1
    else:
        # Unlabeled
        if k == 'Real':
            # All but 25000 randomly chosen real rows get tag 0. random.sample
            # raises ValueError when the pair has fewer than 25000 rows.
            semi_df.loc[random.sample(list(idx), num - 25000), 'tag'] = 0
        else:
            # The complement of the 75% share (about 25% of the rows) gets tag 0.
            # NOTE(review): the meaning of tag 0 is not stated in this file;
            # presumably held-out rows — confirm.
            semi_df.loc[random.sample(list(idx), num - int(num * 0.75 + 0.5)), 'tag'] = 0

In [None]:
# Display the tagged table as the cell output.
semi_df

In [None]:
# Overwrite 'label' with a running id: one id per (image_source, method) pair,
# assigned in the order of the grouped value_counts index.
label_id = 0

for i, k in semi_df.method.groupby(semi_df.image_source).value_counts().index:
    # Index labels of the rows belonging to this (source, method) pair.
    idx = semi_df[lambda x: x['method'] == k][lambda x: x['image_source'] == i].index
    semi_df.loc[idx, 'label'] = label_id
    label_id += 1

In [None]:
# Renumber the labels so that labels owning tag-1 rows come first, followed by
# labels that only own tag-2 rows, followed by every remaining id in 0..24.
label_map = {}
labeled_key = semi_df[semi_df.tag == 1].label.value_counts().keys()
unlabeled_key = semi_df[semi_df.tag == 2].label.value_counts().keys()

# Next new label id to hand out.
n = 0
for i in labeled_key:
    label_map[i] = n
    n += 1
    
for i in unlabeled_key:
    # Labels that also own tag-1 rows already have their new id.
    if i in label_map:
        continue
    label_map[i] = n
    n += 1

# Give every old id in 0..24 that has not been mapped yet a new id as well, so
# the lookup below cannot fail for labels in that range.
# NOTE(review): 25 presumably covers the number of (source, method) pairs — confirm.
for i in range(0,25):
    if i in label_map:
        continue
    label_map[i] = n
    n += 1

semi_df.label = semi_df.label.apply(lambda x: label_map[x])

In [None]:
# Labels that own at least one tag-1 row, after the renumbering.
labeled_key = semi_df[semi_df.tag == 1].label.value_counts().keys()
labeled_key

In [None]:
# Labels that own at least one tag-2 row, after the renumbering.
unlabeled_key = semi_df[semi_df.tag == 2].label.value_counts().keys()
unlabeled_key

In [None]:
# Show which methods fall under each label, with row counts, ordered by label.
semi_df.method.groupby(semi_df.label).value_counts().sort_index()

In [None]:
# Show the number of rows per tag value.
semi_df.tag.value_counts()

In [None]:
# Show the face_type (1 = real, 0 = fake) row counts within each tag.
semi_df.face_type.groupby(semi_df.tag).value_counts()

In [None]:
# Write the semi-supervised (open-set) metadata without the index column.
semi_df.to_csv(f'{DATA_ROOT}/meta_data/openset_real_fake_large_merge_meta.csv', index=False)

## Statistics

In [None]:
# Report how many entries each third-level directory under DATA_ROOT holds
# (the copy cells lay images out as <forgery type>/<method>/<dataset>).
image_dirs = glob(f'{DATA_ROOT}/*/*/*')
# The loop variable is named image_dir rather than dir so that the builtin
# dir() is not shadowed for the rest of the kernel session.
for image_dir in image_dirs:
    # count the number of images in the directory
    num_images = len(glob(f'{image_dir}/*'))
    print(f'{image_dir} has {num_images} images')

## Change setting of Semi-Supervised Learning

In [None]:
# Load the Protocol2 open-set metadata.
# NOTE(review): no cell in this notebook writes a 'Protocol2_' file; it is
# presumably a renamed copy of openset_real_fake_large_merge_meta.csv — confirm.
meta_path = f'{DATA_ROOT}/meta_data/Protocol2_openset_real_fake_large_merge_meta.csv'
df = pd.read_csv(meta_path)
# Display the loaded table as the cell output.
df

In [None]:
# Show the face_type row counts within each tag before the real rows are removed.
df.face_type.groupby(df.tag).value_counts()

In [None]:
# Drop every real-face row (face_type == 1). The explicit copy makes df an
# independent frame, so the 'label' assignment in the next cell does not
# trigger pandas' SettingWithCopyWarning.
df = df[df.face_type != 1].copy()
# Show the face_type row counts within each tag after the removal.
df.face_type.groupby(df.tag).value_counts()

In [None]:
# Shift labels below 9 down by one and labels from 9 upwards down by two.
# NOTE(review): presumably this closes the gaps left by the two real-face
# labels removed above — confirm against the Protocol2 label table.
df['label'] = df.label.apply(lambda x: x - 1 if x < 9 else x - 2)

In [None]:
# Write the fake-only Protocol1 metadata without the index column.
df.to_csv(f'{DATA_ROOT}/meta_data/Protocol1_openset_fake_large_merge_meta.csv', index=False)