In [None]:
import os
import glob
import random
import math
import numpy as np
random.seed(123)

import cv2
import pandas as pd
import matplotlib.pyplot as plt

from IPython.display import display
from scipy.stats import norm
from sklearn.model_selection import train_test_split

# Load Data

In [None]:
# Competition files: the sample submission and the per-video metadata.
# The metadata JSON is transposed so that its top-level keys become the row index.
s_submission_df = pd.read_csv("../input/deepfake-detection-challenge/sample_submission.csv")
metadata_df = pd.read_json('../input/deepfake-detection-challenge/train_sample_videos/metadata.json').T

# Pre-computed face embeddings, in a plain and an "aligned" variant.
train_embeddings = pd.read_csv("../input/edadfdc/train_embeddings.csv")
test_embeddings = pd.read_csv("../input/edadfdc/test_embeddings.csv")
aligned_train_embeddings = pd.read_csv("../input/edadfdc/aligned_train_embeddings.csv")
aligned_test_embeddings = pd.read_csv("../input/edadfdc/aligned_test_embeddings.csv")

# Attach the label of each video to its embedding rows; column '128' is the
# join key that is matched against the metadata index.
video_labels = metadata_df[['label']]
train_embeddings = train_embeddings.merge(video_labels, left_on='128', right_index=True)
aligned_train_embeddings = aligned_train_embeddings.merge(video_labels, left_on='128', right_index=True)

In [None]:
# Preview the first five metadata rows.
metadata_df.head(5)

In [None]:
def func(pct, allvals):
    """Format a pie-wedge label as a percentage plus the matching absolute count.

    Parameters
    ----------
    pct : float
        Share of the wedge in percent (the value handed to an ``autopct`` callback).
    allvals : sequence of numbers
        All wedge values; their sum is the total that ``pct`` refers to.

    Returns
    -------
    str
        Two-line label: the percentage with one decimal, then the face count.
    """
    # Round rather than truncate: the percentage is a float, so the
    # reconstructed count can land just below the true integer
    # (e.g. 2.999999), and int() alone would report one face too few.
    absolute = int(np.round(pct / 100. * np.sum(allvals)))
    return "{:.1f}%\n({:d} faces)".format(pct, absolute)

fig, axes = plt.subplots(1, 2, figsize=(10, 5))

# Left panel: number of embedding rows (faces) in the train and test sets.
split_sizes = [len(train_embeddings.index), len(test_embeddings.index)]
axes[0].bar(['train', 'test'], split_sizes)
axes[0].set_ylabel('# of faces')

# Right panel: share of REAL vs FAKE faces in the training set.
label_counts = train_embeddings['label'].value_counts()
data = [label_counts['REAL'], label_counts['FAKE']]

wedges, texts, autotexts = axes[1].pie(
    data,
    autopct=lambda pct: func(pct, data),
    textprops={'color': "w"},
)
axes[1].legend(
    wedges,
    ['real', 'fake'],
    title="Label",
    loc="center left",
    bbox_to_anchor=(1, 0, 0.5, 1),
)

plt.show()

# Visualize Data

## Embedding vectors

Clean data

Videos for which the detector reported more than 625 faces need to be removed (we assume that each video contains one or more people, which should yield a maximum of 600 faces).

In [None]:
# Number of embedding rows (detected faces) per video; column '128' identifies the video.
outliers = train_embeddings.groupby('128').size()
# Integer positions of the over-limit videos (kept for backward compatibility).
index = (outliers > 625).to_numpy().nonzero()[0]
# Select the offending videos with a boolean mask: indexing a string-indexed
# Series with integer positions relies on a deprecated positional fallback.
outlier_videos = outliers[outliers > 625].index
# The same videos are dropped from both the plain and the aligned embeddings.
train_embeddings = train_embeddings[~train_embeddings['128'].isin(outlier_videos)]
aligned_train_embeddings = aligned_train_embeddings[~aligned_train_embeddings['128'].isin(outlier_videos)]

In [None]:
# Histogram of how many embedding rows each video contributes.
embeddings_per_video = train_embeddings.groupby('128').size()
plt.hist(embeddings_per_video)
plt.title("Amount of embeddings in a video")
plt.xlabel('# of embeddings')
plt.ylabel('frequency')
plt.show()

Visualize data points

Comparing the not aligned and aligned figures  

In [None]:
from sklearn.decomposition import PCA

def reduce_dims(df, video):
    """Project the embeddings of a fake video and of its original onto 2 dimensions.

    Each video gets its own independently fitted 2-component PCA.

    Parameters
    ----------
    df : pandas.DataFrame
        Embedding rows with feature columns '0'..'127', the video name in
        column '128' and a 'label' column.
    video : str
        Name of the FAKE video; its source video is looked up in the
        'original' column of ``metadata_df``.

    Returns
    -------
    tuple
        ``(fake_emb_reduced, real_emb_reduced, real_video)``.
    """
    feature_cols = np.arange(0, 128).astype(str)

    fake_rows = df.loc[(df['128'] == video) & (df.label == 'FAKE')]
    fake_emb_reduced = PCA(n_components=2).fit_transform(fake_rows[feature_cols])

    # Name of the original (real) video that the fake was derived from.
    real_video = metadata_df.loc[metadata_df.index == video]['original'].values[0]

    real_rows = df.loc[(df['128'] == real_video) & (df.label == 'REAL')]
    real_emb_reduced = PCA(n_components=2).fit_transform(real_rows[feature_cols])

    return fake_emb_reduced, real_emb_reduced, real_video
    
# Fake videos for which both the fake itself and its original have embeddings.
fake_candidates = metadata_df.loc[
    (metadata_df.label=='FAKE') &
    (metadata_df.original.isin(train_embeddings['128'])) &
    (metadata_df.index.isin(train_embeddings['128']))].index

# random.seed() does not seed NumPy, so draw from a locally seeded generator to
# keep the figure reproducible; sample without replacement so that every
# column shows a different video (and never ask for more videos than exist).
sample_rng = np.random.RandomState(123)
fake_samples = sample_rng.choice(fake_candidates, min(4, len(fake_candidates)), replace=False)

fig, axes = plt.subplots(2, 4, figsize=(18, 8))
fig.suptitle('Scatter plot fake/real video embeddings', fontsize=16)
for idx, video in enumerate(fake_samples):
    # Top row: the fake video; bottom row: the original it was derived from.
    fake_emb_reduced, real_emb_reduced, real_video = reduce_dims(train_embeddings, video)
    axes[0, idx].scatter(fake_emb_reduced[:, 0], fake_emb_reduced[:, 1])
    axes[0, idx].set_title(f'{video} (FAKE)')
    axes[1, idx].scatter(real_emb_reduced[:, 0], real_emb_reduced[:, 1], color='pink')
    axes[1, idx].set_title(f'{real_video} (REAL)')
plt.show()

In [None]:
fig, axes = plt.subplots(2, 4, figsize=(18, 8))
fig.suptitle('Scatter plot real/fake aligned video embeddings', fontsize=16)
for idx, video in enumerate(fake_samples):
    # reduce_dims already returns the name of the original video, so no
    # separate metadata lookup is needed here.
    fake_emb_reduced, real_emb_reduced, real_video = reduce_dims(aligned_train_embeddings, video)
    # Top row: the fake video; bottom row: the original it was derived from.
    axes[0, idx].scatter(fake_emb_reduced[:, 0], fake_emb_reduced[:, 1])
    axes[0, idx].set_title(f'{video} (FAKE)')
    axes[1, idx].scatter(real_emb_reduced[:, 0], real_emb_reduced[:, 1], color='pink')
    axes[1, idx].set_title(f'{real_video} (REAL)')
plt.show()

In [None]:
# Per-video spread values, keyed by video name (plain and aligned embeddings).
distances_videos, distances_align_videos = dict(), dict()


def get_mean_distance(df, video):
    """Return the mean Euclidean distance of a video's embeddings to their centroid.

    Parameters
    ----------
    df : pandas.DataFrame
        Embedding rows with feature columns '0'..'127' and the video name in
        column '128'.
    video : str
        Name of the video whose rows are measured.

    Returns
    -------
    float
        Average distance between each embedding of ``video`` and the
        column-wise mean of those embeddings.
    """
    feature_cols = [str(i) for i in range(128)]
    emb = df.loc[df['128'] == video, feature_cols].to_numpy()
    # Take the mean per column explicitly: np.mean(DataFrame) reduces over
    # both axes on recent pandas and would yield a single scalar instead of
    # the 128-dimensional centroid.
    avg_point = emb.mean(axis=0)
    distances = np.linalg.norm(emb - avg_point, axis=1)
    return np.mean(distances)
    
# Spread (mean distance to the centroid) of every video, for both embedding variants.
distances_videos = {
    video: get_mean_distance(train_embeddings, video)
    for video in train_embeddings['128'].unique()
}
distances_align_videos = {
    video: get_mean_distance(aligned_train_embeddings, video)
    for video in aligned_train_embeddings['128'].unique()
}

distances_df = pd.DataFrame(list(distances_videos.items()))
distances_df.columns = ['Video', 'Spread']
distances_df = distances_df.set_index('Video')

al_distances_df = pd.DataFrame(list(distances_align_videos.items()))
al_distances_df.columns = ['Video', 'Spread']
al_distances_df = al_distances_df.set_index('Video')

# Drop any 'Spread' column left over from a previous run of this cell;
# otherwise the merge would create 'Spread_x'/'Spread_y' and later
# ['Spread'] lookups would fail.
train_embeddings = pd.merge(
    left=train_embeddings.drop(columns='Spread', errors='ignore'),
    right=distances_df['Spread'], left_on='128', right_index=True)
aligned_train_embeddings = pd.merge(
    left=aligned_train_embeddings.drop(columns='Spread', errors='ignore'),
    right=al_distances_df['Spread'], left_on='128', right_index=True)

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Select 'Spread' before aggregating: averaging the whole frame would also try
# to average the string 'label' column, which raises a TypeError on pandas >= 2.
fake_spread = train_embeddings.loc[train_embeddings.label=='FAKE'].groupby('128')['Spread'].mean()
real_spread = train_embeddings.loc[train_embeddings.label=='REAL'].groupby('128')['Spread'].mean()
axes[0].hist(fake_spread)
axes[0].set_title('FAKE (not aligned)')
axes[0].set_xlabel('Spread')
axes[1].hist(real_spread, color='pink')
axes[1].set_title('REAL (not aligned)')
axes[1].set_xlabel('Spread')
plt.show()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(12, 4))
# Select 'Spread' before aggregating: averaging the whole frame would also try
# to average the string 'label' column, which raises a TypeError on pandas >= 2.
al_fake_spread = aligned_train_embeddings.loc[aligned_train_embeddings.label=='FAKE'].groupby('128')['Spread'].mean()
al_real_spread = aligned_train_embeddings.loc[aligned_train_embeddings.label=='REAL'].groupby('128')['Spread'].mean()
axes[0].hist(al_fake_spread)
axes[0].set_title('FAKE (aligned)')
axes[0].set_xlabel('Spread')
axes[1].hist(al_real_spread, color='pink')
axes[1].set_title('REAL (aligned)')
axes[1].set_xlabel('Spread')
plt.show()

In [None]:
# Per-video spread split by label. 'Spread' is selected before aggregating so
# that the string 'label' column is never averaged (TypeError on pandas >= 2).
data = [train_embeddings.loc[train_embeddings.label=='FAKE'].groupby('128')['Spread'].mean(),
        train_embeddings.loc[train_embeddings.label=='REAL'].groupby('128')['Spread'].mean()]
al_data = [aligned_train_embeddings.loc[aligned_train_embeddings.label=='FAKE'].groupby('128')['Spread'].mean(),
           aligned_train_embeddings.loc[aligned_train_embeddings.label=='REAL'].groupby('128')['Spread'].mean()]

fig, axes = plt.subplots(1, 2, figsize=(12, 5))
fig.suptitle('Boxplot average distance of points in a video')
b1 = axes[0].boxplot(data, labels=['fake', 'real'], patch_artist=True, widths=[0.3, 0.3])
b2 = axes[1].boxplot(al_data, labels=['fake', 'real'], patch_artist=True, widths=[0.3, 0.3])

axes[0].yaxis.grid(True)
axes[0].set_title('Not Aligned')
axes[1].yaxis.grid(True)
axes[1].set_title('Aligned')

# Colour the second box (real videos) pink, matching the other figures.
b1['boxes'][1].set_facecolor('pink')
b2['boxes'][1].set_facecolor('pink')

plt.show()

In [None]:
from sklearn.cluster import DBSCAN
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import log_loss

from xgboost import XGBClassifier
from sklearn.svm import SVC

def cluster_faces(embeddings_df, eps, min_samples):
    """Cluster the face embeddings of every video with DBSCAN and summarise them.

    Parameters
    ----------
    embeddings_df : pandas.DataFrame
        Embedding rows with feature columns '0'..'127' and the video name in
        column '128'.
    eps : float
        DBSCAN neighbourhood radius.
    min_samples : int
        DBSCAN minimum number of neighbours for a core point.

    Returns
    -------
    list of list
        One row per video in ``metadata_df.index``:
        ``[video, classes, noises, spread, label]`` where ``classes`` is the
        number of clusters found, ``noises`` the number of points DBSCAN
        marked as noise (label -1) and ``spread`` the mean distance of the
        embeddings to their centroid. Videos without embeddings get zeros.
    """
    feature_cols = np.arange(0, 128).astype(str)
    embeddings_data = []
    for video in metadata_df.index:
        label = metadata_df.loc[metadata_df.index==video].label.values[0]
        embeddings = embeddings_df.loc[embeddings_df['128']==video][feature_cols]
        # Bail out before any numeric work: there is nothing to measure or cluster.
        if embeddings.empty:
            embeddings_data.append([video, 0, 0, 0, label])
            continue
        # Column-wise mean: np.mean(DataFrame) reduces over both axes on recent
        # pandas and would produce a scalar rather than the centroid.
        avg_point = embeddings.mean(axis=0)
        distances = np.linalg.norm(embeddings - avg_point, axis=1)
        clf = DBSCAN(eps=eps, min_samples=min_samples, n_jobs=6).fit(embeddings)
        # Shift labels by one so noise (-1) lands in bin 0 and clusters in bins 1..k.
        label_counts = np.bincount(clf.labels_+1)
        classes = label_counts.shape[0] - 1
        noises = label_counts[0]
        embeddings_data.append([video, classes, noises, np.mean(distances), label])
    return embeddings_data

In [None]:
def evaluate_cluster(embeddings_df, eps=0.5, min_samples=100):
    """Cross-validate an RBF SVM on the per-video clustering features.

    The features come from ``cluster_faces`` ('Classes', 'Noises', 'Spread');
    the target is 0 for "REAL" videos and 1 for everything else.

    Returns
    -------
    numpy.ndarray
        The 5-fold ``neg_log_loss`` scores returned by ``cross_val_score``.
    """
    rows = cluster_faces(embeddings_df, eps, min_samples)
    embeddings_data = pd.DataFrame(rows)
    embeddings_data.columns = ['Video', 'Classes', 'Noises', 'Spread', 'Label']

    features = embeddings_data[['Classes', 'Noises', 'Spread']]
    target = embeddings_data.Label.map(lambda lbl: 0 if lbl == "REAL" else 1).values

    # probability=True is required so that log loss can be computed.
    model = SVC(kernel='rbf', gamma=0.7, C=1.0, probability=True)
    return cross_val_score(model, features, target, cv=5, scoring='neg_log_loss')

def evaluate_aligned_cluster(min_samples):
    """Evaluate clustering features on the aligned embeddings.

    ``eps`` is set to the median per-video spread of the REAL videos.
    Returns the cross-validation scores from ``evaluate_cluster``.
    """
    real_rows = aligned_train_embeddings.loc[aligned_train_embeddings.label=='REAL']
    # Select 'Spread' before aggregating so the string 'label' column is not
    # averaged (that raises a TypeError on pandas >= 2).
    eps = np.median(real_rows.groupby('128')['Spread'].mean())
    return evaluate_cluster(aligned_train_embeddings, eps, min_samples)
    
def evaluate_not_aligned_cluster(min_samples):
    """Evaluate clustering features on the non-aligned embeddings.

    ``eps`` is set to the median per-video spread of the REAL videos.
    Returns the cross-validation scores from ``evaluate_cluster``.
    """
    real_rows = train_embeddings.loc[train_embeddings.label=='REAL']
    # Select 'Spread' before aggregating so the string 'label' column is not
    # averaged (that raises a TypeError on pandas >= 2).
    eps = np.median(real_rows.groupby('128')['Spread'].mean())
    return evaluate_cluster(train_embeddings, eps, min_samples)

In [None]:
# Candidate DBSCAN min_samples values to compare.
min_sample_values = np.arange(30, 34, 1)

# Non-aligned embeddings: one array of CV scores per candidate value.
print ('Calculando para dados não alinhados')
score = [evaluate_not_aligned_cluster(value) for value in min_sample_values]
# The scorer returns negative log loss; flip the sign to report the loss itself.
mean_score = [-fold_scores.mean() for fold_scores in score]

# Aligned embeddings.
print ('Calculando para dados alinhados')
aligned_score = [evaluate_aligned_cluster(value) for value in min_sample_values]
aligned_mean_score = [-fold_scores.mean() for fold_scores in aligned_score]

for value, s in zip(min_sample_values, mean_score):
    print (f'[Not aligned] Score for {value}: {s}')
for value, s in zip(min_sample_values, aligned_mean_score):
    print (f'[Aligned] Score for {value}: {s}')

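We are going to start the evaluation with a *min_samples* value of 300, which means that, for a point to belong to the dense region of a cluster, it needs to have at least 300 points within its neighborhood. (Note: the code cells in this notebook evaluate *min_samples* values from 30 to 33 and use 32 and 33 for the final clustering.)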

In [None]:
# eps = median per-video spread of the REAL videos. 'Spread' is selected before
# aggregating so the string 'label' column is not averaged (TypeError on pandas >= 2).
eps = np.median(train_embeddings.loc[train_embeddings.label=='REAL'].groupby('128')['Spread'].mean())
al_eps = np.median(aligned_train_embeddings.loc[aligned_train_embeddings.label=='REAL'].groupby('128')['Spread'].mean())

# Per-video clustering features for the non-aligned embeddings.
embeddings_data_df = pd.DataFrame(cluster_faces(train_embeddings, eps=eps, min_samples=32))
embeddings_data_df.columns = ['Video', 'Classes', 'Noises', 'Spread', 'Label']
display(embeddings_data_df.head(5))

# Per-video clustering features for the aligned embeddings.
al_embeddings_data_df = pd.DataFrame(cluster_faces(aligned_train_embeddings, eps=al_eps, min_samples=33))
al_embeddings_data_df.columns = ['Video', 'Classes', 'Noises', 'Spread', 'Label']
display(al_embeddings_data_df.head(5))

1. Noises

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14,8))

lbls = ['REAL', 'FAKE']
c = ['pink', 'C0']
# One row of panels per embedding variant, one column per label.
variants = [(embeddings_data_df, 'not aligned'), (al_embeddings_data_df, 'aligned')]
for row, (frame, variant) in enumerate(variants):
    for col, (lbl, colour) in enumerate(zip(lbls, c)):
        panel = axes[row, col]
        panel.hist(frame.loc[frame.Label==lbl].Noises.values, color=colour)
        panel.set_xlabel(f'# of noise in {lbl.lower()} videos ({variant})')
        panel.set_ylabel('frequency')

plt.show()

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Point colours follow each frame's own labels (pink = REAL, C0 = anything else).
c = list(map(lambda x: 'pink' if x == 'REAL' else 'C0', embeddings_data_df.Label.values))
al_c = list(map(lambda x: 'pink' if x == 'REAL' else 'C0', al_embeddings_data_df.Label.values))

axes[0].scatter(embeddings_data_df.Noises, embeddings_data_df.Label, c=c)
axes[0].set_xlabel('# of noises')
axes[0].set_title('Target vs Noises in videos')
# The aligned panel must use the aligned frame's labels and colours, not
# those of the non-aligned frame.
axes[1].scatter(al_embeddings_data_df.Noises, al_embeddings_data_df.Label, c=al_c)
axes[1].set_title('Target vs Noises in aligned videos')
axes[1].set_xlabel('# of noises')

plt.show()

2. Classes

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

lbls = ['REAL', 'FAKE']
c = ['pink', 'C0']
# One row of panels per embedding variant, one column per label.
variants = [(embeddings_data_df, 'not aligned'), (al_embeddings_data_df, 'aligned')]
for row, (frame, variant) in enumerate(variants):
    for col, (lbl, colour) in enumerate(zip(lbls, c)):
        panel = axes[row, col]
        panel.hist(frame.loc[frame.Label==lbl].Classes.values, color=colour)
        panel.set_xlabel(f'# of classes in {lbl.lower()} videos ({variant})')

plt.show()

3. Noise vs Classes

In [None]:
fig, axes = plt.subplots(1, 2, figsize=(14, 4))

# Point colours by label: pink for REAL, C0 for everything else.
c = ['pink' if lbl == 'REAL' else 'C0' for lbl in embeddings_data_df.Label.values]
al_c = ['pink' if lbl == 'REAL' else 'C0' for lbl in al_embeddings_data_df.Label.values]

# Left: non-aligned embeddings.
axes[0].scatter(embeddings_data_df.Classes, embeddings_data_df.Noises, c=c)
axes[0].set_xlabel('Classes')
axes[0].set_ylabel('Noise')

# Right: aligned embeddings.
axes[1].scatter(al_embeddings_data_df.Classes, al_embeddings_data_df.Noises, c=al_c)
axes[1].set_xlabel('Classes')

plt.show()

4. Spread vs Noises and Classes

In [None]:
fig, axes = plt.subplots(2, 2, figsize=(14, 8))

# Point colours by label: pink for REAL, C0 for everything else.
c = ['pink' if lbl == 'REAL' else 'C0' for lbl in embeddings_data_df.Label.values]
al_c = ['pink' if lbl == 'REAL' else 'C0' for lbl in al_embeddings_data_df.Label.values]

# Top row: Spread vs Noises (left = non-aligned, right = aligned).
axes[0, 0].scatter(embeddings_data_df.Spread, embeddings_data_df.Noises, c=c)
axes[0, 1].scatter(al_embeddings_data_df.Spread, al_embeddings_data_df.Noises, c=al_c)
axes[0, 0].set_ylabel('Noise')

# Bottom row: Spread vs Classes (left = non-aligned, right = aligned).
axes[1, 0].scatter(embeddings_data_df.Spread, embeddings_data_df.Classes, c=c)
axes[1, 1].scatter(al_embeddings_data_df.Spread, al_embeddings_data_df.Classes, c=al_c)
axes[1, 0].set_ylabel('Classes')
axes[1, 0].set_xlabel('Spread')
axes[1, 1].set_xlabel('Spread')

plt.show()

Ensemble with deep fake score

In [None]:
# Score files, indexed by their column '0'; the single remaining column is
# renamed to 'DFscore'.
dfscore_train_result = pd.read_csv("../input/edadfdc/resnext_dfscore_train_result.csv").set_index('0')
dfscore_test_result = pd.read_csv("../input/edadfdc/resnext_dfscore_test_result.csv").set_index('0')

dfscore_train_result.columns = ['DFscore']
dfscore_test_result.columns = ['DFscore']

dfscore_train_result.head(5)

In [None]:
# Average the score over all rows that share the same index value.
df_score = dfscore_train_result.groupby(dfscore_train_result.index).mean()
# Drop any 'DFscore' column left over from a previous run of this cell;
# otherwise the merge would create 'DFscore_x'/'DFscore_y' and the later
# ['DFscore'] lookup would fail.
al_embeddings_data_df = pd.merge(
    left=al_embeddings_data_df.drop(columns='DFscore', errors='ignore'),
    right=df_score['DFscore'], left_on='Video', right_index=True)

In [None]:
# Write Model
import pickle

# Logistic regression on the clustering features plus the DF score,
# fitted on all available rows (no hold-out evaluation is done here).
clf = LogisticRegression(solver='lbfgs', random_state=0)

# Target: 0 for REAL videos, 1 for everything else.
y = al_embeddings_data_df.Label.apply(lambda x: 0 if x == "REAL" else 1).values
X = al_embeddings_data_df[['Classes', 'Noises', 'Spread', 'DFscore']]
clf.fit(X, y)

# Use a context manager so the file handle is flushed and closed even if
# pickling fails; a bare open() left the handle open.
with open('lr_dfdc_resnext.dat', 'wb') as model_file:
    pickle.dump(clf, model_file)

In [None]:
import pandas as pd

# Histogram of every numeric column of the test-score file.
# NOTE(review): this reads 'dfscore_test_result.csv', whereas the earlier cell
# loads 'resnext_dfscore_test_result.csv' - confirm which file is intended.
submissions = pd.read_csv('../input/edadfdc/dfscore_test_result.csv')
submissions.hist()