In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.autograd import Variable
from sklearn.preprocessing import StandardScaler

%matplotlib inline                               


In [33]:
path = "./FIDs-features/"


def _load_pair_features(first_paths, second_paths, dim=512):
    """Stack pickled embedding pairs into one ``(n_pairs, 2 * dim)`` array.

    Every path is read with ``pd.read_pickle`` and reshaped to ``(1, dim)``;
    the two embeddings of a pair are joined side by side. Rows are collected
    in a list and concatenated once, rather than re-allocating the whole
    array on every iteration. Empty input yields a ``(0, 2 * dim)`` array.
    """
    rows = []
    for first, second in zip(first_paths, second_paths):
        # NOTE(review): read_pickle can execute code embedded in the file;
        # only point this at feature files you generated yourself.
        left = pd.read_pickle(first).reshape(1, dim)
        right = pd.read_pickle(second).reshape(1, dim)
        rows.append(np.concatenate([left, right], axis=1))
    if not rows:
        return np.empty((0, 2 * dim))
    return np.concatenate(rows, axis=0)


def process(df, rType, seed=None):
    """Write the related / unrelated pair-feature files for one relation type.

    Parameters
    ----------
    df : pandas.DataFrame
        Pair list with string columns ``p1`` and ``p2`` holding ``.jpg`` paths
        relative to the module-level ``path``. The first five characters of
        ``p1`` are taken as the family id. The frame is not modified.
    rType : str
        Tag inserted into the output file names ``Related_<rType>_1024.pkl``
        and ``Unrelated_<rType>_1024.pkl`` (written to the working directory).
    seed : int, optional
        Seed for the negative-pair sampling. ``None`` (the default) draws a
        fresh random sample on every call.

    Notes
    -----
    Negative pairs are built by independently resampling ``p1`` and ``p2``
    with replacement and keeping only rows whose two family ids differ, so
    the unrelated file usually has fewer rows than the related one.
    """
    # Work on a copy: the caller's frame must not pick up the path prefix,
    # otherwise a second call would prepend `path` twice.
    pairs = df.copy()
    pairs['FID'] = pairs['p1'].str[:5]
    for col in ('p1', 'p2'):
        # regex=False: '.jpg' is a literal suffix, not a pattern where '.'
        # matches any character (the default on older pandas).
        pairs[col] = (path + pairs[col]).str.replace('.jpg', '.pkl', regex=False)

    # One shared generator so the two draws differ even when seeded.
    rng = np.random.RandomState(seed)
    n_pairs = pairs.shape[0]
    sample1 = pairs.sample(n=n_pairs, replace=True, random_state=rng)[['p1', 'FID']].reset_index(drop=True)
    sample2 = pairs.sample(n=n_pairs, replace=True, random_state=rng)[['p2', 'FID']].reset_index(drop=True)
    sample2.columns = ['p2', 'FID2']
    unrelated = pd.concat([sample1, sample2], axis=1)
    unrelated = unrelated[unrelated['FID'] != unrelated['FID2']][['p1', 'p2']]

    ###############################################################
    # Related file
    related_fs_set = _load_pair_features(pairs['p1'].tolist(), pairs['p2'].tolist())
    pd.DataFrame(related_fs_set).to_pickle("Related_{}_1024.pkl".format(rType))

    ###############################################################
    # UnRelated file
    unrelated_fs_set = _load_pair_features(unrelated['p1'].tolist(), unrelated['p2'].tolist())
    pd.DataFrame(unrelated_fs_set).to_pickle("Unrelated_{}_1024.pkl".format(rType))
  
# Example: build the related / unrelated feature files for each of the four
# direct relationship types, one pair-list pickle per type.
pair_list_files = (
    ("fs", "./lists/pairs/pickles/Direct/fs-faces.pkl"),
    ("fd", "./lists/pairs/pickles/Direct/fd-faces.pkl"),
    ("ms", "./lists/pairs/pickles/Direct/ms-faces.pkl"),
    ("md", "./lists/pairs/pickles/Direct/md-faces.pkl"),
)
for relation_tag, pair_list_file in pair_list_files:
    process(pd.read_pickle(pair_list_file), relation_tag)

In [2]:
# Seed for the final row shuffle so the train/valid/test split is reproducible.
SHUFFLE_SEED = 42


def _load_labelled_pairs(rel_type):
    """Load the related and unrelated feature files of one relation type.

    Reads ``./Related_<rel_type>_1024.pkl`` and ``./Unrelated_<rel_type>_1024.pkl``,
    tags the rows with ``class`` 1 (related) or 0 (unrelated), stacks them and
    sets ``label`` to ``rel_type`` on every row (unrelated rows included).
    """
    related = pd.read_pickle("./Related_{}_1024.pkl".format(rel_type))
    related['class'] = 1
    unrelated = pd.read_pickle("./Unrelated_{}_1024.pkl".format(rel_type))
    # Must be 'class' for every relation type; writing the 0 into 'label'
    # instead leaves NaN in 'class' for the unrelated rows after the concat.
    unrelated['class'] = 0
    labelled = pd.concat([related, unrelated])
    labelled['label'] = rel_type
    return labelled


df_fs = _load_labelled_pairs('fs')
df_fd = _load_labelled_pairs('fd')
df_ms = _load_labelled_pairs('ms')
df_md = _load_labelled_pairs('md')

# Stack all relation types and shuffle the rows; the later cells split this
# frame by row position.
df = pd.concat([df_fs, df_fd, df_ms, df_md])
df = df.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)

In [9]:
## Family DataSet
# Row boundaries of the train / validation / test split over the shuffled
# frame, and the mini-batch size shared by all three loaders.
TRAIN_END = 300000
VALID_END = 370000
BATCH_SIZE = 200

# Fail early with a clear message instead of silently producing an empty
# validation or test split.
assert len(df) > VALID_END, "df has too few rows for the configured split"

# Drop the two target columns once; doing it per split copies the full
# frame three times.
features = df.drop(['label', 'class'], axis=1).values.astype(np.float64)
X_train = features[:TRAIN_END]
X_valid = features[TRAIN_END:VALID_END]
X_test = features[VALID_END:]

# Fit the scaler on the training rows only, then apply it to the others.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_valid = scaler.transform(X_valid)
X_test = scaler.transform(X_test)
del features  # the unscaled copy is no longer needed

# Binary target: the 'class' column (1 = related, 0 = unrelated).
family_targets = df['class'].values.astype(np.int64)
y_train = family_targets[:TRAIN_END]
y_valid = family_targets[TRAIN_END:VALID_END]
y_test = family_targets[VALID_END:]

train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train))
valid_dataset = TensorDataset(torch.from_numpy(X_valid).float(), torch.from_numpy(y_valid))
test_dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test))

# Only the training loader reshuffles between epochs.
loaders = {
    'train': DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True),
    'valid': DataLoader(valid_dataset, batch_size=BATCH_SIZE),
    'test': DataLoader(test_dataset, batch_size=BATCH_SIZE),
}

In [10]:
## relation DataSet
# Integer class index for each relationship tag stored in df['label'].
realation_dict = {'fs': 0, 'fd': 1, 'ms': 2, 'md': 3}

# Encode the tags once, then cut the same row ranges as the feature split.
relation_codes = df['label'].map(realation_dict)
y_train = relation_codes.iloc[:300000].values.astype(np.int64)
y_valid = relation_codes.iloc[300000:370000].values.astype(np.int64)
y_test = relation_codes.iloc[370000:].values.astype(np.int64)

# Reuses the scaled feature matrices (X_train / X_valid / X_test) produced by
# the family-dataset cell; only the targets differ.
train_dataset = TensorDataset(torch.from_numpy(X_train).float(), torch.from_numpy(y_train))
valid_dataset = TensorDataset(torch.from_numpy(X_valid).float(), torch.from_numpy(y_valid))
test_dataset = TensorDataset(torch.from_numpy(X_test).float(), torch.from_numpy(y_test))

relation_loaders = {
    'train': DataLoader(train_dataset, batch_size=200, shuffle=True),
    'valid': DataLoader(valid_dataset, batch_size=200),
    'test': DataLoader(test_dataset, batch_size=200),
}

In [None]:
def getLabel(row):
    """Return the relationship tag of a related pair, otherwise ``None``.

    ``row`` is anything indexable by the keys ``'class'`` and ``'label'``
    (a dict or a DataFrame row).

    NOTE(review): the original cell was an unfinished sketch that did not
    parse (no parameter, ``=`` used as a comparison, empty ``else``). The
    behaviour for ``class != 1`` was never written; ``None`` is a
    placeholder -- confirm the intended value.
    """
    if row['class'] == 1:
        return row['label']
    # TODO: decide what an unrelated pair should map to.
    return None


## Family Model Development

In [8]:
class Net(nn.Module):
    """Fully connected classifier: 1024 -> 512 -> 128 -> 64 -> 2.

    Each hidden layer is followed by ReLU and dropout (p=0.5); the output
    layer is followed by a log-softmax over dim 1, so ``forward`` returns
    log-probabilities.
    """

    def __init__(self):
        super().__init__()
        # Hidden stack.
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 64)
        # Two-way output head.
        self.output = nn.Linear(64, 2)

        # Parameter-free layers shared by every stage of the forward pass.
        self.dropout = nn.Dropout(p=0.5)
        self.logSoftMax = nn.LogSoftmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a ``(batch, 1024)`` input to ``(batch, 2)`` log-probabilities."""
        hidden = x
        for layer in (self.fc1, self.fc2, self.fc3):
            hidden = self.dropout(self.relu(layer(hidden)))
        logits = self.output(hidden)
        return self.logSoftMax(logits)

# Build the family classifier and move it to the GPU when CUDA is available.
use_cuda = torch.cuda.is_available()
model = Net()
if use_cuda:
    model = model.cuda()

In [9]:
# Negative log-likelihood loss, applied to the log-probabilities the network
# returns from its LogSoftmax layer.
criterion = nn.NLLLoss()
# Plain SGD over all parameters of the family model.
optimizer = optim.SGD(params=model.parameters(), lr=0.01)

In [10]:
"""returns trained model"""
# Train the family model for 200 epochs and checkpoint its weights every time
# the mean validation loss improves.
# initialize tracker for minimum validation loss
# (float('inf') instead of np.Inf: that alias was removed in NumPy 2.0)
valid_loss_min = float('inf')

for epoch in range(200):
    # initialize variables to monitor training and validation loss
    train_loss = 0.0
    valid_loss = 0.0

    ###################
    # train the model #
    ###################
    model.train()
    for batch_idx, (data, target) in enumerate(loaders['train']):
        optimizer.zero_grad()
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        ## find the loss and update the model parameters accordingly
        pred = model(data)
        loss = criterion(pred, target)
        loss.backward()
        optimizer.step()
        ## incremental mean of the batch losses; .item() keeps the running
        ## value a plain Python float instead of a (possibly GPU) tensor
        train_loss += (loss.item() - train_loss) / (batch_idx + 1)

    ######################
    # validate the model #
    ######################
    model.eval()
    # no_grad: validation needs no autograd graph, which saves memory and time
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(loaders['valid']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            ## update the average validation loss
            pred = model(data)
            loss = criterion(pred, target)
            valid_loss += (loss.item() - valid_loss) / (batch_idx + 1)
    if epoch % 10 == 0:
        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

    ## save the model if validation loss has decreased
    ## (the message is only printed every 10th epoch; the save happens always)
    if valid_loss < valid_loss_min:
        if epoch % 10 == 0:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(model.state_dict(), "checkpoint.cpt")
        valid_loss_min = valid_loss

Epoch: 0 	Training Loss: 0.693051 	Validation Loss: 0.690293
Validation loss decreased (inf --> 0.690293).  Saving model ...
Epoch: 10 	Training Loss: 0.545161 	Validation Loss: 0.511763
Validation loss decreased (0.524733 --> 0.511763).  Saving model ...
Epoch: 20 	Training Loss: 0.417602 	Validation Loss: 0.364236
Validation loss decreased (0.378216 --> 0.364236).  Saving model ...
Epoch: 30 	Training Loss: 0.336811 	Validation Loss: 0.282483
Validation loss decreased (0.284864 --> 0.282483).  Saving model ...
Epoch: 40 	Training Loss: 0.288169 	Validation Loss: 0.239538
Validation loss decreased (0.244177 --> 0.239538).  Saving model ...
Epoch: 50 	Training Loss: 0.257086 	Validation Loss: 0.211998
Validation loss decreased (0.214534 --> 0.211998).  Saving model ...
Epoch: 60 	Training Loss: 0.229341 	Validation Loss: 0.193628
Validation loss decreased (0.195296 --> 0.193628).  Saving model ...
Epoch: 70 	Training Loss: 0.210941 	Validation Loss: 0.177830
Validation loss decreased (

In [12]:
# Restore the best checkpoint. map_location='cpu' lets a checkpoint saved on a
# GPU machine load anywhere; load_state_dict then copies the weights onto
# whatever device the model already lives on.
model.load_state_dict(torch.load('checkpoint.cpt', map_location='cpu'))
# monitor test loss and accuracy
test_loss = 0.
correct = 0
total = 0

model.eval()
# no_grad: evaluation needs no autograd graph
with torch.no_grad():
    for batch_idx, (data, target) in enumerate(loaders['test']):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = model(data)
        # calculate the loss
        loss = criterion(output, target)
        # update average test loss (incremental mean over batches)
        test_loss += (loss.item() - test_loss) / (batch_idx + 1)
        # predicted class = index of the largest log-probability
        pred = output.argmax(dim=1)
        # count predictions that match the true label
        correct += (pred == target).sum().item()
        total += data.size(0)

print('Test Loss: {:.6f}\n'.format(test_loss))

print('\nTest Accuracy: %2d%% (%2d/%2d)' % (
    100. * correct / total, correct, total))

Test Loss: 0.124207


Test Accuracy: 95% (65273/68579)


## Relationship Model

In [14]:
class TypeModel(nn.Module):
    """Fully connected classifier: 1024 -> 512 -> 128 -> 64 -> 4.

    Same layout as the family network but with a four-way output head.
    Hidden layers use ReLU followed by dropout (p=0.5); the output passes
    through a log-softmax over dim 1.
    """

    def __init__(self):
        super().__init__()
        # Hidden stack.
        self.fc1 = nn.Linear(1024, 512)
        self.fc2 = nn.Linear(512, 128)
        self.fc3 = nn.Linear(128, 64)
        # Four-way output head.
        self.output = nn.Linear(64, 4)

        # Parameter-free layers reused at every stage.
        self.dropout = nn.Dropout(p=0.5)
        self.logSoftMax = nn.LogSoftmax(dim=1)
        self.relu = nn.ReLU()

    def forward(self, x):
        """Map a ``(batch, 1024)`` input to ``(batch, 4)`` log-probabilities."""
        activations = x
        for fc in (self.fc1, self.fc2, self.fc3):
            activations = self.dropout(self.relu(fc(activations)))
        scores = self.output(activations)
        return self.logSoftMax(scores)

# Build the relationship-type classifier and move it to the GPU when CUDA is
# available (use_cuda is re-evaluated here, as in the family-model cell).
use_cuda = torch.cuda.is_available()
relation_model = TypeModel()
if use_cuda:
    relation_model = relation_model.cuda()

In [15]:
# Negative log-likelihood loss over the log-probabilities the relationship
# model returns from its LogSoftmax layer.
relation_criterion = nn.NLLLoss()
# Plain SGD over all parameters of the relationship model.
relation_optimizer = optim.SGD(params=relation_model.parameters(), lr=0.01)

In [16]:
"""returns trained model"""
# Train the relationship model for 30 epochs and checkpoint its weights every
# time the mean validation loss improves.
# initialize tracker for minimum validation loss
# (float('inf') instead of np.Inf: that alias was removed in NumPy 2.0)
valid_loss_min = float('inf')

for epoch in range(30):
    # initialize variables to monitor training and validation loss
    train_loss = 0.0
    valid_loss = 0.0

    ###################
    # train the model #
    ###################
    relation_model.train()
    for batch_idx, (data, target) in enumerate(relation_loaders['train']):
        relation_optimizer.zero_grad()
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        ## find the loss and update the model parameters accordingly
        pred = relation_model(data)
        loss = relation_criterion(pred, target)
        loss.backward()
        relation_optimizer.step()
        ## incremental mean of the batch losses; .item() keeps the running
        ## value a plain Python float instead of a (possibly GPU) tensor
        train_loss += (loss.item() - train_loss) / (batch_idx + 1)

    ######################
    # validate the model #
    ######################
    relation_model.eval()
    # no_grad: validation needs no autograd graph, which saves memory and time
    with torch.no_grad():
        for batch_idx, (data, target) in enumerate(relation_loaders['valid']):
            # move to GPU
            if use_cuda:
                data, target = data.cuda(), target.cuda()
            ## update the average validation loss
            pred = relation_model(data)
            loss = relation_criterion(pred, target)
            valid_loss += (loss.item() - valid_loss) / (batch_idx + 1)
    if epoch % 10 == 0:
        # print training/validation statistics
        print('Epoch: {} \tTraining Loss: {:.6f} \tValidation Loss: {:.6f}'.format(
            epoch,
            train_loss,
            valid_loss
            ))

    ## save the model if validation loss has decreased
    ## (the message is only printed every 10th epoch; the save happens always)
    if valid_loss < valid_loss_min:
        if epoch % 10 == 0:
            print('Validation loss decreased ({:.6f} --> {:.6f}).  Saving model ...'.format(
            valid_loss_min,
            valid_loss))
        torch.save(relation_model.state_dict(), "relation_checkpoint.cpt")
        valid_loss_min = valid_loss

Epoch: 0 	Training Loss: 1.273120 	Validation Loss: 0.941659
Validation loss decreased (inf --> 0.941659).  Saving model ...
Epoch: 10 	Training Loss: 0.136390 	Validation Loss: 0.055443
Validation loss decreased (0.064613 --> 0.055443).  Saving model ...
Epoch: 20 	Training Loss: 0.062214 	Validation Loss: 0.016665
Validation loss decreased (0.018802 --> 0.016665).  Saving model ...


In [17]:
# Restore the best checkpoint. map_location='cpu' lets a checkpoint saved on a
# GPU machine load anywhere; load_state_dict then copies the weights onto
# whatever device the model already lives on.
relation_model.load_state_dict(torch.load('relation_checkpoint.cpt', map_location='cpu'))
# monitor test loss and accuracy
test_loss = 0.
correct = 0
total = 0

relation_model.eval()
# no_grad: evaluation needs no autograd graph
with torch.no_grad():
    for batch_idx, (data, target) in enumerate(relation_loaders['test']):
        # move to GPU
        if use_cuda:
            data, target = data.cuda(), target.cuda()
        # forward pass: compute predicted outputs by passing inputs to the model
        output = relation_model(data)
        # calculate the loss
        loss = relation_criterion(output, target)
        # update average test loss (incremental mean over batches)
        test_loss += (loss.item() - test_loss) / (batch_idx + 1)
        # predicted class = index of the largest log-probability
        pred = output.argmax(dim=1)
        # count predictions that match the true label
        correct += (pred == target).sum().item()
        total += data.size(0)

print('Test Loss: {:.6f}\n'.format(test_loss))

print('\nTest Accuracy: %2d%% (%2d/%2d)' % (
    100. * correct / total, correct, total))

Test Loss: 0.008401


Test Accuracy: 99% (68403/68579)
