In [2]:
import os
import itertools
import pandas as pd

In [3]:
root_dir = "d:/Coding/CZ4041/CZ4041-kaggle" # Set appropriate directory
os.chdir(root_dir)

### Processing Train Relationships

This will effectively serve as our annotations.

In [28]:
df = pd.read_csv("./data/train-relationships/train_relationships.csv")
df.head()

Unnamed: 0,p1,p2
0,F0002/MID1,F0002/MID3
1,F0002/MID2,F0002/MID3
2,F0005/MID1,F0005/MID2
3,F0005/MID3,F0005/MID2
4,F0009/MID1,F0009/MID4


Splitting path into 2 columns.

In [29]:
path1 = df["p1"].str.split("/", n = 1, expand = True)

# making separate first name column from new data frame 
df["Family1"]= path1[0]
# making separate last name column from new data frame 
df["Person1"]= path1[1]

# Dropping old Name columns
df.drop(columns =["p1"], inplace = True)

path2 = df["p2"].str.split("/", n = 1, expand = True)

# making separate first name column from new data frame 
df["Family2"]= path2[0]
# making separate last name column from new data frame 
df["Person2"]= path2[1]

# Dropping old Name columns
df.drop(columns =["p2"], inplace = True)
df.head()

Unnamed: 0,Family1,Person1,Family2,Person2
0,F0002,MID1,F0002,MID3
1,F0002,MID2,F0002,MID3
2,F0005,MID1,F0005,MID2
3,F0005,MID3,F0005,MID2
4,F0009,MID1,F0009,MID4


In [31]:
root_dir = './data/train/'
temp = []

for index, row in df.iterrows():
    dir1 = os.path.join(root_dir, row.Family1, row.Person1)
    dir2 = os.path.join(root_dir, row.Family2, row.Person2)

    dir1_exists_and_non_empty = os.path.exists(dir1) and (len(os.listdir(dir1)) > 0)
    dir2_exists_and_non_empty = os.path.exists(dir2) and (len(os.listdir(dir2)) > 0)

    if not (dir1_exists_and_non_empty and dir2_exists_and_non_empty):
        temp.append(index)
        
print(len(temp))
df = df.drop(temp, axis=0)

236


NOTE: There are 231 rows with non-existent directories and 5 with empty directories. Dropped them.

### Creating Negative Pairs

In [32]:
#A new column in the existing dataframe with all values as 1, since these people are all related
df['Related'] = 1

#Creating a dictionary, and storing members of each family
family_dict = {}
for index, row in df.iterrows():
    if row['Family1'] in family_dict:
        family_dict[row['Family1']].append(row['Person1'])
    else:
        family_dict[row['Family1']] = [row['Person1']]

In [33]:
#For each family in this dictionary, we'll first make pairs of people
#For each pair, we'll check if they're related in our existing Dataset
#If they're not in the dataframe, means we'll create a row with both persons and related value 0
i=1
for key in family_dict:
    pair = itertools.combinations(family_dict[key], 2)
    for item in pair:
        if len(df[(df['Family1']==key)&(df['Person1']==item[0])&(df['Person2']==item[1])])==0 \
        and len(df[(df['Family1']==key)&(df['Person1']==item[1])&(df['Person2']==item[0])])==0:
            new = {'Family1':[key], 'Person1':[item[0]], 'Family2':[key], 'Person2':[item[1]], 'Related':[0]}   # Find a better fix
            df = pd.concat([df, pd.DataFrame(new, columns=df.columns)], ignore_index=True)
        
#Storing rows only where Person1 and Person2 are not same
df = df[(df['Person1']!=df['Person2'])]

print(df['Related'].value_counts())

Related
1    3362
0    1561
Name: count, dtype: int64


Balance the classes.

In [34]:
df

Unnamed: 0,Family1,Person1,Family2,Person2,Related
0,F0002,MID1,F0002,MID3,1
1,F0002,MID2,F0002,MID3,1
2,F0005,MID1,F0005,MID2,1
3,F0005,MID3,F0005,MID2,1
4,F0009,MID1,F0009,MID4,1
...,...,...,...,...,...
5827,F1000,MID2,F1000,MID7,0
5828,F1000,MID3,F1000,MID7,0
5830,F1000,MID4,F1000,MID5,0
5832,F1000,MID5,F1000,MID6,0


In [35]:
extra = df['Related'].value_counts()[1]-df['Related'].value_counts()[0]
while extra>=0:
    rows = df.sample(n=2)
    first = rows.iloc[0,:]
    second = rows.iloc[1,:]
    
    if first.Family1!=second.Family1 and first.Family2!=second.Family2:
        new1 = {'Family1':[first.Family1],'Person1':[first.Person1],'Family2':[second.Family1],'Person2':[second.Person1],'Related':[0]}
        extra=extra-1

        if extra==0:
            break
        
        new2 = {'Family1':[first.Family2],'Person1':[first.Person2],'Family2':[second.Family2],'Person2':[second.Person2],'Related':[0]}
        extra=extra-1
        
        df = pd.concat([df, pd.DataFrame(new1, columns=df.columns)], ignore_index=True)
        df = pd.concat([df, pd.DataFrame(new2, columns=df.columns)], ignore_index=True)

In [36]:
df['Related'].value_counts()

Related
1    3362
0    3361
Name: count, dtype: int64

In [37]:
# Shuffle dataset
df = df.sample(frac=1).reset_index(drop=True)

In [38]:
df

Unnamed: 0,Family1,Person1,Family2,Person2,Related
0,F0227,MID4,F0227,MID3,1
1,F0284,MID6,F0284,MID2,1
2,F0601,MID13,F0601,MID25,0
3,F0368,MID3,F0368,MID1,1
4,F0974,MID7,F0974,MID6,1
...,...,...,...,...,...
6718,F0783,MID5,F0784,MID2,0
6719,F0717,MID3,F0393,MID2,0
6720,F0604,MID1,F0604,MID4,1
6721,F0601,MID39,F0988,MID3,0


Export dataset as csv file.

In [39]:
df.to_csv(path_or_buf="./data/train-relationships/train_relationships_processed.csv", index=False)

### Processing Sample Submission File (Redundant)

This will serve as our submisison template.

Edit: Incorporated in dataloader.

In [4]:
sub = pd.read_csv("./data/submissions/sample_submission.csv")
sub.head()

Unnamed: 0,img_pair,is_related
0,face05508.jpg-face01210.jpg,0
1,face05750.jpg-face00898.jpg,0
2,face05820.jpg-face03938.jpg,0
3,face02104.jpg-face01172.jpg,0
4,face02428.jpg-face05611.jpg,0


In [6]:
type(sub['is_related'][1])

numpy.int64

In [28]:
images = sub["img_pair"].str.split("-", n=1, expand=True)

# making separate first name column from new data frame 
sub["Person1"]= images[0]
# making separate last name column from new data frame 
sub["Person2"]= images[1]

# Dropping old Name columns
sub.head()

                  0              1
0     face05508.jpg  face01210.jpg
1     face05750.jpg  face00898.jpg
2     face05820.jpg  face03938.jpg
3     face02104.jpg  face01172.jpg
4     face02428.jpg  face05611.jpg
...             ...            ...
5305  face99998.jpg  face99993.jpg
5306  face99997.jpg  face99996.jpg
5307  face99997.jpg  face99995.jpg
5308  face99997.jpg  face99994.jpg
5309  face99997.jpg  face99993.jpg

[5310 rows x 2 columns]


Unnamed: 0,img_pair,is_related,Person1,Person2
0,face05508.jpg-face01210.jpg,0,face05508.jpg,face01210.jpg
1,face05750.jpg-face00898.jpg,0,face05750.jpg,face00898.jpg
2,face05820.jpg-face03938.jpg,0,face05820.jpg,face03938.jpg
3,face02104.jpg-face01172.jpg,0,face02104.jpg,face01172.jpg
4,face02428.jpg-face05611.jpg,0,face02428.jpg,face05611.jpg


### Float adjustment

The submission is ranked based on the AUC nor accuracy. Hence ensure the 'is_related' column are saved as floats for better precision.

In [8]:
x = pd.read_csv("./data/submissions/test_submission.csv")
type(x['is_related'][1])

numpy.int64

In [9]:
x['is_related'] = x['is_related'].astype(float)
type(x['is_related'][1])

numpy.float64

In [10]:
x.to_csv(path_or_buf="./data/submissions/test_submission_processed.csv", index=False)