### This notebook computes and saves file information for pre-processing
Two objectives for this notebook: 
+ 1. Get checksum information of every video and image file for deduplication
- 2. Get filesize information of image data to filter for images that display ads content


Save the information into video and image info tables respectively

In [1]:
import os
import pandas as pd
import hashlib

In [5]:
video_source_path = 'my-video-dir'
image_source_path = 'my-image-dir'

### Function to collect filepaths of advertising media from local directory 

In [29]:
def search_files(directory, filetype=None):
    '''
    Take a root directory and target filetype (e.g. '.mp4', '.png')
    
    Search recursively in the directory and return a list of files under the directory.  
    '''
    filepaths = []
    for root, dirs, files in os.walk(directory):
        for file in files:
            if filetype:
                if file.endswith(filetype):
                    filepaths.append(os.path.join(root,file))
            else:
                filepaths.append(os.path.join(root,file))
    
    return filepaths

### Functions to get checksum information of media files to prepare for deduplication

In [15]:
# Get the MD5 checksum value (message-digest algorithm that produces the message digest of inputs) of a file
# Caveat: MD5's use as cryptographic checksum runs risk of encryption collision https://www.techtarget.com/searchsecurity/definition/MD5

# Code reference: 
# 1. https://github.com/thorrak/brewflasher/blob/master/fhash.py 
# 2. https://stackoverflow.com/questions/3431825/generating-an-md5-checksum-of-a-file

def file_as_blockiter(afile, blocksize=65536):
    with afile:
        block = afile.read(blocksize)
        while len(block) > 0:
            yield block
            block = afile.read(blocksize)

def hash_bytestr_iter(bytesiter, hasher, ashexstr=True):
    for block in bytesiter:
        hasher.update(block)
    return hasher.hexdigest() if ashexstr else hasher.digest()

def checksum_of_file(filepath, blocksize=65536, ashexstr=True):
    '''
    Take the path of a file path and return the text sha256 hash of the file 
    '''
    with open(filepath, 'rb') as f:
        checksum = hash_bytestr_iter(file_as_blockiter(f, blocksize=blocksize), hashlib.sha256(), ashexstr=ashexstr)
    return checksum


## Create video information table
Get checksum info (sha256 hash value in text string) for every video file

In [None]:
# Get video paths
video_filepaths = search_files(video_source_path, '.mp4')

In [38]:
# Get video checksum values
video_checksum = [checksum_of_file(path) for path in video_filepaths]

In [41]:
# Create table
video_info_table = pd.DataFrame({
    'filepath': video_filepaths,
    'filename': [os.path.basename(path) for path in video_filepaths], # this is not necessary since we have filepath
    'checksum': video_checksum,
})

In [43]:
# Preview table
video_info_table.head()

Unnamed: 0,filepath,filename,checksum
0,/data/1/wesmediafowler/projects/AdMedia/Google...,nIAa2wx0y2Q.mp4,3eb256475d44d124d8d671b15460066e767a87efaea137...
1,/data/1/wesmediafowler/projects/AdMedia/Google...,ZL_4kcX7kIM.mp4,16270b71732f94fd4a9bc2169f76cf94010d4e0bacbe1d...
2,/data/1/wesmediafowler/projects/AdMedia/Google...,QDt2eixQ-7E.mp4,2f5617ad11a6c8900a106261d9e56d5eb34c750c159312...
3,/data/1/wesmediafowler/projects/AdMedia/Google...,wAE3Adxg8z4.mp4,bc61eec68d979c23eacdee58c5df66a915c669d93283e8...
4,/data/1/wesmediafowler/projects/AdMedia/Google...,31E4r62s34c.mp4,d34b9f98fd5a1084181969b3ee61ebcd9f40bc79b2593f...


In [None]:
# Save table
outfile = 'outfile.csv' # google2022_video_info.csv for Google 2022 video ad data
video_info_table.to_csv(f'../output/{outfile}', index=False)

## Create image file information table

Get MD5 checksum information (sha256 hash value in text string) and file size for every image file

In [None]:
# Get image file paths
image_filepaths = search_files(image_source_path)

In [46]:
# Get image checksum info
image_checksum = [checksum_of_file(path) for path in image_filepaths]

In [47]:
# Get image filesize info
image_filesize = [os.path.getsize(path) for path in image_filepaths]

In [50]:
# Create table

image_info_table = pd.DataFrame({
    "filepath": image_filepaths,
    "filename": [os.path.basename(path) for path in image_filepaths], # not necessary
    "checksum": image_checksum,
    "filesize": image_filesize
})


In [51]:
# Preview table
image_info_table.head(4)

Unnamed: 0,filepath,filename,checksum,filesize
0,/data/1/wesmediafowler/projects/AdMedia/Google...,CR00189532233848061953_screenshot.png,ac285eefadca1531c92638cc4e3662f3e20c06ee8638e7...,73866
1,/data/1/wesmediafowler/projects/AdMedia/Google...,CR00189532233848061953_hqimg_0.png,5e7c33fc965c222ab5ad92a4b26e9be7938f271faa96bc...,39850
2,/data/1/wesmediafowler/projects/AdMedia/Google...,CR00189532233848061953_hqimg_1.png,cdf8972333e05c9b0cc42f6a83d1153ef1aab75742f473...,13129
3,/data/1/wesmediafowler/projects/AdMedia/Google...,CR00047879951818424321_screenshot.png,78c1c9a513e4ac14c356b570c3d6f6741bd6a5c042dec0...,68964


In [None]:
# Save table
outfile = 'outfile.csv' # google2022_image_info.csv for Google 2022 image ad data
image_info_table.to_csv(f'../output/{outfile}', index=False)