In [1]:
import os
import pandas as pd

## Load data saved from step 1 (notebook 01-get-checksum-for-deduplication)

In [None]:
# Locations of the CSV files written by the first notebook
# (01-get-checksum-for-deduplication). Replace the placeholders with your own paths.
image_info_path = 'my-image-info-path.csv'
video_info_path = 'my-video-info-path.csv'

In [None]:
# Load the checksum tables produced in step 1, one per media type.
# Each table is read from its own file: the image table comes from image_info_path,
# not from the video file.
video_info_table = pd.read_csv(video_info_path)
image_info_table = pd.read_csv(image_info_path)

## Deduplicate based on checksum information for videos

In [None]:
# Keep one row per distinct checksum; when several rows share a checksum,
# the last occurrence is the one retained.
video_info_table = video_info_table.drop_duplicates(
    subset=['checksum'],
    keep='last',
)

## Deduplication and additional preprocessing steps for image data

In [None]:
'''
1. First pass: drop rows that repeat the same (filename, checksum) pair,
keeping the last occurrence of each pair
'''
image_info_table = image_info_table.drop_duplicates(
    subset=['filename', 'checksum'],
    keep='last',
)

In [None]:
'''
2. Filenames are often composed of an ad id, followed by an underscore and then the filetype,
so many files share the same prefix (ad id) and differ only in their suffix.
Extract the ad id from each filename into its own column:
'''
# Split from the right on "_" (at most two splits) and keep the leftmost piece.
image_info_table['ad_id'] = (
    image_info_table['filename']
    .str.rsplit("_", n=2, expand=True)
    .iloc[:, 0]
)

In [None]:
'''
3. Inspect the filetypes encoded in the filenames so that screenshot images can be excluded
(screenshot filenames have 'screenshot.png' after the ad id and '_')
'''
# Count how often each value appears in the second piece of the right-split filename.
(
    image_info_table['filename']
    .str.rsplit("_", n=2, expand=True)
    .iloc[:, 1]
    .value_counts()
)

In [None]:
'''
4. After identifying the screenshot filetypes, exclude them from the sample to be analyzed,
and obtain an updated image information table
'''
# Filetype labels to drop. 'screenshot.png' is the label described in step 3;
# adjust this list to match whatever screenshot labels the step 3 counts reveal.
screenshot_filetypes = ['screenshot.png']

# Same filename piece that step 3 inspects (second column of the right-split filename).
image_filetypes = image_info_table['filename'].str.rsplit("_", n=2, expand=True).iloc[:, 1]

# Rows whose filetype is missing (no "_" in the filename) are kept: isin() is False for them.
updated_image_info_table = image_info_table[~image_filetypes.isin(screenshot_filetypes)]

In [None]:
'''
5. Deduplicate once again based on checksum alone (after dropping screenshot filetypes)
'''
# drop_duplicates returns a new DataFrame rather than modifying the table in place,
# so the result is assigned back; otherwise it would only be displayed and then lost.
updated_image_info_table = updated_image_info_table.drop_duplicates(subset=['checksum'], keep='last')

# Display the deduplicated table as the cell output.
updated_image_info_table

### Notes 

For Facebook image data: 
The same ad_id often returns multiple images. Some are the content of the posts (the political advertising content we are interested in); others are profile pictures of the posters or other thumbnail images that are irrelevant to the political ad content. 

How to filter for relevant image data depends on users' use cases and judgment. To keep only the content of the posts, remove image files whose file sizes are significantly smaller than those of the other images under the same ad_id. 