In this assignment, we're going to build on this idea to look for changing patterns in print media. Specifically, we are going to look at the presence in historical newspapers of *pictures of human faces*. This is a culturally meaningful question - how has the prevelance of images of human faces changed in print media over the last roughly 200 years? Are there any significant differences and what might this mean?

You should write code which does the following:

- For each of the three newspapers
    - Go through each page and find how many faces are present
    - Group these results together by *decade* and then save the following:
        - A CSV showing the total number of faces per decade and the percentage of pages for that decade which have faces on them
        - A plot which shows the latter information - i.e. percentage of pages with faces per decade over all of the decades avaiable for that newspaper
- Repeat for the other newspapers

For this task, we are going to use a pretrained CNN model which has been finteuned for face detection. You can see documentation of this model and some starter code about how to get it running at [this website](https://medium.com/@danushidk507/facenet-pytorch-pretrained-pytorch-face-detection-mtcnn-and-facial-recognition-b20af8771144). In particular, you'll want to use the first code block down to the line which detects faces in images:


In [1]:
import os
import pandas as pd
from facenet_pytorch import MTCNN, InceptionResnetV1
import torch
from PIL import Image
import matplotlib.pyplot as plt

In [93]:
def face_detection(image, mtcnn):
    """
    The function detects faces in a given input image using MTCNN.
    """    
    boxes, _ = mtcnn.detect(image)

    return boxes

In [94]:

def process_newspaper(filepath, mtcnn):
    """ 
    The function iterates over the the three newspapers.
    """
    for newpaper in os.listdir(filepath):
        newspaper_path = os.path.join(filepath, newpaper)

        # Initialize data structures for the current newspaper
        newspaper_results = pd.DataFrame(columns=("Decade", "Total Faces", "Total Pages"))

        # Iterate through each newspaper issue
        for issue in os.listdir(newspaper_path):
            issue_year = int(issue.split('-')[1])
            decade = get_decade(issue_year)
            
            print(issue) # JDG-1955-07-29-a-p0005.jpg

            issue_faces = []

            issue_dir = os.path.join(newspaper_path, issue)
            print(issue_dir) # ../in/JDG/JDG-1955-07-29-a-p0005.jpg

            image = Image.open(issue_dir)
            detected_faces = face_detection(image, mtcnn)
            

            # Calculate
            total_faces = sum(issue_faces)
            total_pages = len(issue_faces)
            pages_with_faces = sum([1 for faces in issue_faces if faces > 0])
            percentage_pages_with_faces = (pages_with_faces / total_pages) * 100

            text_row = [decade, total_faces, total_pages]
            newspaper_results.loc[len(newspaper_results)] = text_row


        csv_outpath = os.path.join("out", f"{newspaper}_results.csv")

    return print("The results has been saved to the out folder")


In [45]:
def initialize_MTCNN():
    """
    Initialize Multi-Task Cascaded Convolutional Neural Networks (MTCNN) for face detection
    """
    mtcnn = MTCNN(keep_all = True)
    return mtcnn

def load_classifier():
    """
    Load pre-trained FaceNet model
    """
    resnet = InceptionResnetV1(pretrained = 'casia-webface').eval()
    return resnet

def df_structure_results():
    """
    total number of faces per dacade
    percentage of pages for that decade which have faces on them
    """
    results = pd.DataFrame(columns=("Newspaper", "Decade", "Total Faces", "Pages with Faces (%)"))
    #results = pd.DataFrame(columns=("Newspaper", "Decade", "Total Faces", "Pages with Faces (%)"))
    return results


In [46]:
def get_decade(year):
    """
    Get the decade from a given year, ie. 1789 = 80
    """
    decade = str(year)[2:3] + "0"
    return decade

In [None]:
issue = "GDL-1789-02-05-a-p0001.jpg"
issue_year = int(issue.split('-')[1])
issue_year

decade = get_decade(issue_year)
decade

In [None]:
# A plot which shows the latter information - i.e. percentage of pages with faces per decade over
#  all of the decades avaiable for that newspaper

def plot():
    """
    Plot the percentage of pages with faces per decade 
    """

    # x = decade
    # y = percentage of pages with faces 

    plt.title(f'Percentage of pages with faces per pecade - {newspaper}')
    plt.xlabel('Decade')
    plt.ylabel('Pages with faces (%)')

    plt.savefig("out", f'{newspaper}_faces_by_decade_plot.png')
    #plt.show()

    return print("The plot has been saved to the out folder")


In [None]:
  
def main():

    filepath = os.path.join("../in")

    mtcnn = initialize_MTCNN()
    resnet = load_classifier()

    results = df_structure_results()

    process_newspaper(filepath, mtcnn)

if __name__ == "__main__":
    main()

In [96]:
df = os.path.join("../in")
process_newspaper(df, mtcnn)

../in/JDG
JDG-1955-07-29-a-p0005.jpg
../in/JDG/JDG-1955-07-29-a-p0005.jpg
<PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=1378x2000 at 0x7F66B127FC40>
None


ZeroDivisionError: division by zero

In [70]:
mtcnn = MTCNN(keep_all = True)
image = Image.open("../in/GDL/GDL-1798-02-05-a-p0001.jpg")
type(image)

PIL.JpegImagePlugin.JpegImageFile