# Notebook: Analyse Image Corpus

This notebook is used to analyse the crawled cropus of images.

## Packages

In [1]:
import pandas as pd
import nltk
import math
import os

In [2]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Parameters

Exchange `/img_dataset_mentions` with `/img_dataset_politicians`

In [3]:
PHOTOS_POLITICIANS_PATH = "../Datasets/img_dataset_mentions/"
PARTIES = ["CDU_CSU", "SPD", "AFD", "FDP", "GRUENE", "LINKE"]

## Code

### 1. Create Statistic for each Party

In [4]:
df = pd.read_csv(PHOTOS_POLITICIANS_PATH + "images_dataset.csv")
df["source_party"].unique()

array(['CDU_CSU', 'SPD', 'AFD', 'FDP', 'GRUENE', 'LINKE'], dtype=object)

In [5]:
df

Unnamed: 0.1,Unnamed: 0,tweet_id,image_index,filename,extracted_text,url,image_path,source_party,source_account,date
0,0,1345866502268985354,0,1345866502268985354_0.jpg,,https://pbs.twimg.com/media/Eq16c2qXYAwa__x.jpg,../Datasets/img_dataset_mentions/CDU_CSU/Armin...,CDU_CSU,ArminLaschet,2021-01-03 22:55:53
1,1,1345863370579320832,0,1345863370579320832_0.jpg,Asyl- |\nmißbrauch\n\n,https://pbs.twimg.com/media/Eq13meUXEAMWanr.jpg,../Datasets/img_dataset_mentions/CDU_CSU/Armin...,CDU_CSU,ArminLaschet,2021-01-03 22:43:27
2,2,1345860999602184196,0,1345860999602184196_0.jpg,"kann, dass dieses '\n\nGesindel\n\nwieder vers...",https://pbs.twimg.com/media/Eq11cdkW8AU8leb.jpg,../Datasets/img_dataset_mentions/CDU_CSU/Armin...,CDU_CSU,ArminLaschet,2021-01-03 22:34:01
3,3,1345841888289550345,0,1345841888289550345_0.jpg,"25. Februar 2011, 11:45 Uhr FDPrudert zurück\n...",https://pbs.twimg.com/media/Eq1kEA2XUAAyW_f.jpg,../Datasets/img_dataset_mentions/CDU_CSU/Armin...,CDU_CSU,ArminLaschet,2021-01-03 21:18:05
4,4,1345840672113373186,0,1345840672113373186_0.jpg,You can fool some of the people all of the tim...,https://pbs.twimg.com/media/Eq1i03_W4AANqWe.png,../Datasets/img_dataset_mentions/CDU_CSU/Armin...,CDU_CSU,ArminLaschet,2021-01-03 21:13:15
...,...,...,...,...,...,...,...,...,...,...
34136,34136,1438112331666530313,1,1438112331666530313_1.jpg,,https://pbs.twimg.com/media/E_UzixMXoAg5Bay.jpg,../Datasets/img_dataset_mentions/LINKE/b_riexi...,LINKE,b_riexinger,2021-09-15 13:07:53
34137,34137,1438111715409936387,0,1438111715409936387_0.jpg,,https://pbs.twimg.com/media/E_Uy-8FWYAUkFY5.jpg,../Datasets/img_dataset_mentions/LINKE/b_riexi...,LINKE,b_riexinger,2021-09-15 13:05:26
34138,34138,1444703262951759880,0,1444703262951759880_0.jpg,Zusammenhalten.\nFür Gerechtigkeit.\nDIE LINKE.\n,https://pbs.twimg.com/media/FAyahiCXoAI6Kzy.jpg,../Datasets/img_dataset_mentions/LINKE/b_riexi...,LINKE,b_riexinger,2021-10-03 17:37:53
34139,34139,1446709991805751297,0,1446709991805751297_0.jpg,Das umgekehrte Bild bei der\n\nLinken: Sie ver...,https://pbs.twimg.com/media/FBO-9t7WEAkN-fA.jpg,../Datasets/img_dataset_mentions/LINKE/b_riexi...,LINKE,b_riexinger,2021-10-09 06:31:55


In [6]:
def get_n_tokens_for_text(extracted_text):
    if isinstance(extracted_text, float) and math.isnan(extracted_text):
        return 0
    else:
        return len(nltk.word_tokenize(str(extracted_text)))

In [7]:
n_images_total = 0
n_tokens_total = 0
n_images_with_text_total = 0
for party in PARTIES:
    n_tokens_party = 0
    n_images_party = 0
    n_images_with_text_party = 0
    
    df_party = df[df["source_party"] == party]
                
    for index, row in df_party.iterrows():
        n_tokens_in_image = get_n_tokens_for_text(row['extracted_text'])
        n_tokens_party += n_tokens_in_image
        if n_tokens_in_image == 0:
            n_images_with_text_party += 1
            n_images_with_text_total += 1
            
                
    n_images_party = df_party.shape[0]
    n_images_total += n_images_party
    n_tokens_total += n_tokens_party
    print(f'Total #Images in corpus for {party}: {n_images_party}')
    print(f'Total #Images in corpus for {party} with text in image: {n_images_with_text_party}')
    print(f'#Tokens in corpus for {party}: {n_tokens_party}')
    print(f'#Avg text length for images of {party}: {round(n_tokens_party / n_images_party, 2)}')
    print("=================================== \n")

print("\n")
print(f'Total #Images in corpus: {n_images_total}')
print(f'Total #Images in corpus with text in image: {n_images_with_text_total}')
print(f'#Tokens in corpus: {n_tokens_total}')
print(f'#Avg text length for images in corpus: {round(n_tokens_total / n_images_total, 2)}')

Total #Images in corpus for CDU_CSU: 11678
Total #Images in corpus for CDU_CSU with text in image: 2762
#Tokens in corpus for CDU_CSU: 644981
#Avg text length for images of CDU_CSU: 55.23

Total #Images in corpus for SPD: 10387
Total #Images in corpus for SPD with text in image: 2154
#Tokens in corpus for SPD: 672867
#Avg text length for images of SPD: 64.78

Total #Images in corpus for AFD: 2548
Total #Images in corpus for AFD with text in image: 625
#Tokens in corpus for AFD: 130901
#Avg text length for images of AFD: 51.37

Total #Images in corpus for FDP: 3818
Total #Images in corpus for FDP with text in image: 760
#Tokens in corpus for FDP: 225916
#Avg text length for images of FDP: 59.17

Total #Images in corpus for GRUENE: 3932
Total #Images in corpus for GRUENE with text in image: 986
#Tokens in corpus for GRUENE: 225889
#Avg text length for images of GRUENE: 57.45

Total #Images in corpus for LINKE: 1778
Total #Images in corpus for LINKE with text in image: 483
#Tokens in corp