# Notebook: Analyse Image Corpus

This notebook analyses the crawled corpus of images.

## Packages

In [1]:
import pandas as pd
import nltk
import math
import os

In [2]:
# `nltk.word_tokenize` needs the Punkt sentence-tokenizer data. NLTK >= 3.9
# looks that data up as "punkt_tab" instead of the pickled "punkt" package,
# so both are requested to keep the notebook runnable on old and new NLTK.
# NOTE(review): on an NLTK whose index lacks one of the names, `download`
# is expected to log an error and return False rather than raise - confirm.
for punkt_resource in ("punkt", "punkt_tab"):
    nltk.download(punkt_resource)

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/nils_hellwig/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## Parameters

To analyse the other dataset, swap `/img_dataset_political_accounts` for `/img_dataset_mentions` (or vice versa) in the path below.

In [3]:
# Directory (relative to this notebook) that holds `images_dataset.csv`.
# The trailing slash matters: file names are appended by plain concatenation.
PHOTOS_POLITICAL_ACCOUNTS_PATH = "../Datasets/img_dataset_political_accounts/"

# Party labels, as they appear in the `source_party` column, in report order.
PARTIES = [
    "CDU_CSU",
    "SPD",
    "AFD",
    "FDP",
    "GRUENE",
    "LINKE",
]

## Code

### 1. Create Statistics for each Party

In [4]:
# Read `images_dataset.csv` from the configured dataset directory.
csv_path = PHOTOS_POLITICAL_ACCOUNTS_PATH + "images_dataset.csv"
df = pd.read_csv(csv_path)

# List the distinct party labels present in the `source_party` column.
pd.unique(df["source_party"])

array(['AFD', 'CDU_CSU', 'LINKE', 'GRUENE', 'FDP', 'SPD'], dtype=object)

In [5]:
# Preview the loaded frame; as the cell's last expression it is rendered by Jupyter.
df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,tweet_id,image_index,filename,extracted_text,url,image_path,source_party,source_account,date
0,0,0,1377157230630301700,0,1377157230630301700_0.jpg,"Behördenwillkür\n\n\C\n\nD Es wird Zeit, den B...",https://pbs.twimg.com/media/ExylKvEU8AgowSU?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,AfDBerlin,2021-03-31 07:14:04
1,1,1,1377486425499832320,0,1377486425499832320_0.jpg,,https://pbs.twimg.com/media/Ev5cqNvWgAYX_Fl?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,AfDBerlin,2021-04-01 05:02:10
2,2,2,1379754099609047040,0,1379754099609047040_0.jpg,IREBSSWEE Folgen\n\n* So geht es nicht weiter!...,https://pbs.twimg.com/media/EyXfBlTWEAEfrHg?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,AfDBerlin,2021-04-07 11:13:06
3,3,3,1381254353336606721,0,1381254353336606721_0.jpg,6% Die Landesvorsitzende der AfD\nBerlin kriti...,https://pbs.twimg.com/media/EyszgrWXEAEqXID?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,AfDBerlin,2021-04-11 14:34:34
4,4,4,1381863885666406400,0,1381863885666406400_0.jpg,,https://pbs.twimg.com/media/Ey1d1VqWQAACVNs?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,AfDBerlin,2021-04-13 06:56:38
...,...,...,...,...,...,...,...,...,...,...,...
21954,21954,21954,1350695518536683524,0,1350695518536683524_0.jpg,Deutsches\nRotes\nKreuz\n\nServiceportal zur I...,https://pbs.twimg.com/media/Er6iaV7XAAIkOIp?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,Tino_Chrupalla,2021-01-17 06:44:40
21955,21955,21955,1351083430218493953,0,1351083430218493953_0.jpg,Deutschland droht der Mega-Lockdown -\n\nobwoh...,https://pbs.twimg.com/media/EsADN8IXAAAnR7o?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,Tino_Chrupalla,2021-01-18 08:26:06
21956,21956,21956,1352556008934137858,0,1352556008934137858_0.jpg,‚ . Die verlorene\nSchülergeneration\n\n,https://pbs.twimg.com/media/EsU-hOsXYAAQyIO?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,Tino_Chrupalla,2021-01-22 09:57:36
21957,21957,21957,1462488643638108170,0,1462488643638108170_0.jpg,zur Impfpflicht! B |\n\nTino Chrupalla Bundess...,https://pbs.twimg.com/media/FEvNqgDWUAUD_5i?fo...,../Datasets/img_dataset_political_accounts/AFD...,AFD,Tino_Chrupalla,2021-11-21 18:30:39


In [6]:
def get_n_tokens_for_text(extracted_text, language="english"):
    """Count the word tokens in the text extracted from one image.

    Parameters
    ----------
    extracted_text : str, float or None
        One value of the ``extracted_text`` column. ``None`` and float NaN
        are treated as "no text in the image".
    language : str, optional
        Punkt model name forwarded to ``nltk.word_tokenize``. The default
        ``"english"`` is what ``nltk.word_tokenize`` uses when no language is
        given, so existing counts are unchanged. The corpus text is German,
        so ``"german"`` may be the more appropriate model.

    Returns
    -------
    int
        Number of tokens produced by ``nltk.word_tokenize``; ``0`` for
        missing, empty or whitespace-only text.
    """
    # Missing values: without the explicit None check, str(None) would be
    # tokenised as the single word "None" and counted as text.
    if extracted_text is None:
        return 0
    if isinstance(extracted_text, float) and math.isnan(extracted_text):
        return 0

    text = str(extracted_text)
    # Blank text yields no tokens anyway; skip the tokenizer call entirely.
    if not text.strip():
        return 0
    return len(nltk.word_tokenize(text, language=language))

In [7]:
# Tokenise every extracted text once; the per-party figures below are then
# plain sums over boolean masks instead of a Python-level `iterrows()` loop.
n_tokens_per_image = df["extracted_text"].map(get_n_tokens_for_text)

n_images_total = 0
n_tokens_total = 0
n_images_with_text_total = 0
for party in PARTIES:
    party_mask = df["source_party"] == party
    df_party = df[party_mask]
    n_tokens_per_image_party = n_tokens_per_image[party_mask]

    n_images_party = df_party.shape[0]
    n_tokens_party = int(n_tokens_per_image_party.sum())
    # An image counts as "with text" when at least one token was found.
    n_images_with_text_party = int((n_tokens_per_image_party > 0).sum())

    n_images_total += n_images_party
    n_tokens_total += n_tokens_party
    n_images_with_text_total += n_images_with_text_party

    # A party listed in PARTIES may have no rows in this dataset; report an
    # average of 0.0 instead of raising ZeroDivisionError.
    avg_tokens_party = round(n_tokens_party / n_images_party, 2) if n_images_party else 0.0

    print(f'Total #Images in corpus for {party}: {n_images_party}')
    print(f'Total #Images in corpus for {party} with text in image: {n_images_with_text_party}')
    print(f'#Tokens in corpus for {party}: {n_tokens_party}')
    print(f'#Avg text length for images of {party}: {avg_tokens_party}')
    print("=================================== \n")

# Same guard for the corpus-wide average (empty frame or no matching party).
avg_tokens_total = round(n_tokens_total / n_images_total, 2) if n_images_total else 0.0

print("\n")
print(f'Total #Images in corpus: {n_images_total}')
print(f'Total #Images in corpus with text in image: {n_images_with_text_total}')
print(f'#Tokens in corpus: {n_tokens_total}')
print(f'#Avg text length for images in corpus: {avg_tokens_total}')

Total #Images in corpus for CDU_CSU: 4741
Total #Images in corpus for CDU_CSU with text in image: 2203
#Tokens in corpus for CDU_CSU: 73600
#Avg text length for images of CDU_CSU: 15.52

Total #Images in corpus for SPD: 3710
Total #Images in corpus for SPD with text in image: 1880
#Tokens in corpus for SPD: 91028
#Avg text length for images of SPD: 24.54

Total #Images in corpus for AFD: 4543
Total #Images in corpus for AFD with text in image: 2887
#Tokens in corpus for AFD: 123943
#Avg text length for images of AFD: 27.28

Total #Images in corpus for FDP: 1802
Total #Images in corpus for FDP with text in image: 912
#Tokens in corpus for FDP: 49495
#Avg text length for images of FDP: 27.47

Total #Images in corpus for GRUENE: 3382
Total #Images in corpus for GRUENE with text in image: 1706
#Tokens in corpus for GRUENE: 62678
#Avg text length for images of GRUENE: 18.53

Total #Images in corpus for LINKE: 3781
Total #Images in corpus for LINKE with text in image: 2229
#Tokens in corpus 

In [8]:
# Show every row whose `source_account` is "ArminLaschet".
is_armin_laschet = df["source_account"].eq("ArminLaschet")
df.loc[is_armin_laschet]

Unnamed: 0.1,Unnamed: 0,tweet_id,image_index,filename,extracted_text,url,image_path,source_party,source_account,date
1177,1177,1345785739691352065,0,1345785739691352065_0.jpg,#impulse2021\n\nFür ein innovatives\n\nund leb...,https://pbs.twimg.com/media/Eq0w_wsWMAI4HxJ?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-01-03 17:34:58
1178,1178,1345765744424984577,0,1345765744424984577_0.jpg,,https://pbs.twimg.com/media/Eq0ez02XUAYb1cN?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-01-03 16:15:31
1179,1179,1345765744424984577,0,1345765744424984577_0.jpg,,https://pbs.twimg.com/media/Eq0ez05W4AEBpHN?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-01-03 16:15:31
1180,1180,1345765744424984577,0,1345765744424984577_0.jpg,,https://pbs.twimg.com/media/Eq0ez1vWMAIjjhl?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-01-03 16:15:31
1181,1181,1346844883697471488,0,1346844883697471488_0.jpg,O GiscoWebex Meetings © Meeting-info Menüleist...,https://pbs.twimg.com/media/ErC31XrWMAIRcGD?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-01-06 15:43:38
...,...,...,...,...,...,...,...,...,...,...
1347,1347,1456583351100481538,0,1456583351100481538_0.jpg,Foto: Tobias Koch\n,https://pbs.twimg.com/media/FDbS1Y7XMAMnPn1?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-11-05 11:25:07
1348,1348,1458738019914104835,0,1458738019914104835_0.jpg,® | für kulturelle Angelegenheiten im R\nüber ...,https://pbs.twimg.com/media/FD56ftZWYAEkAOy?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-11-11 10:07:00
1349,1349,1466473999748288518,0,1466473999748288518_0.jpg,er\n\nbis 19.10 Uhr Eintreffen der Gäste\n\nau...,https://pbs.twimg.com/media/FFn2U8hXwAADGi9?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-12-02 18:27:01
1350,1350,1468500193926688774,0,1468500193926688774_0.jpg,H\n!\n8\n,https://pbs.twimg.com/media/FGEpJGBXsAMOYt4?fo...,../Datasets/img_dataset_politicians/CDU_CSU/Ar...,CDU_CSU,ArminLaschet,2021-12-08 08:38:24
