# Notebook: Download Images

This notebook downloads all images attached to the tweets and extracts the text they contain via OCR (Tesseract). The individual steps are explained below.
<br>**Contributors:** [Nils Hellwig](https://github.com/NilsHellwig/) 

## Packages

In [1]:
from io import BytesIO
from PIL import Image
import pandas as pd
import pytesseract
import requests
import json
import csv
import re
import os

## Parameters

In [2]:
# CSV file with the politicians' tweets (read in the "Load Dataset" step).
DATASET_PATH = "../Datasets/dataset_politicians/dataset.csv"
# Root directory for the downloaded images and the resulting image table.
PHOTOS_PATH = "../Datasets/img_dataset_politicians/"
# Party codes; each one becomes a sub-directory of PHOTOS_PATH.
PARTIES = ["CDU_CSU", "SPD", "AFD", "FDP", "GRUENE", "LINKE"]
# Location of the Tesseract executable. The Homebrew path stays the default;
# set the TESSERACT_PATH environment variable to run the notebook on a machine
# where Tesseract is installed elsewhere.
TESSERACT_PATH = os.environ.get("TESSERACT_PATH", "/opt/homebrew/bin/tesseract")

## Settings

In [3]:
# pytesseract reads the executable location from the `tesseract_cmd` variable of
# its inner `pytesseract.pytesseract` module. Assigning it on the top-level
# package only creates an unused attribute and leaves the default "tesseract"
# command in effect, so the submodule attribute has to be set.
pytesseract.pytesseract.tesseract_cmd = TESSERACT_PATH

## Code

### 1. Create new Directories

In [4]:
# Make sure every party has its own sub-directory below PHOTOS_PATH.
for party in PARTIES:
    # exist_ok=True turns a re-run into a no-op for directories that already
    # exist, while still raising if the path is occupied by a regular file.
    os.makedirs(os.path.join(PHOTOS_PATH, party), exist_ok=True)

### 2. Load Dataset

In [5]:
def load_politicians_dataset():
    """Load the tweet CSV from DATASET_PATH and bring it into the notebook's schema.

    Steps, in order:
      * rename the raw columns (e.g. "Embedded_text" -> "text",
        "Image link" -> "photos"),
      * reformat "date" as a "%Y-%m-%d %H:%M:%S" string,
      * strip leading "@" characters from "source_account",
      * drop the columns "Unnamed: 0" and "Unnamed: 0.1",
      * translate the party names into the notebook's party codes
        (names missing from the mapping become NaN),
      * extract the digits following "status/" in "Tweet URL" into "tweet_id".

    Returns:
        pandas.DataFrame: the cleaned tweet table.
    """
    # Raw CSV header -> column name used in this notebook.
    column_names = {
        "Embedded_text": "text",
        "UserName": "source_account",
        "Partei": "source_party",
        "sentiment_prediction": "sentiment",
        "Timestamp": "date",
        "Image link": "photos",
    }

    # Party name as written in the CSV -> party code.
    party_codes = {
        'AfD': 'AFD',
        'CDU/CSU': 'CDU_CSU',
        'DieLinke': 'LINKE',
        'Gruene': 'GRUENE',
        'FDP': 'FDP',
        'SPD': 'SPD',
    }

    tweets = (
        pd.read_csv(DATASET_PATH)
        .rename(columns=column_names)
        .assign(
            # Parse the timestamps, then store them back as formatted strings.
            date=lambda frame: pd.to_datetime(frame['date']).dt.strftime('%Y-%m-%d %H:%M:%S'),
            source_account=lambda frame: frame['source_account'].str.lstrip('@'),
        )
        # Remove the two columns by name.
        .drop(columns=["Unnamed: 0", "Unnamed: 0.1"])
        .assign(source_party=lambda frame: frame['source_party'].map(party_codes))
    )

    # The numeric id is the run of digits after "status/" in the tweet URL.
    tweets['tweet_id'] = tweets['Tweet URL'].str.extract(r'status/(\d+)')

    return tweets

# Load the normalised tweet table; the download step below iterates over `df`.
df = load_politicians_dataset()
# Bare last expression: the notebook renders the frame as the cell output.
df

Unnamed: 0,UserScreenName,source_account,date,Text,text,Emojis,Comments,Likes,Retweets,photos,Tweet URL,id,source_party,sentiment,tweet_id
0,AfD Berlin,AfDBerlin,2021-03-26 21:07:22,AfD Berlin\n@AfDBerlin\n·\n26. März,AfD wirkt.\n\nSchluss mit dem #Gendergaga\nMDR...,,4.0,28.0,132.0,['https://pbs.twimg.com/profile_images/1037343...,https://twitter.com/AfDBerlin/status/137555499...,1,AFD,2,1375554998461984769
1,AfD Berlin,AfDBerlin,2021-03-27 07:20:27,AfD Berlin\n@AfDBerlin\n·\n27. März,Im Herbst wird gewählt.\nSchluss mit den Recht...,,10.0,20.0,112.0,['https://pbs.twimg.com/card_img/1471780757332...,https://twitter.com/AfDBerlin/status/137570928...,2,AFD,2,1375709283850063876
2,AfD Berlin,AfDBerlin,2021-03-31 07:14:04,AfD Berlin\n@AfDBerlin\n·\n31. März,Behördenwillkür\nFlüchtlingsheime durchgedrück...,,3.0,13.0,34.0,['https://pbs.twimg.com/media/ExylKvEU8AgowSU?...,https://twitter.com/AfDBerlin/status/137715723...,3,AFD,1,1377157230630301700
3,AfD Berlin,AfDBerlin,2021-04-01 14:29:00,AfD Berlin\n@AfDBerlin\n·\n1. Apr.,Aus Raider wird jetzt Twix \n\nLeider kein #Ap...,,1.0,4.0,17.0,['https://pbs.twimg.com/card_img/1472910546907...,https://twitter.com/AfDBerlin/status/137762907...,4,AFD,2,1377629070662373376
4,AfD Berlin,AfDBerlin,2021-04-01 05:02:10,AfD Berlin\n@AfDBerlin\n·\n1. Apr.,Gendern geht („*innen“)\nImpfen geht nicht.\nD...,,1.0,3.0,16.0,['https://pbs.twimg.com/profile_images/9706413...,https://twitter.com/AfDBerlin/status/137748642...,5,AFD,1,1377486425499832320
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58859,Tino Chrupalla,Tino_Chrupalla,2021-12-04 17:26:46,Tino Chrupalla\n@Tino_Chrupalla\n·\n4. Dez. 2021,Friedlicher Protest gegen einen #Impfzwang ist...,,265.0,122.0,578.0,[],https://twitter.com/Tino_Chrupalla/status/1467...,61802,AFD,1,1467183612034433034
58860,Tino Chrupalla,Tino_Chrupalla,2021-12-13 16:30:19,Tino Chrupalla\n@Tino_Chrupalla\n·\n13. Dez. 2021,@OlafScholz\n muss sich endlich klar zu Nord S...,,4.0,2.0,11.0,[],https://twitter.com/Tino_Chrupalla/status/1470...,61803,AFD,0,1470430897199628298
58861,Tino Chrupalla,Tino_Chrupalla,2021-12-17 14:02:24,Tino Chrupalla\n@Tino_Chrupalla\n·\n17. Dez. 2021,Mit \n@_FriedrichMerz\n gibt es keine konserva...,,188.0,227.0,1.008,[],https://twitter.com/Tino_Chrupalla/status/1471...,61804,AFD,1,1471843224251740177
58862,Tino Chrupalla,Tino_Chrupalla,2021-12-19 09:27:23,Tino Chrupalla\n@Tino_Chrupalla\n·\n19. Dez. 2021,Wir wünschen Ihnen und Ihrer Familie einen bes...,,449.0,346.0,2.648,['https://pbs.twimg.com/media/FG9dwrcXIAUF5BP?...,https://twitter.com/Tino_Chrupalla/status/1472...,61805,AFD,0,1472498789122514945


### 3. Download all Images

In [6]:
# Column layout of the image table; passed explicitly so the frame has these
# columns even when no image could be downloaded.
image_columns = ['tweet_id', 'image_index', 'filename', 'extracted_text', 'url', 'image_path', 'source_party', 'source_account', 'date']
# Seconds to wait for the image server before giving up on a single download.
request_timeout = 30

# One dict per saved image. The DataFrame is built once after the loop:
# concatenating inside the loop copies the whole frame for every image.
image_records = []

for row in df.itertuples():
    # A missing photo list (e.g. NaN) is not a string and has nothing to download.
    if not isinstance(row.photos, str):
        continue
    # `photos` holds the string form of a list; pull out the quoted URLs.
    photo_links = re.findall(r"'(.*?)'", row.photos)

    # Per-tweet image counter. It has to be initialised once per tweet, not once
    # per link: resetting it for every link gives all images of a tweet the
    # index 0 and the same filename, so later images overwrite earlier ones.
    index = 0
    for link in photo_links:
        # Only URLs containing "/media/" are downloaded; other entries of the
        # list (profile_images, card_img) are skipped.
        if '/media/' not in link:
            continue
        # Drop the "&name=small" size parameter (presumably to fetch the
        # default rendition instead of the small one - TODO confirm).
        link = link.replace("&name=small", "")

        try:
            response = requests.get(link, timeout=request_timeout)
        except requests.RequestException as error:
            # A single unreachable image must not abort the whole run.
            print(f"Skipping {link}: {error}")
            continue
        if response.status_code != 200:
            continue

        try:
            image = Image.open(BytesIO(response.content))
            # Convert image mode to RGB if necessary before saving as .jpg.
            if image.mode != 'RGB':
                image = image.convert('RGB')
        except OSError as error:
            # Raised by PIL when the payload is not a readable image.
            print(f"Skipping {link}: {error}")
            continue

        filename = f"{row.tweet_id}_{index}.jpg"
        directory = os.path.join(PHOTOS_PATH, row.source_party, row.source_account)
        os.makedirs(directory, exist_ok=True)
        image_path = os.path.join(directory, filename)
        image.save(image_path)

        # OCR runs on the file as stored on disk; the context manager closes
        # the file handle again once the text has been extracted.
        with Image.open(image_path) as saved_image:
            text = pytesseract.image_to_string(saved_image, lang='deu')

        image_records.append({'tweet_id': row.tweet_id, 'image_index': index, 'filename': filename, 'extracted_text': text, 'url': link, 'image_path': image_path, 'source_party': row.source_party, 'source_account': row.source_account, 'date': row.date})

        # Count only images that were actually saved.
        index += 1

df_images = pd.DataFrame(image_records, columns=image_columns)



In [7]:
# Persist the image table next to the downloaded files. os.path.join keeps the
# path valid whether or not PHOTOS_PATH ends with a separator. The index is
# written as well (pandas default), so the CSV layout stays as before.
df_images.to_csv(os.path.join(PHOTOS_PATH, "images_dataset.csv"))

In [8]:
# Bare expression: the notebook renders the image table as the cell output.
df_images

Unnamed: 0,tweet_id,image_index,filename,extracted_text,url,image_path,source_party,source_account,date
0,1377157230630301700,0,1377157230630301700_0.jpg,"Behördenwillkür\n\n\C\n\nD Es wird Zeit, den B...",https://pbs.twimg.com/media/ExylKvEU8AgowSU?fo...,../Datasets/img_dataset_politicians/AFD/AfDBer...,AFD,AfDBerlin,2021-03-31 07:14:04
1,1377486425499832320,0,1377486425499832320_0.jpg,,https://pbs.twimg.com/media/Ev5cqNvWgAYX_Fl?fo...,../Datasets/img_dataset_politicians/AFD/AfDBer...,AFD,AfDBerlin,2021-04-01 05:02:10
2,1379754099609047040,0,1379754099609047040_0.jpg,IREBSSWEE Folgen\n\n* So geht es nicht weiter!...,https://pbs.twimg.com/media/EyXfBlTWEAEfrHg?fo...,../Datasets/img_dataset_politicians/AFD/AfDBer...,AFD,AfDBerlin,2021-04-07 11:13:06
3,1381254353336606721,0,1381254353336606721_0.jpg,6% Die Landesvorsitzende der AfD\nBerlin kriti...,https://pbs.twimg.com/media/EyszgrWXEAEqXID?fo...,../Datasets/img_dataset_politicians/AFD/AfDBer...,AFD,AfDBerlin,2021-04-11 14:34:34
4,1381863885666406400,0,1381863885666406400_0.jpg,,https://pbs.twimg.com/media/Ey1d1VqWQAACVNs?fo...,../Datasets/img_dataset_politicians/AFD/AfDBer...,AFD,AfDBerlin,2021-04-13 06:56:38
...,...,...,...,...,...,...,...,...,...
21954,1350695518536683524,0,1350695518536683524_0.jpg,Deutsches\nRotes\nKreuz\n\nServiceportal zur I...,https://pbs.twimg.com/media/Er6iaV7XAAIkOIp?fo...,../Datasets/img_dataset_politicians/AFD/Tino_C...,AFD,Tino_Chrupalla,2021-01-17 06:44:40
21955,1351083430218493953,0,1351083430218493953_0.jpg,Deutschland droht der Mega-Lockdown -\n\nobwoh...,https://pbs.twimg.com/media/EsADN8IXAAAnR7o?fo...,../Datasets/img_dataset_politicians/AFD/Tino_C...,AFD,Tino_Chrupalla,2021-01-18 08:26:06
21956,1352556008934137858,0,1352556008934137858_0.jpg,‚ . Die verlorene\nSchülergeneration\n\n,https://pbs.twimg.com/media/EsU-hOsXYAAQyIO?fo...,../Datasets/img_dataset_politicians/AFD/Tino_C...,AFD,Tino_Chrupalla,2021-01-22 09:57:36
21957,1462488643638108170,0,1462488643638108170_0.jpg,zur Impfpflicht! B |\n\nTino Chrupalla Bundess...,https://pbs.twimg.com/media/FEvNqgDWUAUD_5i?fo...,../Datasets/img_dataset_politicians/AFD/Tino_C...,AFD,Tino_Chrupalla,2021-11-21 18:30:39
