# Making example Data based on scraped Data

The example data is needed to make the stacked bar chart based on the articles. Here, we assign random classes to the articles.

In [44]:
import pandas as pd
import numpy as np

In [53]:
# Fix NumPy's global seed so the random Class assignment further down is reproducible
np.random.seed(42)

# Load the initial CSV (semicolon-separated). It was made manually from the
# scraped data by keeping only the relevant columns.
df = pd.read_csv("title_link.csv", sep=";")

In [54]:
# Extract the Medium (outlet name) from the link.

# Optional scheme ("http://" / "https://") and optional "www." at the start of the link
pattern = r"^(?:https?://)?(?:www\.)?"

# Domain stem -> display name. Stems without an entry are kept unchanged (see fillna below).
mapping = {
    "jungefreiheit": "Junge Freiheit",
    "nd-aktuell": "nd",
    "jungewelt": "Junge Welt",
    "tagesschau": "tagesschau",
    "jacobin": "Jacobin",
    "taz": "taz",
    "tichyseinblick": "Tichys Einblick"
}

# 1) Strip the scheme / "www." prefix
df['Medium'] = df['link'].str.replace(pattern, '', regex=True)
# 2) Keep everything before the first literal ".de". The dot must be escaped:
#    pandas treats a split pattern longer than one character as a regular
#    expression, so a bare '.de' would also split on any character followed
#    by "de" (e.g. "/de" or "ade").
df['Medium'] = df['Medium'].str.split(r'\.de', n=1, expand=True)[0]
# 3) Translate known stems to their display name, falling back to the raw stem
df['Medium'] = df['Medium'].map(mapping).fillna(df['Medium'])

In [55]:
# Make random placeholder Classes
political_leanings = ["left", "left-center", "center", "right-center", "right"]

# Draw from a generator that is seeded right here instead of from NumPy's global
# random state. Re-running this cell on its own then always assigns the same
# classes. RandomState(42) produces the same draws as calling np.random.seed(42)
# directly followed by np.random.choice(...).
rng = np.random.RandomState(42)
random_leanings = rng.choice(political_leanings, size=len(df))
df['Class'] = random_leanings

# Check distribution (not necessary for functionality)
print("\nDistribution of 'political_leaning' values:")
print(df['Class'].value_counts())


Distribution of 'political_leaning' values:
right-center    10
center           9
right            7
left-center      5
left             4
Name: Class, dtype: int64


In [56]:
# Add HTML-syntax to make the links clickable: each title is wrapped in an
# anchor that opens the article link in a new tab.
# Built by iterating the two columns in parallel instead of a row-wise
# DataFrame.apply(axis=1): same strings, no per-row Series construction, and
# no dependence on how apply infers the shape of its result.
# NOTE(review): link and title are inserted unescaped. If the scraped data can
# contain quotes or angle brackets, consider html.escape - TODO confirm with
# whatever renders this column.
df['clickable_title'] = [
    f"<a href='{link}' target='_blank'>{title}</a>"
    for link, title in zip(df['link'], df['title'])
]

## Everything above is always the same, regardless of the classes. The code below depends on the class values

In [57]:
# Drop the columns that are no longer needed now that the clickable link exists
df = df.drop(columns=['title', 'link'])

# Combine the clickable titles to have one row for each Medium-Class
df = df.groupby(['Medium', 'Class'])['clickable_title'].agg(list).reset_index()

# Calculate the percentage of each Class within its Medium.
# The denominator is the actual number of articles of that Medium rather than a
# fixed constant, so the shares of a Medium always add up to 100 even when the
# media do not all have the same number of articles.
articles_in_class = df['clickable_title'].str.len()  # length of each title list
articles_per_medium = articles_in_class.groupby(df['Medium']).transform('sum')
df['Value'] = articles_in_class / articles_per_medium * 100
# Same number as a display string with one decimal, e.g. "20.0%"
df['Percentage'] = df['Value'].map("{:.1f}%".format)

In [58]:
def format_titles(titles_list):
    """
    Prefix every title except the first one with an HTML line break.

    Parameters
    ----------
    titles_list : list of str
        Titles in the order they should appear.

    Returns
    -------
    list of str
        A new list of the same length: the first title unchanged, every
        following title prefixed with "<br>". An empty (or otherwise falsy)
        input gives an empty list.
    """
    # Nothing to format
    if not titles_list:
        return []

    return [
        entry if position == 0 else f"<br>{entry}"
        for position, entry in enumerate(titles_list)
    ]

In [59]:
# Collapse each list of clickable titles into a single HTML string; format_titles
# puts a <br> in front of every title after the first, so joining with "" is enough
df['clickable_title'] = df['clickable_title'].apply(
    lambda titles: "".join(format_titles(titles))
)

# Sort by Medium, then by Class in the order given by political_leanings,
# and renumber the rows from 0
df['Class'] = pd.Categorical(df['Class'], categories=political_leanings, ordered=True)
df = df.sort_values(by=['Medium', 'Class']).reset_index(drop=True)

In [60]:
# Display the resulting frame for a visual check (one row per Medium-Class pair)
df

Unnamed: 0,Medium,Class,clickable_title,Value,Percentage
0,Jacobin,left,<a href='https://www.jacobin.de/artikel/merz-w...,20.0,20.0%
1,Jacobin,center,<a href='https://www.jacobin.de/artikel/ilan-p...,20.0,20.0%
2,Jacobin,right-center,<a href='https://www.jacobin.de/artikel/elon-m...,20.0,20.0%
3,Jacobin,right,<a href='https://www.jacobin.de/artikel/fortsc...,40.0,40.0%
4,Junge Freiheit,left,<a href='https://jungefreiheit.de/politik/deut...,20.0,20.0%
5,Junge Freiheit,left-center,<a href='https://jungefreiheit.de/politik/deut...,20.0,20.0%
6,Junge Freiheit,right-center,<a href='https://jungefreiheit.de/politik/deut...,40.0,40.0%
7,Junge Freiheit,right,<a href='https://jungefreiheit.de/politik/deut...,20.0,20.0%
8,Junge Welt,left-center,<a href='https://www.jungewelt.de/artikel/5008...,20.0,20.0%
9,Junge Welt,center,<a href='https://www.jungewelt.de/artikel/5008...,40.0,40.0%


In [61]:
# Sanity check: print the combined HTML string stored in the row labelled 0
print(df.loc[0, "clickable_title"])

<a href='https://www.jacobin.de/artikel/merz-wirtschaft-arbeit-gewerkschaften-agenda2010-arbeitszeit-teilzeit-klassenkampf' target='_blank'>Nein, die Menschen in Deutschland arbeiten nicht zu wenig</a>


In [43]:
# Save the example data as CSV in the parent of the current working directory;
# index=False keeps the row index out of the file
df.to_csv("../articleDataExample.csv", index=False)