# Analyze the article tags for 2022

In [None]:
from collections import Counter
from datetime import datetime
import pandas as pd
import numpy as np
import sqlite3
import seaborn as sns
import matplotlib.pyplot as plt
sns.set_theme(style="whitegrid")


## Load the data

In [None]:
# Read every article's timestamp and raw tag string from the local SQLite file.
conn = sqlite3.connect("news.db")
try:
    cursor = conn.cursor()
    cursor.execute("SELECT timestamp, tags FROM Tagesschau")
    results = cursor.fetchall()
finally:
    # Release the database handle even when the query fails; `results` is a
    # plain list and stays usable after the connection is closed.
    conn.close()

In [None]:
# Turn the fetched rows into a frame with parsed timestamps, ordered by time.
df_news = (
    pd.DataFrame(results, columns=['timestamp', 'tags'])
    .assign(timestamp=lambda frame: pd.to_datetime(frame['timestamp']))
    .sort_values("timestamp")
)

In [None]:
df_news

## Simple exploratory data analysis on tags

### Article frequency
* How many articles were published in 2022?
* How are the articles distributed throughout the year?
* Are there months where more articles are published than usual?

In [None]:
def _articles_per_period(freq, column_name):
    """Count articles per consecutive `freq`-sized timestamp bucket.

    Returns a one-column frame (named `column_name`) indexed by bucket.
    """
    buckets = df_news.groupby(pd.Grouper(key="timestamp", freq=freq))
    return buckets.agg({'tags': 'count'}).rename(columns={'tags': column_name})


df_articles_per_year = _articles_per_period("1Y", 'number of articles per year')
df_articles_per_month = _articles_per_period("1M", 'number of articles per month')
df_articles_per_week = _articles_per_period("1W", 'number of articles per week')

print(df_articles_per_year)

In [None]:
fig, ax = plt.subplots(figsize=(10, 8), nrows=2, ncols=1, sharex=True)
year_start, year_end = datetime(2022, 1, 1), datetime(2022, 12, 31)

# Top panel: articles per month; bottom panel: articles per week.
# Each panel gets the count curve plus a dashed line at its mean.
for panel, counts in zip(ax, (df_articles_per_month, df_articles_per_week)):
    sns.lineplot(data=counts, color='C0', ax=panel)
    panel.hlines(
        y=counts.mean(),
        xmin=year_start,
        xmax=year_end,
        colors='C1',
        label='Average number of published articles',
        linestyles='dashed',
    )
    panel.legend()


### Tags
* What are the unique tags used in 2022?
* How many unique tags are used in 2022?
* What are the top tags used in 2022?
* What are the top tags used per month?
* How is the frequency of the top tags distributed over the year?
* How many tags are used per article?

In [None]:
# Split each article's comma-separated tag string into a list of tags.
# NOTE(review): pieces are not whitespace-stripped, and an empty tag string
# yields [''] (one empty tag) rather than [] — confirm the stored format.
df_news['tags_splitted'] = df_news['tags'].apply(lambda x: x.split(','))

In [None]:
df_news['tags_splitted']

In [None]:
# How many tags are used per article?
df_news['num_tags'] = df_news['tags_splitted'].apply(len)

In [None]:
# plot distribution for number of tags per article
fig, ax = plt.subplots()
sns.countplot(data=df_news, x='num_tags', color='C0', ax=ax)

In [None]:

# Flatten the per-article tag lists into one long list (duplicates kept).
all_tags = [
    single_tag
    for article_tags in df_news['tags_splitted']
    for single_tag in article_tags
]

In [None]:
# What are the unique tags used in 2022?
unique_tags = sorted(set(all_tags))
# Slice [:10] so the output really contains ten tags, as the label says.
print("The first 10 tags: ", unique_tags[:10])

# How many unique tags are used in 2022?
print(f"The number of unique tags in 2022: {len(unique_tags)}")

In [None]:
# What are the top tags used in 2022?
from collections import Counter
# https://note.nkmk.me/en/python-collections-counter/
tags_count_per_year = Counter(all_tags)

In [None]:
# What are the top tags used per month?
# Group articles into monthly buckets; aggregating the list-valued column with
# 'sum' is relied on here to concatenate the per-article tag lists of a month.
df_tags_per_month = df_news.groupby(pd.Grouper(key="timestamp", freq="1M")).agg({'tags_splitted': 'sum'}).rename(columns={'tags_splitted': 'all tags'})
# Count how often each tag occurs within the month.
df_tags_per_month['tags frequency'] = df_tags_per_month['all tags'].apply(Counter)
top_n = 10
# Keep the top_n (tag, count) pairs per month, most frequent first.
df_tags_per_month['most frequently used tags'] = df_tags_per_month['tags frequency'].apply(lambda c: c.most_common(top_n))

In [None]:
df_tags_per_month

In [None]:
# How are the top tags distributed over the year?
# This cell plots a single month only: the row at position 11 of
# df_tags_per_month (the 12th monthly bucket — presumably December 2022;
# verify against the index). It raises IndexError with fewer than 12 buckets.
sns.barplot(
    data=pd.DataFrame(df_tags_per_month.iloc[11]["most frequently used tags"], columns=['tag', 'count']),
    y='tag',
    x='count',
    color='C0'
)

In [None]:
# Bar chart of the most frequent tags over the whole data set.
top_n_overall = 20
most_common_overall = tags_count_per_year.most_common(top_n_overall)
df_top_n_tags = pd.DataFrame(most_common_overall, columns=['tag', 'count'])
sns.barplot(data=df_top_n_tags, y='tag', x='count', color='C0')

In [None]:
# Temporal distribution of tag occurrence
# For every overall top tag, count per week how many articles carry that tag.
# The weekly Series are collected first and concatenated once, instead of
# growing a frame inside the loop (which re-copies all columns each iteration).
tags_by_time = df_news.set_index('timestamp')['tags_splitted']
weekly_tag_counts = []
for tag in df_top_n_tags['tag']:
    weekly_tag_counts.append(
        tags_by_time
        .apply(lambda article_tags, tag=tag: 1 if tag in article_tags else 0)
        .groupby(pd.Grouper(freq="1W"))
        .sum()
        .rename(tag)
    )

# pd.concat rejects an empty list, so fall back to an empty frame in that case.
df_tags_occurrence = (
    pd.concat(weekly_tag_counts, axis=1) if weekly_tag_counts else pd.DataFrame()
)

In [None]:
df_tags_occurrence

In [None]:
# Normalise each tag column by its own total, so every column sums to 1 and
# the curves show each week's share of that tag's occurrences.
df_tags_rel_occurrence = df_tags_occurrence / df_tags_occurrence.sum()
# Plot three hand-picked tags; each must be a column of df_tags_occurrence
# (i.e. among the overall top tags), otherwise the selection raises KeyError.
sns.lineplot(data=df_tags_rel_occurrence[["Coronavirus", "Energiekrise", "Ukraine-Krieg"]])

In [None]:
# https://medium.com/@szabo.bibor/how-to-create-a-seaborn-correlation-heatmap-in-python-834c0686b88e
df_tag_correlation_pearson = df_tags_occurrence.corr(method='pearson')
df_tag_correlation_spearman = df_tags_occurrence.corr(method='spearman')

def plot_correlation(df_tag_correlation, method):
    """Draw an annotated heatmap of a tag correlation matrix.

    Args:
        df_tag_correlation: square correlation DataFrame (tags x tags).
        method: name of the correlation method; used only for the title.
    """
    plt.figure(figsize=(16, 6))
    # Mask the diagonal and everything above it: the matrix is symmetric, so
    # only the lower triangle carries information.
    hidden_cells = np.triu(np.ones_like(df_tag_correlation, dtype=bool))
    axes = sns.heatmap(
        df_tag_correlation,
        mask=hidden_cells,
        vmin=-1,
        vmax=1,
        cmap='RdBu',
        annot=True,
        fmt='.2f',
        cbar=False,
    )
    axes.set_title(f'{method.title()} correlation heatmap')

In [None]:
plot_correlation(df_tag_correlation_pearson, method='pearson')
plot_correlation(df_tag_correlation_spearman, method='spearman')

In [None]:
# https://www.statology.org/numpy-get-indices-where-true/
# https://stackoverflow.com/questions/17778394/list-highest-correlation-pairs-from-a-large-correlation-matrix-in-pandas
def get_correlation_pairs(df_tag_correlation):
    """Return the correlation of every unordered tag pair once, highest first.

    Only cells strictly above the diagonal are kept, so self-correlations and
    mirrored duplicates are dropped. (Multiplying by a 0/1 mask, as before,
    left them in the result as 0.0 entries that sorted in between the real
    correlations.)

    Args:
        df_tag_correlation: square, symmetric correlation DataFrame.

    Returns:
        dict mapping ``(column_tag, row_tag)`` to the correlation value,
        ordered from highest to lowest (NaN values last).
    """
    num_tags = len(df_tag_correlation)
    # True strictly above the diagonal: one cell per unordered pair.
    strict_upper = np.triu(np.ones((num_tags, num_tags), dtype=bool), k=1)
    pairs = df_tag_correlation.unstack()
    # unstack() walks the matrix column by column, so flatten the mask in the
    # same (column-major) order before using it as a selector.
    pairs = pairs[strict_upper.ravel(order='F')]
    return pairs.sort_values(ascending=False).to_dict()

In [None]:
correlation_pairs_pearson = get_correlation_pairs(df_tag_correlation_pearson)
correlation_pairs_spearman = get_correlation_pairs(df_tag_correlation_spearman)

In [None]:
correlation_pairs_pearson

In [None]:
correlation_pairs_spearman

## Topic modeling for tags

In [None]:
from time import time

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation


In [None]:
def custom_analyzer(comma_separated_tags):
    """Split comma-separated tag strings into tags.

    A single string (what a vectorizer passes to an ``analyzer`` callable) is
    split into a flat list of tags. Previously a string was iterated character
    by character, which produced a list of one-element lists.

    Any other iterable is treated as a collection of tag strings and each
    element is split, giving a list of tag lists (the original behaviour).
    """
    if isinstance(comma_separated_tags, str):
        return comma_separated_tags.split(',')
    return [x.split(',') for x in comma_separated_tags]

In [None]:
data_samples = df_news['tags'].apply(lambda x: x.split(',')).tolist()

In [None]:
# Collect every tag occurrence across all articles into one flat list
# (duplicates kept); the tag -> integer mapping is built from it later.
all_tags = [
    single_tag
    for tags_in_article in data_samples
    for single_tag in tags_in_article
]

In [None]:
# What are the top tags used?
# NOTE(review): `df_tags_counts` is not assigned in any cell visible above, so
# this line raises NameError on a fresh run. Presumably it is a tag/count frame
# built from `all_tags` with columns ['tag', 'count'] — confirm and restore
# the cell that creates it.

top_tags = df_tags_counts['tag'].values

In [None]:
unique_tags = sorted(set(top_tags))

# Forward lookup tag -> position in the sorted tag list, kept under both names.
vocabulary = {tag: position for position, tag in enumerate(unique_tags)}
tag_to_index = dict(vocabulary)

# Reverse lookup position -> tag.
index_to_tag = dict(enumerate(unique_tags))

In [None]:
# vectorize the article tags with the vocabulary mapping
# example:
# [['Corona', 'Liveblog'], ['Frankreich', 'Corona']] --> [[1, 2], [3, 1]]

# Fixture for vectorize_tags: 'Test' is deliberately missing from the vocabulary.
test_input = ['Corona', 'Liveblog', 'Test']
test_output = [1, 2]
# Every index is an int ('Frankreich' was mistakenly the string '3').
test_vocabulary = {'Corona': 1, 'Liveblog': 2, 'Frankreich': 3}

def vectorize_tags(article_tags, vocabulary):
    """Translate an article's tags into their vocabulary indices.

    Tags that are missing from `vocabulary` are skipped; the order of the
    remaining tags is preserved.
    """
    indices = []
    for tag in article_tags:
        if tag in vocabulary:
            indices.append(vocabulary[tag])
    return indices
    

In [None]:
assert vectorize_tags(test_input, test_vocabulary) == test_output

In [None]:
vectorized_tags = [vectorize_tags(article_tags, tag_to_index) for article_tags in data_samples]

In [None]:
vectorized_tags

In [None]:
test_input = [[1, 2, 4], [2, 3, 1], [5, 4, 1]]
test_vocabulary_id = [0, 1, 2, 3, 4, 5]

In [None]:
import numpy as np

In [None]:
# [1, 2, 4] -> [0, 1, 1, 0, 1, 0]
# [2, 3, 1] -> [0, 1, 1, 1, 0, 0]

# 1. get first id in doc
# 2. increment position with 1 in target vector
# 3. get next id in doc and do 2. again

def count_vectorize(doc_ids, vocabulary):
    """Build a document-term count matrix from lists of tag indices.

    Args:
        doc_ids: sequence of documents, each an iterable of integer indices.
        vocabulary: sized collection with one entry per index (for example the
            tag -> index dict); only its length is used, so a list works too.

    Returns:
        Integer array of shape (len(doc_ids), len(vocabulary)) where cell
        [d, i] is the number of times index i occurs in document d.

    Raises:
        IndexError: if an index is outside the vocabulary size.
    """
    num_docs = len(doc_ids)
    vocabulary_size = len(vocabulary)
    vectors = np.zeros((num_docs, vocabulary_size), dtype=int)
    for row, doc in enumerate(doc_ids):
        # Increment one by one so repeated indices in a document are counted.
        for tag_index in doc:
            vectors[row, tag_index] += 1
    return vectors


In [None]:
doc_ids_test = [[1, 2], [2, 3], [0], []]
vocabulary_test = {'a':0, 'b':1, 'c':2, 'd':3}

count_vectorize(doc_ids_test, vocabulary_test)

In [None]:
count_vectorized = count_vectorize(vectorized_tags, tag_to_index)

In [None]:
# get tfidf results
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
tfidf = TfidfTransformer()

In [None]:
tfidf_vectorized = tfidf.fit_transform(count_vectorized)

In [None]:
# Reduce dimensionality
from sklearn.decomposition import TruncatedSVD

In [None]:
tsvd = TruncatedSVD(n_components=10)
tsvd.fit(tfidf_vectorized)

In [None]:
tsvd.explained_variance_ratio_

In [None]:
X_tsvd = tsvd.transform(tfidf_vectorized)

In [None]:
from sklearn.cluster import KMeans

In [None]:
kmeans = KMeans(n_clusters=5, max_iter=100, n_init=1)

In [None]:
kmeans.fit(X_tsvd)

In [None]:
cluster_ids, cluster_sizes = np.unique(kmeans.labels_, return_counts=True)

In [None]:
cluster_sizes

In [None]:
# Map the k-means centres from the reduced SVD space back to tag space, sort
# each centre's tag weights in descending order, and list the tags for the
# centre at row index 4. The list contains every tag, not just the top few.
[index_to_tag[index] for index in tsvd.inverse_transform(kmeans.cluster_centers_).argsort()[:, ::-1][4]]

In [None]:
kmeans.cluster_centers_

In [None]:
lda = LatentDirichletAllocation()
lda.fit(count_vectorized)

In [None]:
# Topic distribution for the last two articles. The TF-IDF matrix is named
# `tfidf_vectorized` (the previous `vectorized_tfidf` was never defined).
# NOTE(review): the model was fitted on `count_vectorized`; confirm whether
# raw counts should be passed here instead of TF-IDF weights.
lda.transform(tfidf_vectorized[-2:])

In [None]:
lda.components_.shape

In [None]:
topic = lda.components_[0]

In [None]:
topic

In [None]:
# Indices of the highest-weighted tags for this topic, strongest first.
# NOTE(review): the slice [:-20:-1] yields 19 indices, not 20 — confirm the
# intended count.
top_words = topic.argsort()[:-20:-1]

In [None]:
[index_to_tag[index] for index in top_words]

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# What tags are occurring together?

tag = 'Russland'

In [None]:
# Keep the tag list of every article that carries the selected tag.
relevant_tags = [
    article_tags for article_tags in data_samples if tag in article_tags
]
        

In [None]:
df_relevant_tags_counts = pd.Series(flatten(relevant_tags)).value_counts().sort_values(ascending=False).reset_index()
df_relevant_tags_counts.columns = ['tag', 'count']

In [None]:
df_relevant_tags_counts.head(10)

In [None]:
from gensim import utils

In [None]:
utils.simple_preprocess("This is a simple line")

In [None]:
import gensim.models

In [None]:
model = gensim.models.Word2Vec(sentences=data_samples, window=2, vector_size=100)

In [None]:
model.wv['EU']

In [None]:
# Draw one random tag occurrence. `all_tags_series` was never defined in the
# notebook, so build it from the flat list of all tags first.
all_tags_series = pd.Series(all_tags)
all_tags_series.sample(1)

In [None]:
# Word Mover's Distance between hand-picked single-tag "documents".
tag_pairs = [
    ("USA", "Homeoffice"),
    ("USA", "Biden"),
    ("USA", "Ukraine"),
    ("Krieg", "Ukraine"),
    ("Krieg", "Mondmission"),
    ("Russland", "Putin"),
    ("Impfung", "Putin"),
]
for first_tag, second_tag in tag_pairs:
    print(model.wv.wmdistance([first_tag], [second_tag]))

In [None]:
print(model.wv.most_similar(["Landtagswahl"], topn=100))

In [None]:
from sklearn.decomposition import IncrementalPCA    # inital reduction
from sklearn.manifold import TSNE                   # final reduction
import numpy as np                                  # array handling


def reduce_dimensions(model):
    """Project the model's embedding vectors to 2-D with t-SNE.

    Args:
        model: trained embedding model exposing ``wv.vectors`` and
            ``wv.index_to_key``.

    Returns:
        Tuple ``(x_vals, y_vals, labels)``: two lists of coordinates and a
        numpy array of the matching vocabulary entries.
    """
    target_dims = 2  # dimensionality of the projection (2D, 3D, etc)

    # Pull the embedding matrix and its vocabulary out as numpy arrays.
    embedding_matrix = np.asarray(model.wv.vectors)
    labels = np.asarray(model.wv.index_to_key)  # fixed-width numpy strings

    # A fixed random_state keeps the t-SNE layout reproducible.
    projector = TSNE(n_components=target_dims, random_state=0)
    projected = projector.fit_transform(embedding_matrix)

    x_vals = list(projected[:, 0])
    y_vals = list(projected[:, 1])
    return x_vals, y_vals, labels


x_vals, y_vals, labels = reduce_dimensions(model)

def plot_with_plotly(x_vals, y_vals, labels, plot_in_notebook=True):
    """Render the labels as a text scatter plot with plotly.

    With `plot_in_notebook` true the figure is shown inline; otherwise it is
    written to 'word-embedding-plot.html'.
    """
    # Imported lazily so plotly is only needed when this plotter is used.
    from plotly.offline import init_notebook_mode, iplot, plot
    import plotly.graph_objs as go

    data = [go.Scatter(x=x_vals, y=y_vals, mode='text', text=labels)]

    if not plot_in_notebook:
        plot(data, filename='word-embedding-plot.html')
        return

    init_notebook_mode(connected=True)
    iplot(data, filename='word-embedding-plot')


def plot_with_matplotlib(x_vals, y_vals, labels):
    """Scatter-plot the 2-D points and annotate a random sample of them.

    Up to 25 points are labelled. With fewer than 25 points all of them are
    labelled (previously this raised ValueError from random.sample).
    """
    import matplotlib.pyplot as plt
    import random

    # Fixed seed so the same points are annotated on every run.
    random.seed(0)

    plt.figure(figsize=(12, 12))
    plt.scatter(x_vals, y_vals)

    #
    # Label randomly subsampled data points (at most 25)
    #
    indices = list(range(len(labels)))
    num_annotated = min(25, len(indices))
    selected_indices = random.sample(indices, num_annotated)
    for i in selected_indices:
        plt.annotate(labels[i], (x_vals[i], y_vals[i]))

# Pick the plotting backend: plotly inside IPython/Jupyter, matplotlib otherwise.
# Outside IPython the name `get_ipython` does not exist, so the call raises
# NameError; only that case should select the fallback.
try:
    get_ipython()
except NameError:
    plot_function = plot_with_matplotlib
else:
    plot_function = plot_with_plotly

plot_function(x_vals, y_vals, labels)