In [None]:
import os
import random
from datetime import datetime
from pathlib import Path

import matplotlib.dates as mdates
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import pandas_profiling as pp
import seaborn as sns
from scipy.optimize import curve_fit

# All input/output folders live under "processed" in the notebook's working directory.
script_dir = Path.cwd()
processed_json_dir = os.path.join(script_dir, "processed/json")
processed_csv_dir = os.path.join(script_dir, "processed", "csv")
processed_stats_dir = os.path.join(script_dir, "processed/stats")
processed_plots_dir = os.path.join(script_dir, "processed/plots")

In [None]:
# Read the semicolon-separated comments.csv from processed/csv.
# NOTE: on_bad_lines="skip" silently drops rows the parser cannot handle.
comments_csv = os.path.join(processed_csv_dir, "comments.csv")
comments_df = pd.read_csv(comments_csv, sep=";", encoding="utf-8", on_bad_lines="skip")

In [None]:
# Preview the first five rows of the comments dataframe
print(comments_df.head(n=5))

In [None]:
# Print a concise summary of the comments dataframe (columns, non-null counts, dtypes, memory usage)
comments_df.info()

In [None]:
# Keep only the rows whose sentiment_positive_count and sentiment_negative_count
# add up to more than 5. Rows where both counts are 0 have a sum of 0, so this
# single test removes them as well.
# .copy() makes the result an independent frame, so the columns added in later
# cells are not written to a slice of the original (SettingWithCopyWarning).
comments_df = comments_df[
	(comments_df["sentiment_positive_count"] + comments_df["sentiment_negative_count"]) > 5
].copy()
comments_df.info()

In [None]:
# Parse the "date" strings (format "%Y-%m-%d_%H-%M") into datetime64 values.
# pd.to_datetime converts the whole column in one vectorised call, keeps missing
# values as NaT instead of raising, and is safe to re-run on an already parsed column.
comments_df["date"] = pd.to_datetime(comments_df["date"], format="%Y-%m-%d_%H-%M")
comments_df.info()

In [None]:
# Add column date_day: the "date" timestamp truncated to midnight, kept as a
# datetime value (not a string) so the rows can be grouped per calendar day.
comments_df["date_day"] = comments_df["date"].dt.normalize()
comments_df.info()

In [None]:

# Add column date_sentiment: every row receives the mean "sentiment" of all rows
# that share its date_day.
sentiment_by_day = comments_df.groupby("date_day")["sentiment"]
comments_df["date_sentiment"] = sentiment_by_day.transform("mean")
comments_df.info()

In [None]:
# Collapse the comments to one row per date_day holding the daily mean of the
# sentiment column and of the three sentiment_*_count columns.
comments_df_agg = (
	comments_df
	.groupby("date_day")
	.agg(
		{
			"sentiment": "mean",
			"sentiment_positive_count": "mean",
			"sentiment_negative_count": "mean",
			"sentiment_word_count": "mean",
		}
	)
	.reset_index()  # turn the date_day index back into a regular column
)
comments_df_agg.info()
print(comments_df_agg.head())

In [None]:
# Add column date_year: the year of date_day as a "YYYY" string
comments_df_agg["date_year"] = comments_df_agg["date_day"].dt.strftime("%Y")
# Add column date_month: the month of date_day as a "YYYY-MM" string
comments_df_agg["date_month"] = comments_df_agg["date_day"].dt.strftime("%Y-%m")
# Add column year_day: date_day with the same month and day but the year set to
# 2000, so that all years share one x-axis when they are superimposed.
# The year is replaced directly instead of going through the day-of-year ("%j"):
# 2000 is a leap year, so day 60 of a non-leap year would become Feb 29 rather
# than Mar 1, shifting every later date by one day against markers that are
# positioned with replace(year=2000).
comments_df_agg["year_day"] = comments_df_agg["date_day"].apply(lambda x: x.replace(year=2000))
comments_df_agg.info()
print(comments_df_agg.head())

In [None]:
# Build one dataframe per date_year value, in order of first appearance of the year
comments_df_agg_list = [year_frame for _, year_frame in comments_df_agg.groupby("date_year", sort=False)]
print(f"Number of years: {len(comments_df_agg_list)}")
print(f"Dataframes: {comments_df_agg_list}")

In [None]:
# Named events, label -> date string in year-month-day order.
# Some months/days are not zero-padded; datetime.strptime(..., "%Y-%m-%d") accepts both forms.
key_points = {
	"PKP-1": "2020-03-29",
	"PKP-2": "2020-04-22",
	"PKP-3": "2020-05-19",
	"PKP-4": "2020-06-28",
	"PKP-5": "2020-09-23",
	"PKP-6": "2020-11-10",
	"PKP-7": "2020-12-19",
	"PKP-8": "2021-01-25",
	"PKP-9": "2021-07-14",
	"Razglasitev epidemije": "2020-03-12",
	"Preklic epidemije": "2020-05-31",
	# Label spelling aligned with "Razglasitev epidemije" above (was missing the "z")
	"Razglasitev epidemije 2": "2020-10-18",
	"Hitri antigenski testi": "2020-12-22",
	"Začetek cepljenja": "2020-12-27",
	"Lockdown 3. val (začetek)": "2021-4-1",
	"Lockdown 3. val (konec)": "2021-4-11",
	"Povratek v oranžno fazo": "2021-4-21",
	"Povratek v rumeno fazo": "2021-5-12",
	"Preklic epidemije 2": "2021-5-14",
	"Uvedba evropskega zdravstvenega potrdila": "2021-6-5",
	"Obvezni osebni dokument ob PCT potrdilu": "2021-11-8",
	"Obvezno testiranje za necepljene (3x tedensko)": "2021-11-15"
}

In [None]:

# Figure size in inches and the rolling-mean window (number of rows, i.e. aggregated days)
fig_width = 40
fig_height = 12  # 6 was also tried
window_size = 30  # 7 was also tried

# Superimpose all years on one matplotlib axes
fig, ax = plt.subplots(figsize=(fig_width, fig_height))
# Colour palette; one colour per year, reused cyclically if there are more years than colours
colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd", "#8c564b", "#e377c2", "#7f7f7f", "#bcbd22", "#17becf"]
# Remember which colour each year was drawn in so the key-point markers can reuse it
year_colors = {}
for i, comments_df_agg_year in enumerate(comments_df_agg_list):
	year = comments_df_agg_year["date_year"].iloc[0]
	year_color = colors[i % len(colors)]
	year_colors[year] = year_color
	# Raw daily mean (dashed, half transparent) ...
	ax.plot(comments_df_agg_year["year_day"], comments_df_agg_year["sentiment"], linestyle="--", color=year_color, label=year, alpha=0.5)
	# ... and the same series smoothed with a centred rolling mean (solid)
	ax.plot(comments_df_agg_year["year_day"], comments_df_agg_year["sentiment"].rolling(window=window_size, center=True).mean(), color=year_color, alpha=1)

# Draw key points as vertical lines
for key, value in key_points.items():
	v = datetime.strptime(value, "%Y-%m-%d")
	# Move the event onto the shared year-2000 axis
	v_plot = v.replace(year=2000)
	# Use the colour of the curve for the event's year; black if that year has no data
	col = year_colors.get(str(v.year), "black")
	ax.axvline(x=v_plot, color=col, label=key)
	# Vertical label next to the line, its top anchored at y = 0.125 (data coordinates)
	ax.text(v_plot, 0.125, f"{key} ({value})", rotation=90, color=col, ha="right", va="top", fontsize=12)

# One tick per month, labelled with the abbreviated month name.
# DateFormatter interprets matplotlib's own date numbers, so the labels stay correct
# whatever date epoch matplotlib uses (datetime.fromordinal only fits the old epoch).
ax.xaxis.set_major_locator(mdates.MonthLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("%b"))
ax.set_xlabel("Month")
ax.set_ylabel("Mean sentiment")
ax.set_title(f"Comments mean sentiment (years superimposed - window size: {window_size})")
ax.legend()

# Create processed_plots_dir if it does not exist yet
os.makedirs(processed_plots_dir, exist_ok=True)

# Save the plot as svg to processed_plots_dir (before plt.show(), which may release the figure)
plt.savefig(os.path.join(processed_plots_dir, "comments_sentiment_years_superimposed.svg"), format="svg")

# Show plot
plt.show()


In [None]:
# Save the final dataframe as a csv file to processed_csv_dir (done in the next cell)

In [None]:
# Write the per-day aggregate to processed_csv_dir as a UTF-8 CSV without the index column
superimposed_csv = os.path.join(processed_csv_dir, "comments_sentiment_years_superimposed.csv")
comments_df_agg.to_csv(superimposed_csv, encoding="utf-8", index=False)