# To perform descriptive analysis using post-processed data

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np 
import seaborn as sns

In [None]:
%matplotlib qt
# Backend choice: `qt` opens plots in a separate interactive window; `inline` renders them inside the notebook.

In [None]:
# Load the post-processed dataset. Later cells read the 'cat', 'likesCount'
# and 'year' columns from `df`.
data_path = '~/code/amr/test_file/post_processed_data.xlsx'
df = pd.read_excel(data_path)

# Category counts

In [None]:
# Message counts per category, most frequent first (original note: 6 message
# categories, 2017-2023 -- TODO confirm against the data).
cat_sum = df['cat'].value_counts().sort_values(ascending=False)

# Each category's share of all messages, in percent, rounded to 2 decimals.
# Kept as a plain NumPy array because the plotting cell indexes it by position.
cat_counts = cat_sum.values
cat_percentage = (cat_counts / cat_counts.sum() * 100).round(2)

# Summary table: one row per category with its count and percentage.
cat_table = pd.DataFrame({
    'Category': cat_sum.index,
    'Count': cat_counts,
    'Percentage': cat_percentage,
})

In [None]:
# Display the per-category count and percentage table.
cat_table

# Plot

In [None]:
# Bar chart of message counts per category; every bar is annotated with its
# count and its percentage share.
fig, ax = plt.subplots()
bars = ax.bar(cat_sum.index, cat_sum.values)
ax.set_ylabel('Messages frequency')
ax.set_title('Messages')

# The bars are drawn in the order of cat_sum, and cat_percentage was computed
# from cat_sum in that same order, so the three sequences line up one-to-one.
for bar, count, pct in zip(bars, cat_sum.values, cat_percentage):
    ax.annotate(
        f'n= {count}, {pct}%',  # the text to display
        xy=(bar.get_x() + bar.get_width() / 2, bar.get_height()),  # top centre of the bar
        xytext=(0, 3),  # shift the label 3 points upwards
        textcoords="offset points",
        ha='center', va='bottom',
    )
# plt.xticks(rotation=90)  # enable if the category labels overlap
plt.show()

# Likes in each message category

In [None]:
# Total likes per category, largest first.
likes_in_cat = df.groupby('cat')['likesCount'].sum().sort_values(ascending=False)

# Each category's share of all likes, in percent, rounded to 2 decimals.
like_totals = likes_in_cat.values
pct_likes = (like_totals / like_totals.sum() * 100).round(2)

# Summary table: one row per category with its like total and percentage.
like_table = pd.DataFrame({
    'Category': likes_in_cat.index,
    'LikeCounts': like_totals,
    'Percentage': pct_likes,
})

In [None]:
like_table

In [None]:
# Likes per category broken down by year: after unstack() the rows are years
# and the columns are categories. No fill value is given, so a (year, category)
# pair that never occurs is left as NaN.
# NOTE: this rebinds `likes_in_cat`, which an earlier cell used for the
# per-category totals; from here on it holds the year x category table.
likes_by_year_and_cat = df.groupby(['year', 'cat'])['likesCount'].sum()
likes_in_cat = likes_by_year_and_cat.unstack()

In [None]:
# Stacked bar chart of likes: one bar per row of `likes_in_cat`, one coloured
# segment per column.
# NOTE(review): a 50-inch-wide figure is unusually large -- confirm it is intended.
ax = likes_in_cat.plot(kind='bar', stacked=True, figsize=(50, 6))
ax.set_xlabel('Year')
ax.set_ylabel('Likes Count')
ax.set_title('Likes Count by messages')
# Anchor the legend's upper-left corner at the axes' upper-right corner, i.e.
# just outside the plotting area.
ax.legend(title='Category', loc='upper left', bbox_to_anchor=(1, 1))
plt.xticks(rotation=0)  # keep the year labels horizontal

# Optional: label every segment with its value.
# for container in ax.containers:
#     ax.bar_label(container, fmt='%d', label_type='edge', fontsize=8)

# A single layout pass. The original called tight_layout(rect=[0, 0, 0.85, 1])
# and then tight_layout() straight after, so the second call replaced the first
# and the rect never took effect. If the legend gets clipped, pass the rect
# here instead.
plt.tight_layout()
plt.show()

In [None]:
# Histogram of the `year` column, i.e. how the messages are distributed over
# the years. The title names the variable that is actually plotted (it
# previously said "categories").
hist_ax = sns.histplot(df['year'], kde=False)  # kde=False: bars only, no density curve
hist_ax.set_title('Histogram of messages per year')
plt.show()

In [None]:
# Grouped bar chart of likes per year, with `hue` adding a second level of
# grouping by category. Note that barplot aggregates the rows of each group;
# with seaborn's default estimator each bar shows the mean `likesCount`.
# The DataFrame is passed as `data=` so the call does not depend on the
# position of the `data` parameter, which differs between seaborn releases.
sns.barplot(data=df, x='year', y='likesCount', hue='cat')
plt.show()

# Time graph - messages across time 

In [None]:
# Messages per year (rows whose 'cat' is not null), busiest year first.
messages_per_year = df.groupby('year')['cat'].count()
year_in_cat = messages_per_year.sort_values(ascending=False)

# Print the years and their counts as two parallel sequences.
for part in (year_in_cat.index, year_in_cat.values):
    print(part)

In [None]:
# Count the messages for every (year, category) pair, then pivot the categories
# into columns; pairs that never occur are filled with 0.
year_cat_counts = df.groupby(['year', 'cat']).size()
messages_df = year_cat_counts.unstack(fill_value=0)
messages_df

In [None]:
# Rebuild the year x category count table, then express each row as a
# percentage of that year's total number of messages.
year_cat_sizes = df.groupby(['year', 'cat']).size()
messages_df = year_cat_sizes.unstack(fill_value=0)
total_by_year = messages_df.sum(axis=1)  # messages per year, over all categories
percentage_df = messages_df.div(total_by_year, axis=0) * 100

In [None]:
# Line chart of each category's share of the yearly messages: one line per
# category, years on the x-axis.
plt.figure(figsize=(12, 6))

# The columns of percentage_df are the categories and `year` is its index, so
# every column is plotted; the old `!= 'year'` guard could never exclude one.
for category in percentage_df.columns:
    plt.plot(
        percentage_df.index,
        percentage_df[category],
        marker='o',
        linestyle='-',
        label=category,
    )

plt.xlabel('Year')
plt.ylabel('Percentages of posts(%)')
# NOTE(review): the year range in the title is hard-coded; update it if the
# data covers a different period.
plt.title('Messages across the year 2017-2023')
plt.grid(True)
plt.legend()
plt.show()