In [10]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from scipy.interpolate import make_interp_spline


# Read the two input workbooks: the tweet table and the location table.
all_en_data = pd.read_excel('All_EN.xlsx')
loc_en_data = pd.read_excel('Loc_En.xlsx')

# Parse 'Created_at' into datetime values so it can be range-filtered and resampled.
all_en_data['Created_at'] = pd.to_datetime(all_en_data['Created_at'])

# Keep only rows whose timestamp lies in [start_date, end_date], both ends inclusive.
# NOTE(review): the bounds are date-only strings, so end_date is compared as
# midnight at the start of Dec 27; rows later on that day are excluded —
# confirm this is the intended window.
start_date = '2022-12-23'
end_date = '2022-12-27'
in_range = all_en_data['Created_at'].between(start_date, end_date)
filtered_data = all_en_data[in_range]

# Cross-tabulate location (rows) against stance class (columns).
# NOTE(review): the two Series come from different frames and are paired by
# index label; presumably Loc_En.xlsx shares row labels with All_EN.xlsx —
# verify against the input files.
location_stance_counts = pd.crosstab(
    index=loc_en_data['location'],
    columns=filtered_data['Class'],
)

# Restrict the table to the eight locations with the largest row totals.
row_totals = location_stance_counts.sum(axis=1)
top_countries = row_totals.nlargest(8).index
filtered_location_stance_counts = location_stance_counts.loc[top_countries]

# Express every row as percentages of that row's total.
top_row_totals = filtered_location_stance_counts.sum(axis=1)
normalized_location_stance_counts = filtered_location_stance_counts.div(top_row_totals, axis=0) * 100

# Visualizations

# Line graph: number of tweets per calendar day in the filtered range,
# drawn as a smoothed curve with the real daily counts marked on top.
plt.figure(figsize=(10, 6))
tweet_counts_by_date = filtered_data.set_index('Created_at').resample('D').size()
day_labels = tweet_counts_by_date.index.strftime('%b %d')
day_positions = np.arange(len(tweet_counts_by_date))

# Smooth the daily counts with an interpolating spline evaluated on a dense grid.
# A degree-k spline needs at least k + 1 points, so the default cubic is
# lowered when fewer than four days are present.
spline_degree = min(3, len(tweet_counts_by_date) - 1)
x_smooth = np.linspace(0, len(tweet_counts_by_date) - 1, 300)  # dense x-axis in day-position units
spline = make_interp_spline(day_positions, tweet_counts_by_date.to_numpy(), k=spline_degree)
y_smooth = spline(x_smooth)

plt.plot(x_smooth, y_smooth, label='Smoothed Curve', color='blue')
plt.xlabel('Date')
plt.ylabel('Frequency of Tweets')
plt.xticks(day_positions, day_labels, rotation=45)

# Annotate every day at its own (position, count) point. Pairing the labels
# with the 300 dense curve samples instead would stack them all near x = 0.
for position, (label, count) in enumerate(zip(day_labels, tweet_counts_by_date)):
    plt.text(position, count, f'{label}\nTweets: {count}', ha='center', va='bottom', fontsize=8, color='black', alpha=0.7)

# Highlight specific dates with a red marker and a legend entry showing the count.
specific_dates = ['Dec 23', 'Dec 24', 'Dec 25', 'Dec 26']
for date in specific_dates:
    matches = np.flatnonzero(day_labels == date)
    if matches.size == 0:
        # The date is not present in the resampled range; nothing to mark.
        continue
    x_coord = matches[0]
    # .iloc makes the positional lookup explicit (plain [] on a
    # datetime-indexed Series is label-based).
    y_coord = tweet_counts_by_date.iloc[x_coord]
    plt.scatter(x_coord, y_coord, c='red', marker='o', s=50, label=f'{date} Tweets: {y_coord}')

plt.legend()
plt.tight_layout()
plt.savefig('Line_graph.png')
plt.close()


# Pie chart: share of each stance class among the filtered tweets.
stance_fig, stance_ax = plt.subplots(figsize=(8, 8))
stance_counts = filtered_data['Class'].value_counts()
stance_counts.plot.pie(ax=stance_ax, autopct='%1.1f%%')
stance_fig.tight_layout()
stance_fig.savefig('stance.png')
plt.close(stance_fig)

# Pie chart: relative share of the eight most frequent locations.
location_fig, location_ax = plt.subplots(figsize=(8, 8))
location_frequencies = loc_en_data['location'].value_counts()
top_locations = location_frequencies.nlargest(8)
# Percentages are taken relative to the top-eight total, not to all rows.
location_percentages = (top_locations / top_locations.sum()) * 100
location_percentages.plot.pie(ax=location_ax, autopct='%1.1f%%')
location_ax.set_title('Location Distribution')
location_fig.tight_layout()
location_fig.savefig('location_distribution.png')
plt.close(location_fig)

# Stacked bar chart: stance percentages for each of the top locations.
# DataFrame.plot opens a figure of its own unless it is handed an Axes, so the
# axes are created explicitly and passed in. Calling plt.figure() first would
# leave an empty 12x6 figure behind and draw the chart at the default size.
fig, ax = plt.subplots(figsize=(12, 6))
normalized_location_stance_counts.plot(
    kind='bar',
    stacked=True,
    # NOTE(review): the colour list repeats 'red', so two stance columns are
    # visually indistinguishable — confirm the intended palette.
    color=['green', 'red', 'red'],
    rot=45,
    ax=ax,
)
ax.set_title('Stance Distribution by Location')
ax.set_xlabel('Location')
ax.set_ylabel('Percentage')

# Label every segment with its own percentage. label_type='center' shows the
# segment length; 'edge' would show the cumulative end-point of the stack.
for container in ax.containers:
    ax.bar_label(container, label_type='center', fmt='%.1f%%', fontsize=10)

ax.legend(title='Stance', loc='upper right', bbox_to_anchor=(1.2, 1))
fig.tight_layout()
fig.savefig('stance_percentage_by_location.png')
plt.close(fig)


<Figure size 1200x600 with 0 Axes>