# Data Pre-processing and Graph/Chart Generation

## Importing libraries and the Tweet dataset

In [None]:
import pandas as pd
import plotly.express as px

# Read the Tweet dataset from "dataset.xlsx" (path is relative to the working
# directory) and keep a copy of the resulting DataFrame as `df`.
df = pd.read_excel("dataset.xlsx").copy()

### Data pre-processing for the Tweet dataset

In [None]:
# Optional sanity check listing which columns contain nulls:
# print(df.isna().any()) # Should only be True for 'Account bio' and 'Location'

# Replace missing values in the free-text columns with a readable placeholder.
for column, placeholder in {
    "Account bio": "No bio",
    "Location": "No location",
}.items():
    df[column] = df[column].fillna(placeholder)

# Two-digit year whose January maps to month index 1
# (January 2016 -> 1, ..., December 2022 -> 84).
BASE_YEAR = 16


def _joined_to_mm_yy(value):
    """Normalise one 'Joined' cell to a zero-padded "mm/yy" string.

    String cells are split on "/": the first part is the month (left-padded
    to two characters) and the last two characters of the cell are the year.
    Any other cell is rendered with str() and read as "YYYY-MM-DD[ ...]";
    its month and day fields are emitted as "MM/DD".

    NOTE(review): for non-string cells the *day* field is used as the
    two-digit year -- presumably the spreadsheet auto-parsed "mm/yy" text as
    a month/day date. Confirm against the workbook.
    """
    if isinstance(value, str):
        month = value.split("/")[0]
        return f"{month.rjust(2, '0')}/{value[-2:]}"
    fields = str(value).split()[0].split("-")
    return f"{fields[1]}/{fields[2]}"


def _posted_to_month_index(value):
    """Convert one 'Date posted' cell to a month index counted from BASE_YEAR.

    String cells are read as "<x>/<month>/<year>[ time]", where the year may
    have two or four digits. Any other cell is rendered with str() and read
    as "YYYY-MM-DD[ ...]".

    NOTE(review): for non-string cells the *day* field is used as the month
    -- presumably the spreadsheet parsed day-first dates as month-first, so
    the two fields are swapped. Confirm against the workbook.
    """
    if isinstance(value, str):
        fields = value.split()[0].split("/")
        month, year = fields[1], fields[2]
        two_digit_year = year[2:] if len(year) > 2 else year
    else:
        fields = str(value).split()[0].split("-")
        month, two_digit_year = fields[2], fields[0][2:]
    return int(month) + 12 * (int(two_digit_year) - BASE_YEAR)


# Joined
# Build the whole column and assign it once. Element-wise chained assignment
# (df["Joined"][i] = ...) triggers SettingWithCopyWarning and, with pandas
# Copy-on-Write enabled, does not modify `df` at all.
df["Joined"] = [_joined_to_mm_yy(value) for value in df["Joined"]]

# Interactions
df["Interactions"] = df["Likes"] + df["Replies"] + df["Retweets"]

# Date posted
# Kept as object dtype, which is what the previous element-wise writes into a
# mixed string/timestamp column produced, so downstream numeric-column
# selection (e.g. the correlation heatmap) sees the same set of columns.
df["Date posted"] = pd.Series(
    [_posted_to_month_index(value) for value in df["Date posted"]],
    index=df.index,
    dtype=object,
)

### Creating a new dataframe based on the Tweet dataset binned by months

In [None]:
# Month indices 1..84, one per calendar month covered by the dataset.
mm_yy = list(range(1, 85))

# One row per month index: number of tweets posted in that month and the sum
# of their interactions. Months with no tweets are filled with 0.
date_df = pd.DataFrame({"Month": mm_yy}).assign(
    Count=lambda frame: frame["Month"]
    .map(df["Date posted"].value_counts())
    .fillna(0),
    Interactions=lambda frame: frame["Month"]
    .map(df.groupby("Date posted")["Interactions"].sum())
    .fillna(0),
)

### Generating a heatmap for the whole Tweet dataset and another heatmap focusing on Followers and Interactions

In [None]:
# Correlation heatmap over the numeric (and boolean) columns only. The frame
# also holds text columns such as "Account bio" and "Location", and from
# pandas 2.0 onwards DataFrame.corr() raises on non-numeric data instead of
# silently dropping it. Selecting the columns explicitly works on every version.
fig = px.imshow(
    df.select_dtypes(include=["number", "bool"]).corr(),
    text_auto=True,
    title="df dataset numerical variable correlation",
)
fig.show()
# fig.write_html("dfcorr.html")

# Focused heatmap: account followers versus tweet interactions.
fig = px.imshow(
    df[["Followers", "Interactions"]].corr(),
    text_auto=True,
    title="Tweet dataset correlation between Account Followers and Tweet Interactions",
)
fig.show()
# fig.write_html("dfcorr_followers_interactions.html")

### Generating a histogram for Account type, various violin plots for numerical variables, and a bar graph detailing total interactions per tweet

In [None]:
# Distribution of account types.
fig = px.histogram(df, x="Account type", title="'Account type' Histogram")
fig.show()
# fig.write_html("hist_acctype.html")

# One violin plot per numeric account/tweet metric, in a fixed display order.
for column in ("Followers", "Following", "Likes", "Retweets", "Replies"):
    fig = px.violin(df, y=column, title=f"'{column}' Violin Plot")
    fig.show()
    # fig.write_html(f"violin_{column.lower()}.html")

# Stacked bar per tweet showing how its interactions split into the three kinds.
fig = px.bar(
    df,
    y=["Likes", "Retweets", "Replies"],
    title="'Interactions' Bar Graph detailing 'Likes', 'Retweets', and 'Replies'",
)
fig.show()
# fig.write_html("bar_interactions.html")

### Generating a heatmap for the binned dataframe, a line graph showing tweet frequency per month, and a scatter plot for tweet interactions per month

In [None]:

# Correlation between the month index, the tweet count and the summed interactions.
fig = px.imshow(
    date_df.corr(),
    text_auto=True,
    title="Binned Monthly Tweet dataset numerical variable correlation",
)
fig.show()
# fig.write_html("datedfcorr.html")

# Tweet frequency per month index.
fig = px.line(
    date_df,
    x="Month",
    y="Count",
    title="Line Graph showing frequency of tweets from January 2016 to December 2022",
)
fig.show()
# fig.write_html("tweetfrequency.html")

# Total interactions per month index, coloured by how many tweets fell in that month.
# The title previously read "Scatter plt"; the typo is corrected here.
fig = px.scatter(
    date_df,
    x="Month",
    y="Interactions",
    color="Count",
    title="Scatter plot showing total tweet interactions per month",
)
fig.show()
# fig.write_html("scatterinteraction.html")