# Final analysis: score tweets with the saved sentiment model and generate insights

In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow_text as text

In [None]:
# Shared settings for the tf.data input pipelines built in this notebook.
AUTOTUNE = tf.data.AUTOTUNE  # passed as the prefetch buffer_size
batch_size = 32  # rows per batch handed to my_model.predict

In [None]:
# Load the saved Keras model from disk; every prediction in this notebook uses it.
# NOTE(review): tensorflow_text is imported but never referenced by name --
# presumably it registers ops the saved model needs at load time; confirm.
my_model = tf.keras.models.load_model("saved_models/twitter_bert")

In [None]:
def clean_text(data):
    """Strip noise from the ``text`` column of ``data`` in place.

    Non-ASCII characters are dropped first; then URLs (plus one trailing
    whitespace character), @mentions, #hashtags and newline characters are
    removed, in that order.

    Args:
        data: DataFrame with a string ``text`` column.

    Returns:
        The same ``data`` object, with its ``text`` column overwritten.
    """
    # Applied sequentially; each pattern is replaced with the empty string.
    removal_patterns = (
        r"https?:\/\/\S*\s?",
        r"@\w+",
        r"#\w+",
        r"\n",
    )
    cleaned = data.text.str.encode("ascii", "ignore").str.decode("ascii")
    for pattern in removal_patterns:
        cleaned = cleaned.str.replace(pattern, "", regex = True)
    data.text = cleaned
    return data

In [None]:
def predict_sentiment(data):
    """Run the loaded model over the ``text`` column of ``data``.

    The texts are wrapped in a ``tf.data.Dataset``, batched with the
    notebook-level ``batch_size`` and prefetched with ``AUTOTUNE`` before
    being passed to ``my_model.predict``.

    Args:
        data: DataFrame with a ``text`` column.

    Returns:
        Whatever ``my_model.predict`` returns for the batched texts
        (presumably one score per row -- TODO confirm against the model).
    """
    text_batches = (
        tf.data.Dataset.from_tensor_slices(data.text)
        .batch(batch_size)
        .prefetch(buffer_size = AUTOTUNE)
    )
    return my_model.predict(text_batches)

In [None]:
def group_10_min_sentiment(data, y_prob, start = None):
    """Label each row positive/negative and assign it to a 10-minute bucket.

    Args:
        data: DataFrame with a ``time`` column parseable by ``pd.to_datetime``.
        y_prob: Model scores, one per row of ``data``; a score above 0.5 is
            treated as positive. The array is flattened, so both ``(n,)`` and
            ``(n, 1)`` shapes are accepted.
        start: Reference instant for bucket 0. When omitted, the
            notebook-level ``start_time`` is used, so existing two-argument
            calls keep working.

    Returns:
        A new DataFrame indexed like ``data`` with columns ``min10`` (float
        bucket number; negative for rows before ``start``) and ``sentiment``
        (bool). ``data`` itself is not modified.
    """
    if start is None:
        # Looked up at call time, so the defining cell only has to run
        # before the first call, not before this function is defined.
        start = start_time
    elapsed_seconds = (pd.to_datetime(data.time) - start).dt.total_seconds()
    return pd.DataFrame(
        {
            "min10": np.floor(elapsed_seconds.to_numpy() / 600),
            "sentiment": np.ravel(y_prob) > 0.5,
        },
        index = data.index,
    )

In [None]:
def group_8_hour_sentiment(data, y_prob, start = None):
    """Label each row positive/negative and assign it to an 8-hour bucket.

    Args:
        data: DataFrame with a ``time`` column parseable by ``pd.to_datetime``.
        y_prob: Model scores, one per row of ``data``; a score above 0.5 is
            treated as positive. The array is flattened, so both ``(n,)`` and
            ``(n, 1)`` shapes are accepted.
        start: Reference instant for bucket 0. When omitted, the
            notebook-level ``start_day`` is used, so existing two-argument
            calls keep working.

    Returns:
        A new DataFrame indexed like ``data`` with columns ``hour8`` (float
        bucket number; negative for rows before ``start``) and ``sentiment``
        (bool). ``data`` itself is not modified.
    """
    if start is None:
        # Looked up at call time; the caller must make sure the global holds
        # the reference instant intended for this particular dataset.
        start = start_day
    elapsed_seconds = (pd.to_datetime(data.time) - start).dt.total_seconds()
    return pd.DataFrame(
        {
            "hour8": np.floor(elapsed_seconds.to_numpy() / 28800),
            "sentiment": np.ravel(y_prob) > 0.5,
        },
        index = data.index,
    )

In [None]:
def group_12_hour_sentiment(data, y_prob, start_day):
    """Label each row positive/negative and assign it to a 12-hour bucket.

    Adds two columns to ``data`` in place: ``sentiment`` (``y_prob > 0.5``)
    and ``hour12`` (floor of the elapsed time since ``start_day`` measured in
    12-hour units; negative for rows before ``start_day``).

    Args:
        data: DataFrame with ``time`` and ``country`` columns.
        y_prob: Model scores, one per row of ``data``.
        start_day: Reference instant for bucket 0.

    Returns:
        The ``hour12``, ``sentiment`` and ``country`` columns of ``data``.
    """
    seconds_per_bucket = 12 * 60 * 60
    data["sentiment"] = y_prob > 0.5
    elapsed = pd.to_datetime(data.time) - start_day
    data["hour12"] = np.floor(elapsed.dt.total_seconds() / seconds_per_bucket)
    wanted_columns = ["hour12", "sentiment", "country"]
    return data[wanted_columns]

In [None]:
def split_pos_neg(data, time = "min10"):
    """Count positive and negative rows per time bucket.

    Args:
        data: DataFrame with a boolean ``sentiment`` column and the bucket
            column named by ``time``.
        time: Name of the bucket column to group by.

    Returns:
        A ``(positive_counts, negative_counts)`` pair of Series indexed by
        bucket. A bucket with no rows of a given sentiment is absent from
        that Series rather than reported as zero.
    """
    is_positive = data.sentiment
    positive_counts = data[is_positive].groupby(time).size()
    negative_counts = data[~is_positive].groupby(time).size()
    return positive_counts, negative_counts

In [None]:
# Streamed tweets: load (first CSV column becomes the index), clean, score,
# then split into positive/negative files.
data_stream = pd.read_csv("dataset/data_stream.csv", index_col = 0)

In [None]:
data_stream

In [None]:
data_stream = clean_text(data_stream)

In [None]:
y_prob = predict_sentiment(data_stream)

In [None]:
# Fraction of streamed tweets scored above 0.5 (treated as positive).
np.mean(y_prob > 0.5)

In [None]:
data_stream["sentiment"] = (y_prob > 0.5)

In [None]:
# Write the positive and the negative rows to separate CSV files.
data_stream[data_stream.sentiment].to_csv("dataset/data_stream_pos.csv")
data_stream[~data_stream.sentiment].to_csv("dataset/data_stream_neg.csv")

In [None]:
y_prob

In [None]:
# Streamed tweets for the two teams later plotted as "Argentina" (agt) and
# "France" (frc).
data_agt = pd.read_csv("dataset/data_stream_agt.csv", index_col = 0)
data_frc = pd.read_csv("dataset/data_stream_frc.csv", index_col = 0)

In [None]:
data_agt.head(10)

In [None]:
data_agt = clean_text(data_agt)
data_frc = clean_text(data_frc)

In [None]:
# Same batching/prefetch pipeline that predict_sentiment builds, written out
# inline for each frame.
data_text_agt = tf.data.Dataset.from_tensor_slices(data_agt.text)
data_text_agt = data_text_agt.batch(batch_size).prefetch(buffer_size = AUTOTUNE)
data_text_frc = tf.data.Dataset.from_tensor_slices(data_frc.text)
data_text_frc = data_text_frc.batch(batch_size).prefetch(buffer_size = AUTOTUNE)

In [None]:
y_prob_agt = my_model.predict(data_text_agt)
y_prob_frc = my_model.predict(data_text_frc)

In [None]:
# Count negative/positive predictions per team. bincount with minlength = 2
# always yields [n_negative, n_positive], even when one of the classes never
# occurs; np.unique(..., return_counts = True) would return a single count in
# that case, which breaks the count_*[0] / count_*[1] lookups used for plotting.
count_agt = np.bincount(np.ravel(y_prob_agt > 0.5).astype(int), minlength = 2)
count_frc = np.bincount(np.ravel(y_prob_frc > 0.5).astype(int), minlength = 2)

In [None]:
count_agt, count_frc

In [None]:
# Diverging bar chart: positive counts are drawn upwards, negative counts
# downwards (negated heights). Index 1 is read as the positive count and
# index 0 as the negative count.
plt.bar(x = ["Argentina", "France"], height = [count_agt[1], count_frc[1]], label = "Positive")
plt.bar(x = ["Argentina", "France"], height = [-count_agt[0], -count_frc[0]], label = "Negative")
plt.legend()

In [None]:
# Three-way label: 1 = positive (score > 0.8), 0 = neutral (0.2 < score <= 0.8),
# -1 = negative (everything else).
y_pred_agt = np.select([y_prob_agt > 0.8, y_prob_agt > 0.2], [1, 0], default = -1)
# Shift labels to 0..2 and count with a fixed length of 3 so the result is
# always [negative, neutral, positive] shares, even if a label never occurs
# (np.unique would drop the missing label and shift the indices).
pctg_agt = np.bincount(np.ravel(y_pred_agt) + 1, minlength = 3) / max(y_pred_agt.size, 1)

In [None]:
# Three-way label: 1 = positive (score > 0.8), 0 = neutral (0.2 < score <= 0.8),
# -1 = negative (everything else).
y_pred_frc = np.select([y_prob_frc > 0.8, y_prob_frc > 0.2], [1, 0], default = -1)
# Shift labels to 0..2 and count with a fixed length of 3 so the result is
# always [negative, neutral, positive] shares, even if a label never occurs
# (np.unique would drop the missing label and shift the indices).
pctg_frc = np.bincount(np.ravel(y_pred_frc) + 1, minlength = 3) / max(y_pred_frc.size, 1)

In [None]:
pctg_agt, pctg_frc

In [None]:
# Horizontal stacked bars per team: positive share first, then neutral, then
# negative; each segment starts (left = ...) where the previous ones end.
# Indices 0/1/2 are read as negative/neutral/positive shares.
# NOTE(review): color_map is assigned but not used in the visible cells.
color_map = plt.cm.RdBu
plt.barh(y = ["Argentina", "France"], width = [pctg_agt[2], pctg_frc[2]], color = "#67a9cf", label = "Positive")
plt.barh(y = ["Argentina", "France"], width = [pctg_agt[1], pctg_frc[1]], left = [pctg_agt[2], pctg_frc[2]], color = "#e0e0e0", label = "Neutral")
plt.barh(y = ["Argentina", "France"], width = [pctg_agt[0], pctg_frc[0]], left = [pctg_agt[1] + pctg_agt[2], pctg_frc[1] + pctg_frc[2]], color = "#ef8a62", label = "Negative")
plt.legend()

In [None]:
# Store the raw model scores (not thresholded booleans) in "sentiment" and
# save both frames. Elsewhere in this notebook "sentiment" is a boolean column.
data_agt["sentiment"] = y_prob_agt
data_frc["sentiment"] = y_prob_frc

In [None]:
data_agt.to_csv("dataset/data_stream_agt_sent.csv")
data_frc.to_csv("dataset/data_stream_frc_sent.csv")

In [None]:
# Static Argentina dataset. From here on data_agt / y_prob_agt are rebound and
# no longer refer to the streamed data used above.
data_agt = pd.read_csv("dataset/data_static_agt.csv", index_col = 0)

In [None]:
data_agt

In [None]:
data_agt = clean_text(data_agt)

In [None]:
y_prob_agt = predict_sentiment(data_agt)

In [None]:
# Reference instant for the 10-minute buckets; group_10_min_sentiment reads
# this global. The later plot labels its axis "Time from game starts", so this
# is presumably the kickoff time -- confirm.
start_time = dt.datetime.strptime("2022-12-09 19:00:00 +0000", "%Y-%m-%d %H:%M:%S %z")

In [None]:
data_agt = group_10_min_sentiment(data_agt, y_prob_agt)

In [None]:
data_agt_pos, data_agt_neg = split_pos_neg(data_agt)

In [None]:
# Static dataset for the team later labelled "Netherlands" (ntl): same
# load -> clean -> score -> 10-minute bucket -> split pipeline.
data_ntl = pd.read_csv("dataset/data_static_ntl.csv", index_col = 0)

In [None]:
data_ntl = clean_text(data_ntl)

In [None]:
y_prob_ntl = predict_sentiment(data_ntl)

In [None]:
data_ntl = group_10_min_sentiment(data_ntl, y_prob_ntl)
data_ntl_pos, data_ntl_neg = split_pos_neg(data_ntl)

In [None]:
# Player-level pipeline (Martinez): load, clean, score, bucket into
# 10-minute windows relative to start_time, then count positive/negative.
data_agt_martinez = pd.read_csv("dataset/data_static_agt_martinez.csv", index_col = 0)
data_agt_martinez = clean_text(data_agt_martinez)
y_prob_agt_martinez = predict_sentiment(data_agt_martinez)
data_agt_martinez = group_10_min_sentiment(data_agt_martinez, y_prob_agt_martinez)
data_agt_martinez_pos, data_agt_martinez_neg = split_pos_neg(data_agt_martinez)

In [None]:
# Player-level pipeline (Messi): load, clean, score, bucket into
# 10-minute windows relative to start_time, then count positive/negative.
data_agt_messi = pd.read_csv("dataset/data_static_agt_messi.csv", index_col = 0)
data_agt_messi = clean_text(data_agt_messi)
y_prob_agt_messi = predict_sentiment(data_agt_messi)
data_agt_messi = group_10_min_sentiment(data_agt_messi, y_prob_agt_messi)
data_agt_messi_pos, data_agt_messi_neg = split_pos_neg(data_agt_messi)

In [None]:
# Player-level pipeline (Paredes): load, clean, score, bucket into
# 10-minute windows relative to start_time, then count positive/negative.
data_agt_paredes = pd.read_csv("dataset/data_static_agt_paredes.csv", index_col = 0)
data_agt_paredes = clean_text(data_agt_paredes)
y_prob_agt_paredes = predict_sentiment(data_agt_paredes)
data_agt_paredes = group_10_min_sentiment(data_agt_paredes, y_prob_agt_paredes)
data_agt_paredes_pos, data_agt_paredes_neg = split_pos_neg(data_agt_paredes)

In [None]:
# Player-level pipeline (van Dijk): load, clean, score, bucket into
# 10-minute windows relative to start_time, then count positive/negative.
data_ntl_dijk = pd.read_csv("dataset/data_static_ntl_dijk.csv", index_col = 0)
data_ntl_dijk = clean_text(data_ntl_dijk)
y_prob_ntl_dijk = predict_sentiment(data_ntl_dijk)
data_ntl_dijk = group_10_min_sentiment(data_ntl_dijk, y_prob_ntl_dijk)
data_ntl_dijk_pos, data_ntl_dijk_neg = split_pos_neg(data_ntl_dijk)

In [None]:
# Player-level pipeline (Dumfries): load, clean, score, bucket into
# 10-minute windows relative to start_time, then count positive/negative.
data_ntl_dumfries = pd.read_csv("dataset/data_static_ntl_dumfries.csv", index_col = 0)
data_ntl_dumfries = clean_text(data_ntl_dumfries)
y_prob_ntl_dumfries = predict_sentiment(data_ntl_dumfries)
data_ntl_dumfries = group_10_min_sentiment(data_ntl_dumfries, y_prob_ntl_dumfries)
data_ntl_dumfries_pos, data_ntl_dumfries_neg = split_pos_neg(data_ntl_dumfries)

In [None]:
# Display two of the bucketed player frames for a quick visual check.
data_agt_paredes

In [None]:
data_ntl_dijk

In [None]:
# Timeline of tweet counts per 10-minute bucket: positive counts are drawn
# above zero, negative counts below (negated). Each negative line reuses the
# color of the line drawn just before it, so one series has one color.
# The commented-out pairs are alternative series that can be toggled on.
# plt.plot(data_agt_pos.index, data_agt_pos, label = "Argentina")
# plt.plot(data_agt_neg.index, -data_agt_neg, c = plt.gca().lines[-1].get_color())
# plt.plot(data_ntl_pos.index, data_ntl_pos, label = "Netherlands")
# plt.plot(data_ntl_neg.index, -data_ntl_neg, c = plt.gca().lines[-1].get_color())
# plt.plot(data_agt_martinez_pos.index, data_agt_martinez_pos, label = "Martinez (AGT)")
# plt.plot(data_agt_martinez_neg.index, -data_agt_martinez_neg, c = plt.gca().lines[-1].get_color())
plt.plot(data_agt_messi_pos.index, data_agt_messi_pos, label = "Messi (AGT)")
plt.plot(data_agt_messi_neg.index, -data_agt_messi_neg, c = plt.gca().lines[-1].get_color())
plt.plot(data_agt_paredes_pos.index, data_agt_paredes_pos, label = "Paredes (AGT)")
plt.plot(data_agt_paredes_neg.index, -data_agt_paredes_neg, c = plt.gca().lines[-1].get_color())
# plt.plot(data_ntl_dijk_pos.index, data_ntl_dijk_pos, label = "van Dijk (NTL)")
# plt.plot(data_ntl_dijk_neg.index, -data_ntl_dijk_neg, c = plt.gca().lines[-1].get_color())
plt.plot(data_ntl_dumfries_pos.index, data_ntl_dumfries_pos, label = "Dumfries (NTL)")
plt.plot(data_ntl_dumfries_neg.index, -data_ntl_dumfries_neg, c = plt.gca().lines[-1].get_color())

plt.axhline(y = 0, c = "Black")
# Show buckets 0..18 (the first 180 minutes); one tick every 3 buckets,
# relabelled in minutes (0, 30, ..., 150).
plt.xlim(0, 18)
plt.xticks(np.arange(0, 18, 3), np.arange(0, 180, 30))
plt.xlabel("Time from game starts (mins)")
plt.ylabel("Number of Tweets (Positive/Negative)")
plt.legend()

In [None]:
# Combined static dataset: score it, bucket into 12-hour windows per country,
# and export the aggregated counts for visualization.
data_static = pd.read_csv("dataset/data_static.csv", index_col = 0)

In [None]:
data_static = clean_text(data_static)
y_prob = predict_sentiment(data_static)

In [None]:
# Reference instant for the 12-hour buckets (midnight at UTC-05:00).
# This also sets the global that group_8_hour_sentiment reads.
start_day = dt.datetime.strptime("2022-12-15 00:00:00 -0500", "%Y-%m-%d %H:%M:%S %z")

In [None]:
data_static = group_12_hour_sentiment(data_static, y_prob, start_day)

In [None]:
# One row per (bucket, country, sentiment) with its tweet count; the size
# column comes back named 0 after reset_index and is renamed to "count".
data_group = data_static.groupby(["hour12", "country", "sentiment"]).size()
data_group = data_group.reset_index().rename(columns = {0: "count"})
data_group["sentiment"] = np.where(data_group["sentiment"], "positive", "negative")

In [None]:
# Drop buckets that fall before start_day (negative bucket numbers).
data_group = data_group[data_group["hour12"] >= 0]

In [None]:
data_group.to_csv("output/viz_data.csv")

In [None]:
# NOTE(review): at this point data_frc is still the streamed France frame from
# the cells above: its "sentiment" column holds raw model scores (not
# booleans) and no visible cell adds an "hour8" column to it. As written this
# call appears to rely on a cell that is no longer in the notebook (e.g. a
# group_8_hour_sentiment step on a France dataset) -- confirm before re-running.
data_frc_pos, data_frc_neg = split_pos_neg(data_frc, "hour8")

In [None]:
# Side-by-side positive/negative counts per bucket for France.
# NOTE(review): `data` is not used by any later visible cell.
data = pd.DataFrame({"time": data_frc_pos.index, "pos": data_frc_pos, "neg": data_frc_neg, "country": "France"}).reset_index(drop = True)

In [None]:
# Static dataset for the team later labelled "Morocco" (mrc):
# load -> clean -> score -> 8-hour bucket -> split.
data_mrc = pd.read_csv("dataset/data_static_mrc.csv", index_col = 0)
data_mrc = clean_text(data_mrc)

In [None]:
y_prob_mrc = predict_sentiment(data_mrc)

In [None]:
# NOTE(review): group_8_hour_sentiment reads the global start_day, which the
# visible cells set to 2022-12-15 00:00 -0500, while the plot that follows
# labels its axis 12/10 .. 12/15. Presumably start_day held an earlier date
# when this was originally run -- confirm.
data_mrc = group_8_hour_sentiment(data_mrc, y_prob_mrc)
data_mrc_pos, data_mrc_neg = split_pos_neg(data_mrc, "hour8")

In [None]:
# Timeline of tweet counts per 8-hour bucket for France and Morocco: positive
# counts above zero, negative counts below (negated), sharing one color per team.
plt.plot(data_frc_pos.index, data_frc_pos, label = "France")
plt.plot(data_frc_neg.index, -data_frc_neg, c = plt.gca().lines[-1].get_color())
plt.plot(data_mrc_pos.index, data_mrc_pos, label = "Morocco")
plt.plot(data_mrc_neg.index, -data_mrc_neg, c = plt.gca().lines[-1].get_color())
plt.axhline(y = 0, c = "Black")
# Show buckets 0..18; one tick every 3 buckets (3 x 8 h = one day), labelled by date.
plt.xlim(0, 18)
plt.xticks(np.arange(0, 18, 3), ["12/10", "12/11", "12/12", "12/13", "12/14", "12/15"])
plt.xlabel("Date")
plt.ylabel("Number of Tweets (Positive/Negative)")
plt.legend()