# Load packages

In [None]:
import re
import string
from collections import Counter

import numpy as np
import pandas as pd
import plotly.express as px
import seaborn as sns
import spacy
import torch
import torch.nn as nn
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from torchtext.data.utils import get_tokenizer
from torchtext.vocab import Vocab

In [None]:
# Show full column contents (no truncation) when DataFrames are displayed
pd.options.display.max_colwidth = None

In [None]:
# def init_pipeline(input_df: pd.DataFrame) -> pd.DataFrame:
#     return input_df.copy()

# def remove_user_mention(input_df: pd.DataFrame) -> pd.DataFrame:
#     user_handle_pattern = re.compile("(@[a-zA-Z0-9_]+)")

#     input_df["tweet_text"] = input_df["tweet_text"].str.replace(
#         pat=user_handle_pattern, repl="", regex=True
#     )

#     return input_df

# def remove_hashtag(input_df: pd.DataFrame) -> pd.DataFrame:
#     hashtag_pattern = re.compile("#(\w+)")

#     input_df["tweet_text"] = input_df["tweet_text"].str.replace(
#         pat=hashtag_pattern, repl="", regex=True
#     )

#     return input_df

In [None]:
def read_data(file_name: str) -> pd.DataFrame:
    """Load a tab-separated tweet file into a DataFrame.

    Reads ``data/semeval-tweets/{file_name}.txt`` as UTF-8 and splits every
    non-blank line into tweet id, sentiment label and tweet text.

    Args:
        file_name: Base name of the data file, without the ``.txt`` suffix.

    Returns:
        A DataFrame with the columns ``tweet_id``, ``tweet_sentiment`` and
        ``tweet_text``; all values are kept as the strings read from the file.

    Raises:
        FileNotFoundError: If the data file does not exist.
    """
    columns = ["tweet_id", "tweet_sentiment", "tweet_text"]
    data_list = []

    with open(f"data/semeval-tweets/{file_name}.txt", encoding="utf8") as f:
        for line in f:
            stripped = line.strip()

            # A blank line would otherwise become a row with an empty id
            if not stripped:
                continue

            # Limit the split so tabs inside the tweet text stay in the text
            # instead of producing extra fields that break the 3-column frame
            data_list.append(stripped.split("\t", len(columns) - 1))

    return pd.DataFrame(data=data_list, columns=columns)

# Read data

In [None]:
# Training split of the tweet sentiment data
training_data = read_data(file_name="twitter-training-data")

In [None]:
# Development (validation) split
development_data = read_data(file_name="twitter-dev-data")

In [None]:
# First test split
testing_1_data = read_data(file_name="twitter-test1")

In [None]:
# Second test split
testing_2_data = read_data(file_name="twitter-test2")

In [None]:
# Third test split
testing_3_data = read_data(file_name="twitter-test3")

In [None]:
# Preview the first five training rows
training_data.head(n=5)

In [None]:
# Absolute number of tweets per sentiment label in every split
splits_for_counts = {
    "Training": training_data,
    "Development": development_data,
    "Testing 1": testing_1_data,
    "Testing 2": testing_2_data,
    "Testing 3": testing_3_data,
}

for split_name, split_frame in splits_for_counts.items():
    label_counts = split_frame["tweet_sentiment"].value_counts().to_dict()
    print(f"{split_name} data: {label_counts}")

In [None]:
# Relative share of each sentiment label in every split
splits_for_proportions = {
    "Training": training_data,
    "Development": development_data,
    "Testing 1": testing_1_data,
    "Testing 2": testing_2_data,
    "Testing 3": testing_3_data,
}

for split_name, split_frame in splits_for_proportions.items():
    label_shares = split_frame["tweet_sentiment"].value_counts(normalize=True)
    print(f"{split_name} data: {label_shares.to_dict()}")

# Exploratory data analysis

In [None]:
# Character length of every tweet, stored in its own column
training_data["tweet_length"] = training_data["tweet_text"].str.len()
# Write to "tweet_length" here as well: assigning the lengths back to
# "tweet_text" would replace the development tweets with integers
development_data["tweet_length"] = development_data["tweet_text"].str.len()

In [None]:
# Grouped histogram of training tweet lengths, one bar series per sentiment
histogram_options = {
    "x": "tweet_length",
    "color": "tweet_sentiment",
    "nbins": 20,
    "barmode": "group",
}
fig = px.histogram(training_data, **histogram_options)

fig.show()

In [None]:
# Box plot of training tweet lengths, split by sentiment
fig = px.box(
    training_data,
    x="tweet_length",
    color="tweet_sentiment",
)

fig.show()

# Load GloVe embeddings

In [None]:
# Map every token in the GloVe file to its embedding vector.
# Each line holds the token followed by its whitespace-separated components.
word_embedding_dict = {}

with open("data/glove.6B/glove.6B.100d.txt", encoding="utf8") as glove_file:
    for glove_line in glove_file:
        fields = glove_line.split()
        # First field is the token, the remaining fields are the vector
        word_embedding_dict[fields[0]] = np.array(fields[1:], dtype=np.float64)

In [None]:
# Number of distinct tokens loaded from the GloVe file
len(word_embedding_dict)

In [None]:
# torchtext's built-in "basic_english" tokenizer, used to split tweets into tokens
tokenizer = get_tokenizer("basic_english")

In [None]:
from collections import Counter

In [None]:
# Token frequencies over all training tweets
counter = Counter()

# Iterate the text column directly: iterrows() builds a Series for every row
# just to read a single field, and the row index was never used
for tweet_text in training_data['tweet_text']:
    counter.update(tokenizer(tweet_text))

In [None]:
# from torchtext.vocab.vectors import GloVe
from torchtext.vocab import GloVe, vocab

In [None]:
# Pretrained 100-dimensional GloVe vectors ("6B" set) loaded through torchtext.
# NOTE(review): the same embeddings are already parsed from
# data/glove.6B/glove.6B.100d.txt into word_embedding_dict; this call presumably
# fetches/caches its own copy — confirm whether both are needed.
glove = GloVe(name='6B', dim=100)

In [None]:
# Build a Vocab from the GloVe token -> index mapping.
# vocab() treats the mapping's values as frequencies and drops every token
# whose value is below min_freq (default 1). glove.stoi holds indices, so the
# token at index 0 would be filtered out and all later tokens shifted by one
# relative to glove.vectors; min_freq=0 keeps every token in its original order.
vv = vocab(glove.stoi, min_freq=0)

In [None]:
# Spot-check: look up a sample token in the vocab
vv['syria']

In [None]:
from torchtext.vocab import Vectors

In [None]:
# Load the local GloVe text file as a torchtext Vectors object.
# NOTE(review): this rebinds `vv`, replacing the object built from
# vocab(glove.stoi) — confirm the earlier value is no longer needed.
vv = Vectors('data/glove.6B/glove.6B.100d.txt')