# Import dependencies

In [None]:
import os
import re
import unicodedata
import spacy
import pandas as pd
from matplotlib import pyplot as plt
from dotenv import load_dotenv

# Load data

### Load datasets

In [114]:
# load the environment variables from the .env file
load_dotenv()
# get the dataset path from the environment variables
dataset_path = os.environ.get("DATASET_PATH")
# fail early with a readable message: a missing variable would otherwise
# only surface as a TypeError inside os.path.join below
if dataset_path is None:
    raise RuntimeError(
        "DATASET_PATH is not set; define it in the environment or in the .env file."
    )
# get the articles path for the known publisher
articles_path = os.path.join(dataset_path, "articles")
# load the dataset into the notebook
# (first column becomes the index, the column at position 1 is parsed as dates)
df = pd.read_csv(
    os.path.join(dataset_path, "article_info_V2.csv"),
    index_col=0,
    parse_dates=[1],
)
# read the external dataset (semicolon separated, same index/date layout)
external_df = pd.read_csv(
    os.path.join(dataset_path, "other-articles.csv"),
    index_col=0,
    parse_dates=[1],
    sep=";",
)
# load English SpaCy model
nlp = spacy.load("en_core_web_trf")

TypeError: Tok2Vec.initialize() missing 1 required positional argument: 'get_examples'

### Article loading function

In [None]:
# load article by title
def load_article(title, load_contents=True):
    """Open the article file that belongs to *title*.

    The file is looked up as ``<articles_path>/<title>.txt`` and decoded as
    UTF-8.

    Args:
        title: Article title; used verbatim as the file name stem.
        load_contents: When true (default) the full text of the file is
            returned. When false the file is only opened and closed again,
            which is enough to check that it can be opened.

    Returns:
        The article text when ``load_contents`` is true; otherwise the
        already-closed file object (its ``name`` attribute holds the path).

    Raises:
        OSError: If the file cannot be opened (for example when it does not
            exist).
    """
    # build the path of the article from its title
    file_path = os.path.join(articles_path, f"{title}.txt")
    # the context manager closes the handle even when reading raises
    with open(file_path, "r", encoding="utf-8") as file:
        if load_contents:
            return file.read()
    # the handle is closed at this point; it is still returned so existing
    # callers that expect an object back keep working
    return file

# Dataset cleaning

### Cleaning functions

In [None]:
def clean_string(str):
    """Return *str* lowercased, stripped of symbols and of surrounding spaces.

    Every character that is neither a word character (letters, digits,
    underscore) nor whitespace is removed; inner whitespace is kept as is.
    """
    # lowercase first, then drop everything that is not \w or \s
    without_symbols = re.sub(r"[^\w\s]", "", str.lower())
    # finally trim leading and trailing whitespace
    return without_symbols.strip()

def parse_list_from_string(str):
    """Turn a comma separated string into a list of cleaned strings.

    The input is split on every comma and each piece is passed through
    ``clean_string``; the pieces keep their original order.
    """
    return [clean_string(piece) for piece in str.split(",")]

### Merge preparation

In [None]:
# get rid of the columns that are not used
df.drop(["Author", "Type", "Keywords"], axis="columns", inplace=True)

# discard every row whose date value is missing
df.dropna(axis="index", subset=["Date"], inplace=True)

### Merge datasets

In [None]:
# append the external dataset (without its url column) to the main dataset
# and renumber the rows of the result from zero
df = pd.concat([df, external_df.drop(columns=["Url"])], ignore_index=True)
# the external dataset is no longer needed
del external_df

### Remove duplicates

In [None]:
# remember how many rows there are before removing duplicates
entries_before_duplicate_removal = df.shape[0]
# rows sharing both title and date are duplicates; the first one is kept
df.drop_duplicates(subset=["Title", "Date"], keep="first", inplace=True)
# row count once the duplicates are gone
entries_after_duplicate_removal = df.shape[0]
# report how many rows were removed
print(f"Removed {entries_before_duplicate_removal - entries_after_duplicate_removal} duplicate entries.")

Removed 3 duplicate entries.


### Parse the string lists

In [None]:
# replace each raw comma separated Tags string with a list of cleaned tags
df["Tags"] = [parse_list_from_string(raw_tags) for raw_tags in df["Tags"]]

# Article cleaning

### Remove articles which cannot be found

In [None]:
# index labels of the rows whose article file could not be opened
missing_indices = []

# iterate over the titles together with their index labels
for index, title in zip(df.index, df["Title"]):
    try:
        # attempt to open the article without reading its contents
        load_article(title, load_contents=False)
    except (OSError, ValueError):
        # OSError: the file is missing, unreadable, or the title is not a
        # usable file name; ValueError: open() rejected the path itself.
        # A bare except would also swallow KeyboardInterrupt and silently
        # delete rows when the cell is interrupted.
        missing_indices.append(index)

# number of articles that could not be loaded
err_count = len(missing_indices)

# remove all unloadable rows in one pass instead of one drop per row
df.drop(index=missing_indices, inplace=True)

# reset the index of the main dataset
df.reset_index(drop=True, inplace=True)

# print the number of errors
print(f"{err_count} files could not be loaded by title!")

42 files could not be loaded by title!


### Add raw article contents to the dataset

In [None]:
# load the raw text of every article and store it in a new "Body" column.
# The Series returned by apply carries the same index as df, so each body is
# attached to the row its title came from. This does not rely on the index
# being 0..n-1 (as a positional re-merge would), and re-running the cell
# simply overwrites the column.
df["Body"] = df["Title"].apply(load_article)

### Define article cleaning methods

In [1]:
# replace line breaks with spaces
def remove_newlines(text):
    """Return *text* with every line feed and carriage return turned into a space."""
    return text.replace("\n", " ").replace("\r", " ")

# replace shortened grammar with full grammar
def replace_grammar(text):
    """Expand common English contractions in *text*.

    The replacements are plain, case-sensitive substring substitutions that
    are applied in the order listed below, so specific forms must come before
    the generic suffix rules. A remaining ``'s`` is removed entirely.

    Args:
        text: The text to expand (expected to be lowercase already).

    Returns:
        The text with the listed contractions expanded.
    """
    replacements = (
        ("it's", "it is"),
        ("he's", "he is"),
        ("she's", "she is"),
        ("what's", "what is"),
        # irregular negations: the generic "n't" rule would produce
        # "ca not" / "wo not" / "sha not" for these
        ("can't", "cannot"),
        ("won't", "will not"),
        ("shan't", "shall not"),
        # keep the space so "don't" becomes "do not" rather than "donot"
        ("n't", " not"),
        ("'scuse", " excuse"),
        ("'s", ""),
        ("'t", " not"),
        ("'re", " are"),
        ("'ll", " will"),
        ("'ve", " have"),
        ("'d", " would"),
        ("'m", " am"),
    )
    for contraction, expansion in replacements:
        text = text.replace(contraction, expansion)
    return text

# strip accents from characters (é, ë, ï become e, e, i)
def remove_accented_characters(text):
    """Return *text* reduced to ASCII.

    The text is decomposed with NFKD normalization so accents become separate
    combining marks; everything that cannot be encoded as ASCII is then
    dropped.
    """
    decomposed = unicodedata.normalize("NFKD", text)
    ascii_only = decomposed.encode("ascii", "ignore")
    return ascii_only.decode("utf-8", "ignore")

# strip punctuation such as commas, periods and exclamation marks
def remove_punctuation(text):
    """Return *text* without any character that is neither a word character nor whitespace."""
    symbol_pattern = re.compile(r"[^\w\s]")
    return symbol_pattern.sub("", text)

# strip all digits from the text
def remove_digits(text):
    """Return *text* with every run of digits removed."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)

# collapse superfluous whitespace
def remove_extra_whitespace(text):
    """Return *text* trimmed, with each run of whitespace reduced to one space."""
    # splitting without an argument discards leading/trailing whitespace and
    # treats every whitespace run as a single separator
    return " ".join(text.split())

# turn every uppercase character into its lowercase form
def lowercase(text):
    """Return a lowercased copy of *text*."""
    lowered = text.lower()
    return lowered

# method to combine all preprocessing steps
def preprocess(text):
    """Clean the raw article *text* and run it through the spaCy pipeline.

    Steps, in order: newlines become spaces, the text is lowercased, curly
    apostrophes are normalized, accents and other non-ASCII characters are
    removed, contractions are expanded and surplus whitespace is collapsed.
    Punctuation, digits and stop words are NOT removed here.

    Args:
        text: Raw article text.

    Returns:
        The result of calling ``nlp`` on the cleaned text.
    """
    # remove newlines
    text = remove_newlines(text)
    # convert uppercase text to lowercase text
    text = lowercase(text)
    # curly apostrophes have no ASCII decomposition, so the accent-removal
    # step below would delete them and "don’t" would turn into "dont" before
    # contractions can be expanded; map them to the straight apostrophe first
    text = text.replace("\u2018", "'").replace("\u2019", "'")
    # replace accented characters (like é, ë, ï) with their unaccented counterparts (e, e, i)
    text = remove_accented_characters(text)
    # replace shortened grammar with full grammar
    text = replace_grammar(text)
    # remove double whitespace characters
    text = remove_extra_whitespace(text)
    # tokenize text
    tokens = nlp(text)
    return tokens

### Apply cleaning method to all articles

In [None]:
# temporary testing
df["Preprocessed_Text"] = ""
# write the single cell through .at: chained indexing (df[col][0] = ...)
# may assign to a temporary copy and leave df unchanged
df.at[0, "Preprocessed_Text"] = preprocess(df.at[0, "Body"])




# apply cleaning method to all articles
# preprocess all articles
# df["Preprocessed_Text"] = df["Body"].apply(preprocess)



ValueError: [E109] Component 'tok2vec' could not be run. Did you forget to call `initialize()`?

In [None]:
# fetch the preprocessed result stored for the first article
tokens = df["Preprocessed_Text"][0]


# NOTE(review): `tok2vec` is not defined in any cell visible here — confirm
# where it is created before running this cell.
# NOTE(review): presumably `n_threads` is no longer accepted by current spaCy
# pipe methods — verify against the installed version.
# print the vector of every document yielded by the pipe
for doc in tok2vec.pipe(tokens, batch_size=1000, n_threads=4):
    print(doc.vector)

ValueError: [E007] 'tok2vec' already exists in pipeline. Existing names: ['transformer', 'tagger', 'parser', 'attribute_ruler', 'lemmatizer', 'ner', 'tok2vec']

# Data preparation

## Define one hot encoding function

In [None]:
# # count how often each unique value occurs in a column whose entries are lists of strings
# def get_unique_value_frequency(df_column):
#     # create a dictionary to store the unique values
#     unique_values = {}
#     # iterate over the column
#     for value_list in df_column:
#         # iterate over the values in the list
#         for value in value_list:
#             if value not in unique_values:
#                 # if the value is not in the dictionary, add it
#                 unique_values[value] = 1
#             else:
#                 # if the value is in the dictionary, increment the value
#                 unique_values[value] += 1
#     # return the dictionary of unique values
#     return unique_values

# # check whether a list contains a given word; returns 1 if it does, otherwise 0
# def list_has_word(l, word):
#     return word in l and 1 or 0

# # one hot encode a dataframe's column that contains lists of strings in each value
# def custom_one_hot_encoding(df, column_name, prefix=None, prefix_sep="_"):
#     # create a dictionary to store the one hot encoded columns
#     one_hot_encoded_columns = {}
#     # get the unique values from the column
#     unique_values = get_unique_value_frequency(df[column_name])
#     # iterate over the unique values
#     for unique_value in unique_values:
#         # create a clean string of the unique value
#         clean_unique_value = unique_value.replace(" ", "_")
#         # create a new column name
#         new_column_name = prefix and f"{prefix}{prefix_sep}{clean_unique_value}" or f"{column_name}{prefix_sep}{clean_unique_value}"
#         # one hot encode the column using the current unique value
#         ohe_list = df[column_name].apply(lambda l: list_has_word(l, unique_value))
#         # add the new list to the dictionary
#         one_hot_encoded_columns[new_column_name] = ohe_list
#     # return a new dataframe with the one hot encoded columns
#     return pd.DataFrame(one_hot_encoded_columns)

## Execute one hot encoding function

In [None]:
# # one hot encode the tags column of the dataframe
# ohe_tags_df = custom_one_hot_encoding(df, "Tags", "tag")
# # merge the one hot encoded tags dataframe with the main dataframe by index
# df = df.join(ohe_tags_df)
# # drop the tags column from the main dataframe
# df.drop(columns=["Tags"], inplace=True)
# # delete the one hot encoded dataframe variable
# del ohe_tags_df