# Problem 2 : Data Engineering Challenge

## Task1 (Data Structuring):

In [11]:
import os
import csv

In [12]:
# Path to the folder containing the article data
folder_path = "BBC_articles"

# List to store article data
articles_data = []

# Iterate over each file in the folder
for filename in os.listdir(folder_path):
    # Extract article ID and category from the filename
    article_id, category = filename.split("_")
    # Read the content of the text file
    with open(os.path.join(folder_path, filename), "r", encoding="utf-8") as file:
        # Read contents of file and replace the newline with a space as the same paragraph may span multiple rows
        text = file.read().replace("\n", " ")
    # Append article data to the list
    articles_data.append({"article_id": article_id, "text": text, "category": category})


In [13]:
# Write the data to bbc_articles.csv
csv_file_path = "bbc_articles.csv"
with open(csv_file_path, "w", newline="", encoding="utf-8") as csvfile:
    # Define the CSV writer
    writer = csv.DictWriter(csvfile, fieldnames=["article_id", "text", "category"])
    # Write the header
    writer.writeheader()
    # Write the article data
    writer.writerows(articles_data)


## Task 2 (Data Preprocessing for Model Training):

In [14]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
import string

# Read the articles csv file into a DataFrame
df = pd.read_csv("bbc_articles.csv")

#### Tokenizing

In [15]:
# Tokenize and preprocess the text data
# Download the punkt tokenizer and a list of stopwords from nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\USER\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [16]:
# Custom tokenizer function
def custom_tokenizer(text):
    # Tokenize the text
    tokens = nltk.word_tokenize(text)
    # Lowercase the tokens
    tokens = [token.lower() for token in tokens]
    # Remove stopwords and punctuation
    stop_words = set(stopwords.words('english'))
    tokens = [token for token in tokens if token not in stop_words and token not in string.punctuation]
    return tokens

In [17]:
# Apply custom tokenizer to text column
df['tokenized_text'] = df['text'].apply(custom_tokenizer)

#### Vectorization using Word2Vec

In [18]:
# Train Word2Vec model for vectorized representation of each word in dataset
from gensim.models import Word2Vec
# For practical purposes, word_embedding_vector_size should take 100+ size
# However for the purpose of completing the task and to save resources,
# The size is restricted to 10
word_embedding_vector_size=10
word2vec_model = Word2Vec(df['tokenized_text'], vector_size=word_embedding_vector_size, window=5, min_count=1, workers=4)

In [19]:
# Training the word2vec model: provide the total number of sentences to train from, since we train from the same data used to build
# vocab, we can use model.corpus_count
word2vec_model.train(df['tokenized_text'],total_examples=word2vec_model.corpus_count,epochs=5)

(1603323, 1636945)

In [20]:
# Function that takes a sentence in tokenized form and converts each token to its corresponding
# embedding using word2vec model defined earlier
def get_word_embeddings(tokens):
    embeddings = []
    for token in tokens:
        if token in word2vec_model.wv:
            embeddings.append(word2vec_model.wv[token])
    return embeddings

In [21]:
# Get the word embeddings for each row
df['word_embeddings'] = df['tokenized_text'].apply(get_word_embeddings)

#### Saving dataset

In [22]:
# Save the vectorized dataset in csv format
df.to_csv("vectorized_dataset.csv", index=False)

The file vectorized_dataset.csv contains the article data with column headings: (article_id, raw unmodified text, category, tokenized text, Word2Vec vectored tokenized text)

The Word2Vec Vectored Tokenized text is contains the vectored form of each word in a sentence. One can preprocess the dataset further for their task specific requirements. Aggregating the vectored representation for sentence classification, for instance.