# Arash Hajian nezhad | DataCoLab Interview Task

### Imports

In [1]:
import os
import ast
import json

import optuna
import numpy as np
import pandas as pd

from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score, train_test_split

### Task 1: Add full text `body` and `start` / `end` times of the transcriptions

#### Read transcriptions json files
Insert necessary data into a dictionary for later usage in the notebook

In [2]:
transcriptions_path = 'data/transcriptions/'
json_files = [json_file for json_file in os.listdir(transcriptions_path)]

transcriptions = {}
for json_file in json_files:
    with open(os.path.join(transcriptions_path, json_file), 'r') as j:
        source_video_id = json_file.split('.')[0]  # for removeing '.json' from the file name and leaving 
        current_data = json.loads(j.read())

        transcriptions[source_video_id] = {'text_body': current_data['text'], 'words_data': current_data['words']}

#### Import the dataframe that needs to be filled

In [3]:
df = pd.read_csv('data/to_fill.csv')
df.head()

Unnamed: 0,first_words,last_words,source_video_id
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell ph...",18246
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean. ocean.,12387
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conserva...",16859
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859


#### Some data cleaning
After some personal data exploration, I have noticed that some of the sentences in the `last_words` column have a reduntant empty `''` at their ends. These will prove to be problematic if left unaccounted for, hence the first thing we will clean from the data is this.

Secondly, some sentences in the mentioned column end with a redunant word, which is not found anywhere in the transcriptions. This word is the same as its previous word in the sentence, for example `all the way down to the ocean. ocean.` which has an extra `ocean.` in the end. This is not found anywhere in the transcriptions and must be dealt with, which we do :)

In [4]:
def data_cleaning(text: str) -> str:
    """
    Function that takes strings and cleans them specifically for this task.
    
    Args:
        text: a string that needs to be processed.
    
    Returns
        returns a processed string.
    """
    listed_text = text.split(' ')

    if listed_text[-1] == '':
        listed_text.pop()

    if listed_text[-1] == listed_text[-2]:
        listed_text.pop()
    
    return ' '.join(listed_text)


df['last_words'] = df['last_words'].apply(data_cleaning)

#### Main functions
These functions are used to do the main job of fetching the whole paragraph using the now-cleaned dataframe.

In [5]:
def find_start_and_end_indices(text_body: list, first_words: list, last_words: list) -> tuple:
    """
    Function for finding the starting and ending indices of the paragraph
    in the main text body using the `first_words` and `last_words` of it.

    Args:
        text_body: a list of strings made from the main text body.
        first_words: a list of strings made from the `first_words` column.
        last_words: a list of strings made from the `last_words` column.

    Returns:
        a tuple made of the starting and ending indices in the main text
        body.
    """
    possible_starts = [i for i, word in enumerate(text_body) if word == first_words[0]]

    for starting_index in possible_starts:
        if text_body[starting_index:starting_index + len(first_words)] == first_words:
            break

    possible_ends = [i + starting_index for i, word in enumerate(text_body[starting_index:]) if word == last_words[0]]

    last_words_length = len(last_words)

    for ending_index in possible_ends:
        if text_body[ending_index:ending_index + last_words_length] == last_words:
            break

    return starting_index, ending_index + last_words_length  # last_words_length is added to return the end of the paragraph index.


def get_full_text_and_times(row: pd.DataFrame) -> pd.DataFrame:
    """
    Function for fetching the whole paragraph, start time and end time using
    the `first_words` and `last_words` of it.

    Args:
        row: a row of a pandas dataframe.

    Returns:
        a pandas dataframe row cotaining three columns of `body`, `start` and `end`.
    """
    # getting the body first
    transcription_id = str(row['source_video_id'])
    current_transcription = transcriptions[transcription_id]

    listed_text_body = current_transcription['text_body'].split(' ')
    listed_first_words = row['first_words'].split(' ')
    listed_last_words = row['last_words'].split(' ')

    start_index, end_index = find_start_and_end_indices(listed_text_body, listed_first_words, listed_last_words)

    # now getting the start and end times
    words_data = current_transcription['words_data']

    start_time, end_time = words_data[start_index]['start'], words_data[end_index]['end']

    # collating the fetched data
    row['body'] = ' '.join(listed_text_body[start_index:end_index])
    row['start'] = start_time
    row['end'] = end_time

    return row

#### Applying the functions
Now we fetch the `body`, `start` and `end` columns.

In [6]:
df = df.apply(get_full_text_and_times, axis=1)

Checking the dataframe.

In [7]:
df.head()

Unnamed: 0,first_words,last_words,source_video_id,body,start,end
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell ph...",18246,Well knew. This morning police need your help ...,464928,505910
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean.,12387,a call. San Francisco firefighters rescued a m...,359020,385526
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conserva...",16859,"Paul. Meanwhile, the state set a record in ene...",60704,101238
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246,Emergency crews in Florida continue to search ...,505290,534958
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859,But even though the state never ordered rollin...,100910,283606


### Task 2: Predicting the `topics` of the paragraphs

#### Loading the `stories` dataframe
This dataframe is used for training model(s) for predicting the `topics` of the now-fetched paragraphs

Note: I have trained a BERT model for predicting the `topics` in another notebook (colab) which is
included in the repo, but it has not yielded good results, as the `topics` labels are very sparse
(many zeros and few ones), hence the model will always try to output zeros, as it lowers the error
anyways. I have attempted to implement `positional weighting` technique for usage in BCEWithLogitsLoss
but still no good results were observed, so I sticked with vanilla `TF-IDF` method and RandomForest.

In [8]:
stories = pd.read_csv('data/stories.csv')
stories.head()

Unnamed: 0,body,topic
0,,['39822b5f-e37e-43e8-b997-7142fe55c3ea']
1,,['0d817400-3f5d-41e0-929c-c31fdbe75d31']
2,,['83a09c6b-5f2f-421f-ae50-b38acca7e008']
3,,['6fbf954a-03f9-4782-a65f-783271c9c447']
4,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."


#### Data cleaning
We observe that some rows have empty bodies, but they are not `NaN` in a usual way,
so we check what character they are and attempt to delete these rows using that.

In [9]:
stories['body'][0], stories['body'][0]

(' ', ' ')

Deleting the empty rows.

In [10]:
stories = stories[stories['body'] != ' '].reset_index().drop(['index'], axis=1)
stories.head()

Unnamed: 0,body,topic
0,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."
1,news now out of North Hollywood. A 14 yearold ...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
2,homelessness his city's greatest failure. That...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '74e2..."
3,Minneapolis police officer Kim Potter guilty o...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5..."
4,Judy an update now to the wildfires that wiped...,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9a06..."


#### Getting all topics
Our data is now clean! We will move on with the task. First we will see how many unique topics we have.
We observe that the data in the `topic` column is made of strings of lists of strings (!) and we will read
them using the `literal_eval` from python's built-in `ast` library.

In [11]:
topics = set()
for i in stories['topic']:
    current_topics = ast.literal_eval(i)
    for topic in current_topics:
        if topic not in topics:
            topics.add(topic)

We have the topics now, but they are in a very strange format. We will turn them
in the usual numeric format, while also keeping the reverse of them for adding them
later on to the dataframe in the same format as the `stories` dataframe.

In [12]:
topic_to_label = {topic: label for label, topic in enumerate(topics)}
label_to_topic = {label: topic for label, topic in enumerate(topics)}

#### Adding the `labels` column
We add this column in a multilabel-encoded way, while also keeping it in string format
for later usage in the colab notebook by `ast.literal_eval` .

In [13]:
def generate_labels(row: pd.DataFrame) -> str:
    """
    Function for generating multilabels-encodings from the `topic` column
    in the `stories` dataframe.

    Args:
        row: a row of a pandas dataframe.
    
    Returns:
        a stringified multilabels-endcoding.
    """
    current_topics = ast.literal_eval(row['topic'])

    labels = np.zeros(len(topics), dtype=int)
    np.put(labels, [topic_to_label[topic] for topic in current_topics], [1] * len(current_topics))

    output_in_str_format = '[' + str(labels[0])
    for num in labels[1:]:
        output_in_str_format += ', ' + str(num)
    
    return output_in_str_format + ']'


stories['labels'] = stories.apply(generate_labels, axis=1)

I had saved the dataset for exporting to the other notebook for BERT training, which is now unnecessary.

In [14]:
# stories.drop(['topic'], axis=1).to_csv('stories_processed.csv', index=False)

I had also saved the processed `to_fill` dataframe for usage in the BERT notebook on colab, which is
now also unnecessary.

In [15]:
# df.to_csv('to_fill_proccessed.csv', index=False)

#### Check the dataframe.

In [16]:
stories.head()

Unnamed: 0,body,topic,labels
0,hello and welcome to BBC News a woman who gave...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
1,news now out of North Hollywood. A 14 yearold ...,['9ff54ded-904b-4e0c-85ce-a3617f5cb913'],"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"
2,homelessness his city's greatest failure. That...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '74e2...","[0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0]"
3,Minneapolis police officer Kim Potter guilty o...,"['83a09c6b-5f2f-421f-ae50-b38acca7e008', '9ff5...","[0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0]"
4,Judy an update now to the wildfires that wiped...,"['9ff54ded-904b-4e0c-85ce-a3617f5cb913', '9a06...","[0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0]"


#### Preparing the dataset for training a RandomForestClassifier
Note: We need to do a seperate vectorization on the split dataset.

In [17]:
X = stories['body']
y = [ast.literal_eval(label) for label in stories['labels'].values]

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.85, random_state=42)  # setting random state for reproducibility

vectorizer_whole = TfidfVectorizer(lowercase=True, stop_words='english')
vectorizer_train = TfidfVectorizer(lowercase=True, stop_words='english')

X = vectorizer_whole.fit_transform(X)

X_train = vectorizer_train.fit_transform(X_train)
X_test = vectorizer_train.transform(X_test)

#### Finding the optimized hyperparameters
This is done by maximizing the mean `Cross Validation` score using `Optuna`.

In [18]:
# This is a function used only by Optuna for optimizing the ML Model Hyperparameters
# It is in a typical format for Optuna
def objective(trial):
    hyperparamaters = {
        'n_estimators': trial.suggest_int('rf_num_estimators', 300, 500),
        'max_depth': trial.suggest_int('rf_max_depth', 5, 15),
        'min_samples_split': trial.suggest_int('rf_min_samples_split', 10, 30),
    }

    model = RandomForestClassifier(**hyperparamaters)

    score = cross_val_score(model, X, y, cv=3, n_jobs=-1)
    return score.mean()


study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=50)

[32m[I 2023-05-12 21:52:24,711][0m A new study created in memory with name: no-name-a9307132-0dc3-464b-a27b-57dc063db562[0m
[32m[I 2023-05-12 21:52:33,611][0m Trial 0 finished with value: 0.054540874891433534 and parameters: {'rf_num_estimators': 320, 'rf_max_depth': 10, 'rf_min_samples_split': 11}. Best is trial 0 with value: 0.054540874891433534.[0m
[32m[I 2023-05-12 21:52:39,559][0m Trial 1 finished with value: 0.06832370219148874 and parameters: {'rf_num_estimators': 300, 'rf_max_depth': 11, 'rf_min_samples_split': 29}. Best is trial 1 with value: 0.06832370219148874.[0m
[32m[I 2023-05-12 21:52:46,480][0m Trial 2 finished with value: 0.04580617618017366 and parameters: {'rf_num_estimators': 463, 'rf_max_depth': 9, 'rf_min_samples_split': 27}. Best is trial 1 with value: 0.06832370219148874.[0m
[32m[I 2023-05-12 21:52:52,003][0m Trial 3 finished with value: 0.03862400894612504 and parameters: {'rf_num_estimators': 338, 'rf_max_depth': 8, 'rf_min_samples_split': 27}. Be

#### Getting the hyperparameters ready.

In [19]:
best_hyperparameters = {
    'n_estimators': study.best_params.pop('rf_num_estimators'),
    'max_depth': study.best_params.pop('rf_max_depth'),
    'min_samples_split': study.best_params.pop('rf_min_samples_split'),
}

#### Fitting the model to the data.
We will do this twice, once with the optimized hyperparameters, and
once with the vanilla hyperparameters, as there is a chance that the
default ones may perform better.

In [22]:
model_vanilla = RandomForestClassifier()
model_optimized = RandomForestClassifier(**best_hyperparameters)
model_vanilla.fit(X_train, y_train)
model_optimized.fit(X_train, y_train)

#### Getting models` predictions.

In [23]:
y_pred_vanilla = model_vanilla.predict(X_test)
y_pred_optimized = model_optimized.predict(X_test)

#### Calculating models` accuracies
This is done by counting the number of true predictions (whether negatives or positives)
divided by the number of all labels.

In [24]:
number_of_all_labels = len(y_test) * 15  # as we have 15 labels in each multilabel-encoding

accuracy_vanilla = (y_test == y_pred_vanilla).sum() / number_of_all_labels
accuracy_optimized = (y_test == y_pred_optimized).sum() / number_of_all_labels

print(f'Accuracy of Vanilla RF: {accuracy_vanilla:.4f}')
print(f'Accuracy of Optimized RF: {accuracy_optimized:.4f}')

Accuracy of Vanilla RF: 0.9423
Accuracy of Optimized RF: 0.9148


#### Training the model on the whole dataset
Weirdly enough, we see that the optimized version has a worst performance in accuracy!
So we will go with the default RandomForestClassifier settings and train it on the whole dataset.

In [25]:
model = RandomForestClassifier()
model.fit(X, y)

#### Predicting the topics.

In [26]:
def predict_topic(row: pd.DataFrame) -> list[str]:
    """
    Function for predicting the `topic`s of a body of text.

    Args:
        row: a row of a pandas dataframe.
    
    Returns:
        a list of weird-formatted topics.
    """
    text = [row['body']]
    text = vectorizer_whole.transform(text)

    topics = model.predict(text)[0]

    topics = str([str(label_to_topic[i]) for i, t in enumerate(topics) if t == 1])

    return topics

#### Adding the `topics` column.

In [27]:
df['topics'] = df.apply(predict_topic, axis=1)

#### Checking the final dataframe.

In [28]:
df

Unnamed: 0,first_words,last_words,source_video_id,body,start,end,topics
0,Well knew. This morning police need your help,"gunpoint, beating him and stealing his cell ph...",18246,Well knew. This morning police need your help ...,464928,505910,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
1,a call. San Francisco firefighters rescued a man,all the way down to the ocean.,12387,a call. San Francisco firefighters rescued a m...,359020,385526,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
2,"Paul. Meanwhile, the state set a record in","night through conservation, some 4000 conserva...",16859,"Paul. Meanwhile, the state set a record in ene...",60704,101238,['83a09c6b-5f2f-421f-ae50-b38acca7e008']
3,Emergency crews in Florida continue to search for,"in Florida to more than 850,000 homes.",18246,Emergency crews in Florida continue to search ...,505290,534958,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
4,But even though the state never ordered rolling,feel since their power got cut out needlessly.,16859,But even though the state never ordered rollin...,100910,283606,[]
5,"aid. And today, president Joe Biden and first",to view the destruction caused by Hurricane Ian.,18246,"aid. And today, president Joe Biden and first ...",546494,593818,['83a09c6b-5f2f-421f-ae50-b38acca7e008']
6,"In the last month, there have been numerous",are necessary to crack down on those hackers.,18246,"In the last month, there have been numerous da...",614910,700574,[]
7,and the warriors are playing the Boston Celtics,that. We'll see if they get it tonight.,12387,and the warriors are playing the Boston Celtic...,419994,649502,['b49207eb-96eb-4b73-b534-adc0ef85022a']
8,And San Leandro police searching for the person,footage to try to piece together more informat...,16859,And San Leandro police searching for the perso...,578612,619638,['9ff54ded-904b-4e0c-85ce-a3617f5cb913']
9,The updated Bivalent Coronavirus booster shot ...,on their vaccinations getting severe illness f...,16859,The updated Bivalent Coronavirus booster shot ...,619310,660654,['96326734-fd82-4350-b45c-513e7eb9147c']


#### Saving the final dataframe.

In [29]:
df.to_csv('to_fill_finalized.csv', index=False)