## Importing Dependencies

In [None]:
import kaggle
import spacy
# Load the large English pipeline with the dependency parser and the
# named-entity recogniser switched off; the cells below only read token text,
# stop-word flags and lemmas.
nlp = spacy.load('en_core_web_lg', disable=['parser', 'ner'])
import re
import zipfile
import os
import csv
import concurrent
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
from multiprocessing import Pool, cpu_count

## Downloading and loading dataset

In [None]:
# Shell escape: fetch the dataset archive with the Kaggle CLI, saving it under
# the local `Datasets` directory (the next cell unpacks it from there).
! kaggle datasets download -p 'Datasets' 'miguelaenlle/massive-stock-news-analysis-db-for-nlpbacktests'

In [None]:
# Unpack the downloaded archive into `Datasets`, then delete the archive
# itself so only the extracted files remain.
archive_path = 'Datasets/massive-stock-news-analysis-db-for-nlpbacktests.zip'

with zipfile.ZipFile(archive_path, mode='r') as zip_ref:
    zip_ref.extractall(path='Datasets')

os.remove(archive_path)

In [None]:
# Read the three extracted CSV files, in this order, into df1, df2 and df3.
df1, df2, df3 = map(
    pd.read_csv,
    (
        'Datasets/analyst_ratings_processed.csv',
        'Datasets/raw_analyst_ratings.csv',
        'Datasets/raw_partner_headlines.csv',
    ),
)

## Dropping irrelevant columns and merging the dataset

In [None]:
# Remove the 'Unnamed: 0' column from each of the three frames.
df1, df2, df3 = (
    frame.drop(columns='Unnamed: 0') for frame in (df1, df2, df3)
)

In [None]:
# Keep only the headline text of each frame ('title' in df1, 'headline' in
# df2 and df3) and stack the three columns into a single Series.
series = pd.concat([df1['title'], df2['headline'], df3['headline']])

# The full frames are not used again; release them.
del df1, df2, df3

print(len(series))

## Text Preprocessing and Cleaning

In [None]:
# Discard repeated headlines (the first occurrence is the one kept) and show
# how many remain.
series = series.drop_duplicates(keep='first')
print(len(series))

In [None]:
# Turn empty-string headlines into missing values, then display the number of
# missing entries.
series = series.replace(to_replace='', value=pd.NA)
series.isna().sum()

In [None]:
# Drop missing headlines before casting to str: `astype(str)` would otherwise
# turn every missing value into a literal string such as 'nan' or '<NA>',
# which would then be processed downstream as if it were a real headline.
series = series.dropna().astype(str)

In [None]:
# Normalise the text: trim surrounding whitespace first, then lower-case.
series = series.str.strip()
series = series.str.lower()

In [None]:
# Delete every character that is not a word character, whitespace, or one of
# & $ + - /
series = series.str.replace(pat=r'[^&$+\-\/\w\s]', repl='', regex=True)

In [None]:
# Preview the first 20 cleaned headlines.
series.iloc[:20]

## Stopword Removal

In [None]:
def remove_stopwords(chunk):
    """Strip stop words from every text in ``chunk``.

    Each text is run through the module-level ``nlp`` pipeline; tokens flagged
    ``is_stop`` are discarded and the remaining token texts are joined with a
    single space.

    Args:
        chunk: Iterable of strings accepted by ``nlp.pipe``.

    Returns:
        A new ``pd.Series`` of the filtered texts, in input order. It is built
        from a plain list, so it does not carry over the index of ``chunk``.
    """
    filtered = [
        ' '.join(tok.text for tok in parsed if not tok.is_stop)
        for parsed in nlp.pipe(chunk)
    ]
    return pd.Series(filtered)

# Run stop-word removal over fixed-size chunks of `series` on a thread pool.
#
# Imported explicitly here: a bare `import concurrent` does not load the
# `futures` submodule, so `concurrent.futures` would only resolve if some
# other library happened to import it first.
import concurrent.futures

num_threads = 32

chunk_size = 1000
# .iloc makes it explicit that the chunks are cut by position, not by label.
series_chunks = [series.iloc[i:i+chunk_size] for i in range(0, len(series), chunk_size)]

with concurrent.futures.ThreadPoolExecutor(max_workers=num_threads) as executor:
    # executor.map yields results in submission order, so the chunks are
    # reassembled below in their original order.
    processed_chunks = list(tqdm(executor.map(remove_stopwords, series_chunks), total=len(series_chunks)))

# Every processed chunk comes back with its own 0..n-1 index; ignore_index
# gives the combined Series one continuous index instead of repeated labels.
series = pd.concat(processed_chunks, ignore_index=True)

In [None]:
# Preview the first 10 headlines after stop-word removal.
series.iloc[:10]

In [None]:
# Persist the stop-word-free headlines (index included) so the following
# section can reload them from disk.
series.to_csv(path_or_buf='/content/drive/MyDrive/Datasets/stopword_removed.csv')

## Tokenizing Headlines in Batches

In [None]:
# Reload the stop-word-free headlines written by the previous section.
#
# keep_default_na=False reads the text verbatim. With the default NA handling,
# empty fields (headlines that consisted only of stop words) and literal
# headlines such as "null" or "n/a" are parsed as NaN, and the str cast below
# would then turn them into the word 'nan'.
df = pd.read_csv('/content/drive/MyDrive/Datasets/stopword_removed.csv', keep_default_na=False)
df = df.drop('Unnamed: 0', axis=1)
df.columns = ['title']
# Select the column explicitly: DataFrame.squeeze() collapses a one-row frame
# to a scalar instead of a Series.
series = df['title'].astype('str')
series.name = None
# Headlines left blank by stop-word removal have nothing to tokenize.
series = series[series.str.strip() != '']

del df

series.head()

In [None]:
def tokenize_texts(texts):
    """Run ``texts`` through ``nlp.pipe`` and build one list per resulting doc.

    For each doc, the texts of all tokens that are not pure whitespace are
    concatenated without a separator; the doc's list contains that same
    concatenated string once per token in the doc (whitespace tokens included
    in the count).

    NOTE(review): the caller in this notebook passes a single headline string,
    so ``nlp.pipe`` presumably treats each character as its own text and
    ``combine_tokens`` relies on the resulting shape - confirm before changing.

    Args:
        texts: Iterable handed to ``nlp.pipe`` (batch size 1000).

    Returns:
        A list with one list of strings per doc, in pipeline order.
    """
    per_doc = []
    for parsed in nlp.pipe(texts, batch_size=1000):
        joined = ''.join(tok.text for tok in parsed if tok.text.strip())
        per_doc.append([joined] * len(parsed))
    return per_doc

In [None]:
# Cut the headlines into 18 consecutive partitions of near-equal size, then
# release the full Series.
partitions = np.array_split(series, indices_or_sections=18)

del series

In [None]:
# Tokenize one partition of headlines on a thread pool.
#
# This used to sit inside `for i in range(18)` without ever using `i`, so the
# identical partition was tokenized 18 times and only the last (identical)
# result was kept. It now runs once; `processed_partition` ends up the same.
#
# NOTE(review): only this one partition reaches the cells below. Presumably
# the other partitions are handled by changing `partition_index` and
# re-running (write_to_csv appends to an existing file) - confirm.
#
# Imported explicitly: a bare `import concurrent` does not load the `futures`
# submodule.
import concurrent.futures

partition_index = 17

with concurrent.futures.ThreadPoolExecutor(max_workers=32) as executor:
    processed_partition = list(tqdm(executor.map(tokenize_texts, partitions[partition_index]), total=len(partitions[partition_index]), desc=f'Batch {partition_index} Tokenization'))

In [None]:
def combine_tokens(tokenized_headlines):
    """Merge consecutive non-empty pieces of each headline into words.

    Every element of a headline is indexed with ``[0]``. Consecutive elements
    whose first item is a non-empty string are concatenated into one word; an
    element whose first item is ``''`` ends the current word and is itself
    dropped.

    Args:
        tokenized_headlines: Iterable of headlines, each an iterable of
            indexable elements (for example one-item lists of strings).

    Returns:
        A list with one list of merged words per headline, in input order.

    Raises:
        IndexError: If an element is empty and cannot be indexed with ``[0]``.
    """

    def _merge(pieces):
        # Collect the fragments of the word being built; flush on a separator.
        words = []
        fragments = []
        for piece in pieces:
            head = piece[0]
            if head == '':
                if fragments:
                    words.append(''.join(fragments))
                    fragments = []
            else:
                fragments.append(head)
        if fragments:
            words.append(''.join(fragments))
        return words

    return [_merge(headline) for headline in tokenized_headlines]

# Rebuild whole words for the tokenized partition and show the last headline.
combined_headlines = combine_tokens(tokenized_headlines=processed_partition)
combined_headlines[-1]

In [None]:
def write_to_csv(data, filename):
    """Write ``data`` to ``filename`` as tab-separated rows.

    If ``filename`` already exists the rows are appended to it; otherwise the
    file is created.

    Args:
        data: Iterable of rows, each an iterable of fields.
        filename: Path of the file to write.
    """
    if os.path.exists(filename):
        open_mode = 'a'
    else:
        open_mode = 'w'

    with open(filename, open_mode, newline='') as handle:
        csv.writer(handle, delimiter='\t').writerows(data)

# Store the combined tokens; rows are appended if the file already exists.
write_to_csv(data=combined_headlines, filename='/content/drive/MyDrive/Datasets/tokens.csv')

In [None]:
def read_from_csv(filename):
    """Read a tab-separated file into a list of rows.

    Args:
        filename: Path of the file to read.

    Returns:
        A list with one list of string fields per line of the file.
    """
    with open(filename, 'r', newline='') as handle:
        return list(csv.reader(handle, delimiter='\t'))

# Read the stored tokens back from disk and show the last row.
loaded_data = read_from_csv(filename='/content/drive/MyDrive/Datasets/tokens.csv')
loaded_data[-1]

In [None]:
# Number of rows read back from the tokens file.
len(loaded_data)

## Lemmatization

In [None]:
def lemmatize_headlines(headline):
    """Return the lemma of every token in one headline.

    The headline's tokens are joined with single spaces, run through the
    module-level ``nlp`` pipeline, and each resulting token's ``lemma_`` is
    collected.

    Args:
        headline: Iterable of token strings.

    Returns:
        A list of lemma strings, one per token produced by ``nlp``.
    """
    sentence = " ".join(headline)
    parsed = nlp(sentence)
    return [tok.lemma_ for tok in parsed]

# Lemmatize every headline in parallel, one worker process per CPU core.
num_cores = cpu_count()

# The with-block tears the worker processes down even if a task raises;
# previously an exception while collecting results skipped close()/join() and
# left the workers running. All results are consumed inside the block, so no
# work is pending when the pool is terminated on exit.
with Pool(num_cores) as pool:
    # imap keeps the input order. chunksize > 1 hands tasks to workers in
    # batches rather than one at a time, which matters for very long inputs;
    # the results themselves are unchanged.
    lemmatized_headlines = list(tqdm(pool.imap(lemmatize_headlines, loaded_data, chunksize=256), total=len(loaded_data), desc="Lemmatizing Headlines"))

In [None]:
# Store the lemmatized headlines; rows are appended if the file already exists.
write_to_csv(data=lemmatized_headlines, filename='/content/drive/MyDrive/Datasets/lemmatized_headlines.csv')