In [90]:
import pandas as pd
from nltk.tokenize import TweetTokenizer 
import re
import string

This notebook handles the pre-processing of the dataset: 
https://www.kaggle.com/datasets/hsankesara/medium-articles  

It generates 1 .txt files with cleaned content named: `articles.txt`. 

# Methods

In [None]:
def clean_line(line):
    tokenizer = TweetTokenizer()
    line = line.replace("’", "'")
    allowed_punctuation_char = "'"
    allowed_punctuation_word = "'" + "#" + "-"
    
    # Modify regex to preserve periods in abbreviations like 'U.S.', 'P.M.', etc.
    # 1. Substitute periods in known abbreviations with a placeholder
    line = re.sub(r'(?<=\b[A-Z])\.(?=[A-Z]\b)', '', line)  # Remove periods in multi-letter abbreviations
    
    line = re.sub(rf'[^\w\s{allowed_punctuation_word}{allowed_punctuation_char}(\w+\'\w+)]|_|\d', ' ',  line)
    
    words = tokenizer.tokenize(line)
    result = ''
    for i, word in enumerate(words):
        if word in allowed_punctuation_char:
            result = result.rstrip()
            result += word.lower() + ' '
        elif word not in string.punctuation:
            translated_word = word.lower().replace('.', '')
            if len(translated_word) == 0:
                continue
            result += translated_word + ' '
    return result.rstrip()

In [None]:
def write_sentences_to_file(sentences, file_path):
    try:
        with open(file_path, 'w') as file:
            for sentence in sentences:
                file.write(sentence + '\n')
        print(f"Sentences successfully written to {file_path}")
    except Exception as e:
        print(f"An error occurred while writing to the file: {e}")

# Data processing

In [91]:
file_path = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/articles.csv'
df = pd.read_csv(file_path)


## Extact column of interest

In [92]:
df = df[['text']]
df

Unnamed: 0,text
0,"Oh, how the headlines blared:\nChatbots were T..."
1,If you’ve ever found yourself looking up the s...
2,Machine learning is increasingly moving from h...
3,If your understanding of A.I. and Machine Lear...
4,Want to learn about applied Artificial Intelli...
...,...
332,Click here to share this article on LinkedIn »...
333,These are my opinions on where deep neural net...
334,Everyone who has been remotely tuned in to rec...
335,One of the biggest misconceptions around is th...


## Transform the df into a list

In [93]:
sentences_list = df['text'].tolist()        
sentences_list = [text.replace('\n', ' ') for text in sentences_list]

In [94]:
sentences_list[0]

"Oh, how the headlines blared: Chatbots were The Next Big Thing. Our hopes were sky high. Bright-eyed and bushy-tailed, the industry was ripe for a new era of innovation: it was time to start socializing with machines. And why wouldn’t they be? All the road signs pointed towards insane success. At the Mobile World Congress 2017, chatbots were the main headliners. The conference organizers cited an ‘overwhelming acceptance at the event of the inevitable shift of focus for brands and corporates to chatbots’. In fact, the only significant question around chatbots was who would monopolize the field, not whether chatbots would take off in the first place: One year on, we have an answer to that question. No. Because there isn’t even an ecosystem for a platform to dominate. Chatbots weren’t the first technological development to be talked up in grandiose terms and then slump spectacularly. The age-old hype cycle unfolded in familiar fashion... Expectations built, built, and then..... It all k

## Iterate over the sentences and clean them

In [99]:
cleaned_sentences = []
for sentence in sentences_list:
    new_sentence = clean_line(sentence)
    cleaned_sentences.append(new_sentence)

In [101]:
cleaned_sentences[0]

"oh how the headlines blared chatbots were the next big thing our hopes were sky high bright eyed and bushy tailed the industry was ripe for a new era of innovation it was time to start socializing with machines and why wouldn't they be all the road signs pointed towards insane success at the mobile world congress chatbots were the main headliners the conference organizers cited an overwhelming acceptance at the event of the inevitable shift of focus for brands and corporates to chatbots' in fact the only significant question around chatbots was who would monopolize the field not whether chatbots would take off in the first place one year on we have an answer to that question no because there isn't even an ecosystem for a platform to dominate chatbots weren't the first technological development to be talked up in grandiose terms and then slump spectacularly the age old hype cycle unfolded in familiar fashion expectations built built and then it all kind of fizzled out the predicted par

## Store the cleaned sentences

In [103]:
output_file = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/articles.txt'
write_sentences_to_file(cleaned_sentences, output_file)

Sentences successfully written to /Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/articles.txt
