In [9]:
import string
from nltk.tokenize import TweetTokenizer 
import re

This notebook handles the pre-processing of the dataset used in a course assignment, which consists of news articles.

It generates one `.txt` file with the cleaned content, named `news.txt`.

# Methods

In [2]:
def clean_line(line: str) -> str:
    """Normalise one line of raw text into lowercase, space-separated tokens.

    Steps, in order:
      1. Replace the mojibake sequence "â€™" with a plain apostrophe.
      2. Drop a period that sits between two stand-alone capital letters,
         so "U.S." becomes "US." (the trailing period is removed in step 3).
      3. Replace every character that is not a word character, whitespace,
         apostrophe, '#' or '-' with a space; underscores and digits are
         replaced with a space as well.
      4. Tokenise with ``TweetTokenizer``, lowercase each token, skip tokens
         found in ``string.punctuation``, and attach a stand-alone apostrophe
         to the token that precedes it.

    Args:
        line: Raw text to clean.

    Returns:
        The surviving tokens joined by single spaces ("" if none survive).
    """
    tokenizer = TweetTokenizer()
    line = line.replace("â€™", "'")

    # Collapse periods inside initialisms such as 'U.S.' or 'P.M.' so the
    # letters stay together as one token instead of being split apart below.
    line = re.sub(r'(?<=\b[A-Z])\.(?=[A-Z]\b)', '', line)

    # Keep only word characters, whitespace, apostrophes, '#' and '-'.
    # The hyphen is escaped on purpose: unescaped between '#' and "'" it forms
    # the range '#'..."'" and hyphens would be stripped instead of preserved.
    line = re.sub(r"[^\w\s'#\-]|_|\d", ' ', line)

    cleaned = []
    for word in tokenizer.tokenize(line):
        if word == "'":
            # A lone apostrophe is glued to the previous token ("rock '" ->
            # "rock'"); with no previous token it stands on its own.
            if cleaned:
                cleaned[-1] += word
            else:
                cleaned.append(word)
        elif word not in string.punctuation:
            cleaned.append(word.lower())
    return ' '.join(cleaned)

In [3]:
def extract_sentences_and_clean_them(file_path):
    """Read a text file and return one cleaned string per input line.

    The file is decoded as UTF-8 and undecodable bytes are dropped
    (``errors='ignore'``). Each line is stripped of surrounding whitespace
    and passed through ``clean_line``; the results are returned in file order.

    Args:
        file_path: Path of the text file to read.

    Returns:
        A list with one cleaned string for every line in the file.
    """
    with open(file_path, 'r', encoding='utf-8', errors='ignore') as source:
        return [clean_line(raw_line.strip()) for raw_line in source]

In [4]:
def write_sentences_to_file(sentences, file_path):
    """Write each sentence to ``file_path`` on its own line, encoded as UTF-8.

    Any exception raised while opening or writing is caught and reported with
    a printed message rather than propagated; on success a confirmation
    message is printed.

    Args:
        sentences: Iterable of strings to write.
        file_path: Destination path; an existing file is overwritten.
    """
    try:
        # Explicit UTF-8 so the output does not depend on the platform's
        # default encoding; the cleaned text may contain non-ASCII letters.
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(sentence + '\n' for sentence in sentences)
        print(f"Sentences successfully written to {file_path}")
    except Exception as e:
        print(f"An error occurred while writing to the file: {e}")

# Data processing

In [5]:
# Raw input corpus, read line by line by extract_sentences_and_clean_them.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
file_path = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/news.2010.en.shuffled'


In [6]:
# Clean every line of the corpus; the whole result is held in memory as a list.
sentences = extract_sentences_and_clean_them(file_path)

In [7]:
# Sanity check: number of cleaned lines (one per line of the input file).
len(sentences)

6797225

In [8]:
# Persist the cleaned lines, one per line of the output file.
# NOTE(review): absolute, machine-specific path — consider deriving it from a
# configurable data directory so the notebook runs on other machines.
output_file = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/news.txt'
write_sentences_to_file(sentences, output_file)

Sentences successfully written to /Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/news.txt
