In [1]:
import pandas as pd
from nltk.tokenize import TweetTokenizer 
import re
import string

This notebook handles the pre-processing of the dataset: 
https://www.kaggle.com/datasets/sbhatti/news-summarization  

It generates two `.txt` files with cleaned content: `news_summarization.txt` and `news_content.txt`. The first contains the summaries of the news articles, and the second contains the full article content. 

# Methods

In [2]:
# Punctuation that survives cleaning: apostrophes (contractions/possessives),
# '#' (hashtags) and '-' (hyphenated compounds).
_ALLOWED_PUNCTUATION = "'#-"

# A period sitting between two single capital letters, e.g. the first '.' in 'U.S.'.
_ABBREVIATION_PERIOD_RE = re.compile(r'(?<=\b[A-Z])\.(?=[A-Z]\b)')

# Any character that is not a word character, whitespace or allowed punctuation,
# plus underscores and digits (both of which \w would otherwise let through).
# re.escape keeps '-' literal: written unescaped after '#', it is parsed as the
# start of a character range, which silently drops hyphens and keeps '$', '%', '&'.
_DISALLOWED_CHARS_RE = re.compile(rf"[^\w\s{re.escape(_ALLOWED_PUNCTUATION)}]|_|\d")


def clean_line(line, tokenizer=None):
    """Normalise one line of text into lowercase, space-separated tokens.

    Steps, in order:
      1. Typographic apostrophes (’) are replaced by ASCII apostrophes.
      2. The period between two single capital letters is removed, so 'U.S.'
         collapses to 'US' once the remaining punctuation is stripped.
      3. Digits, underscores and every punctuation character other than
         ', # and - are replaced by a space.
      4. The line is tokenised; tokens that consist only of punctuation are
         dropped, a free-standing apostrophe is attached to the preceding
         token, and everything else is lowercased.

    Args:
        line: The text to clean (a str).
        tokenizer: Optional object exposing ``tokenize(str) -> list[str]``.
            Defaults to a new ``TweetTokenizer``; pass an instance to reuse
            it across many calls.

    Returns:
        The cleaned tokens joined by single spaces ('' if nothing is left).
    """
    if tokenizer is None:
        tokenizer = TweetTokenizer()

    line = line.replace("’", "'")
    line = _ABBREVIATION_PERIOD_RE.sub('', line)
    line = _DISALLOWED_CHARS_RE.sub(' ', line)

    cleaned_tokens = []
    for token in tokenizer.tokenize(line):
        if token == "'":
            # Glue a lone apostrophe onto the previous token ("dogs '" -> "dogs'").
            if cleaned_tokens:
                cleaned_tokens[-1] += token
            else:
                cleaned_tokens.append(token)
        elif token not in string.punctuation:
            # Substring test on purpose: drops tokens made only of punctuation.
            cleaned_tokens.append(token.lower())
    return ' '.join(cleaned_tokens)

In [3]:
def write_sentences_to_file(sentences, file_path):
    """Write each sentence on its own line to ``file_path``.

    An existing file at ``file_path`` is overwritten. The file is always
    encoded as UTF-8 so the result does not depend on the platform's default
    encoding.

    Args:
        sentences: Iterable of strings; a newline is appended to each one.
        file_path: Destination path of the text file.

    Returns:
        None. Success or failure is reported with a printed message; errors
        raised while writing are caught rather than propagated.
    """
    try:
        with open(file_path, 'w', encoding='utf-8') as file:
            file.writelines(sentence + '\n' for sentence in sentences)
    except Exception as e:
        print(f"An error occurred while writing to the file: {e}")
    else:
        print(f"Sentences successfully written to {file_path}")

# Data processing

In [4]:
# Location of the raw Kaggle CSV.
# NOTE(review): this is an absolute, machine-specific path; it has to be
# adjusted before the notebook can run on another machine.
file_path = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/news_summarization.csv'

# Load the whole CSV into a DataFrame.
df = pd.read_csv(filepath_or_buffer=file_path)

In [7]:
# Inspect the loaded DataFrame; as the last expression of the cell it is
# rendered as the cell output (pandas shows a truncated head/tail view).
df

Unnamed: 0.1,Unnamed: 0,ID,Content,Summary,Dataset
0,0,f49ee725a0360aa6881ed1f7999cc531885dd06a,New York police are concerned drones could bec...,Police have investigated criminals who have ri...,CNN/Daily Mail
1,1,808fe317a53fbd3130c9b7563341a7eea6d15e94,By . Ryan Lipman . Perhaps Australian porn sta...,Porn star Angela White secretly filmed sex act...,CNN/Daily Mail
2,2,98fd67bd343e58bc4e275bbb5a4ea454ec827c0d,"This was, Sergio Garcia conceded, much like be...",American draws inspiration from fellow country...,CNN/Daily Mail
3,3,e12b5bd7056287049d9ec98e41dbb287bd19a981,An Ebola outbreak that began in Guinea four mo...,World Health Organisation: 635 infections and ...,CNN/Daily Mail
4,4,b83e8bcfcd51419849160e789b6658b21a9aedcd,By . Associated Press and Daily Mail Reporter ...,A sinkhole opened up at 5:15am this morning in...,CNN/Daily Mail
...,...,...,...,...,...
870516,870516,5d28cab74ffb4ea584cbb857d64a72a2157bf19f,The state of Oregon is ready to throw in the t...,Oregon is the only one of 16 states that has f...,CNN/Daily Mail
870517,870517,6f3e12375fc400cf9dc3ad77b8191226e740e293,"MADRID, Spain (CNN) -- A 92-year-old woman wit...","Two women, both from Uruguay, arrested after a...",CNN/Daily Mail
870518,870518,,A day after a 40-year-old man miraculously sur...,– Rescuers in Niagara Falls still haven't foun...,Multi-News
870519,870519,9af32ebbdd03e1d543d5493e93b4ac8c8e489851,"By . Deni Kirkova . PUBLISHED: . 09:27 EST, 23...","Women browse, evaluate and shop through an onl...",CNN/Daily Mail


## Extract columns of interest: content and summaries 

In [5]:
# Keep only the article text, as a one-column DataFrame.
content = df.loc[:, ['Content']]
content

Unnamed: 0,Content
0,New York police are concerned drones could bec...
1,By . Ryan Lipman . Perhaps Australian porn sta...
2,"This was, Sergio Garcia conceded, much like be..."
3,An Ebola outbreak that began in Guinea four mo...
4,By . Associated Press and Daily Mail Reporter ...
...,...
870516,The state of Oregon is ready to throw in the t...
870517,"MADRID, Spain (CNN) -- A 92-year-old woman wit..."
870518,A day after a 40-year-old man miraculously sur...
870519,"By . Deni Kirkova . PUBLISHED: . 09:27 EST, 23..."


In [None]:
# Keep only the article summaries, as a one-column DataFrame.
summary = df.loc[:, ['Summary']]
summary

## Transform the DataFrames into lists 

In [None]:
# Drop missing summaries and force the rest to str before touching the text:
# a NaN entry is a float and has no .replace(), so it would raise otherwise.
# Newlines are flattened to spaces so that each summary fits on a single line.
sentences_summary = (
    summary['Summary']
    .dropna()
    .astype(str)
    .str.replace('\n', ' ', regex=False)
    .tolist()
)

In [17]:
# Drop missing articles and force every remaining entry to str so the string
# operations applied to this list cannot fail on NaN values.
sentences_content = content['Content'].dropna().astype(str).tolist()

# Preview a few entries only: echoing the full list would embed the text of
# every article in the notebook output.
sentences_content[:3]

["New York police are concerned drones could become tools for terrorists, and are investigating ways to stop potential attacks. Until now police haven't acknowledged drones as a potential weapon, but the NYPD has now said the technology has advanced enough that someone could use them to carry out an air assault using chemical weapons and firearms. Police want to develop technology which will allow them to take control of drones as well as scan the skies for them before major events. The NYPD says drones carrying explosives are the number one threat as they investigate ways to stop attacks . Deputy Chief Salvatore DiPace, left, was concerned about an incident last year where a drone was landed in front of German Chancellor Angela Merkel and 'could have took the chancellor and her people out' A drone which was flown over a packed football stadium in Manchester, England, just over a week ago, resulting in the suspected pilot being arrested . They are consulting with the military and membe

In [18]:
# Flatten every article onto a single line by turning newlines into spaces.
sentences_content_no_n = [
    article.replace('\n', ' ')
    for article in sentences_content
]


## Iterate over the sentences and clean them

In [None]:
# Run every summary through clean_line, keeping the original order.
cleaned_sentences_summary = [clean_line(raw_summary) for raw_summary in sentences_summary]

In [23]:
# Run every newline-free article through clean_line, keeping the original order.
cleaned_sentences_content = [clean_line(raw_article) for raw_article in sentences_content_no_n]

## Store the cleaned sentences

In [None]:
# Destination of the cleaned summaries, one per line.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
output_file_summarization = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/news_summarization.txt'
write_sentences_to_file(
    sentences=cleaned_sentences_summary,
    file_path=output_file_summarization,
)

In [24]:
# Destination of the cleaned article bodies, one per line.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
output_file_content = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/news_content.txt'
write_sentences_to_file(
    sentences=cleaned_sentences_content,
    file_path=output_file_content,
)

Sentences successfully written to /Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/news_content.txt
