In [7]:
import os 
import string
from nltk.tokenize import TweetTokenizer 
import re

This notebook handles the pre-processing of the dataset used in the paper: Mining, analyzng, and modeling text written on mobile devices, by K. Vertanen and P.O. Kristensoon. Paper: 
https://www.cambridge.org/core/journals/natural-language-engineering/article/mining-analyzing-and-modeling-text-written-on-mobile-devices/A60B599D7E92B5DB9CBDE243A80626C3 


It generates 1 .txt files with cleaned content named: `mobile_text.txt`. 

# Methods

In [8]:
def clean_line(line):
    tokenizer = TweetTokenizer()
    line = line.replace("’", "'")
    allowed_punctuation_char = "'"
    allowed_punctuation_word = "'" + "#" + "-"
    
    # Modify regex to preserve periods in abbreviations like 'U.S.', 'P.M.', etc.
    # 1. Substitute periods in known abbreviations with a placeholder
    line = re.sub(r'(?<=\b[A-Z])\.(?=[A-Z]\b)', '', line)  # Remove periods in multi-letter abbreviations
    
    line = re.sub(rf'[^\w\s{allowed_punctuation_word}{allowed_punctuation_char}(\w+\'\w+)]|_|\d', ' ',  line)
    
    words = tokenizer.tokenize(line)
    result = ''
    for i, word in enumerate(words):
        if word in allowed_punctuation_char:
            result = result.rstrip()
            result += word.lower() + ' '
        elif word not in string.punctuation:
            translated_word = word.lower().replace('.', '')
            if len(translated_word) == 0:
                continue
            result += translated_word + ' '
    return result.rstrip()

In [None]:
def extract_sentences_and_clean_them(folder_path):
    sentences = []
    
    # Iterate over all files in the given folder path
    for filename in os.listdir(folder_path):
        # Check if the file has a .txt extension
        if filename.endswith('.txt'):
            file_path = os.path.join(folder_path, filename)
            with open(file_path, 'r', encoding='utf-8') as file:
                for line in file:
                    # Strip the <s> and </s> tags and any leading/trailing whitespace
                    stripped_sentence = line.strip()
                    cleaned_sentence = clean_line(stripped_sentence)
                    sentences.append(cleaned_sentence)
    
    return sentences

In [9]:
def write_sentences_to_file(sentences, file_path):
    try:
        with open(file_path, 'w') as file:
            for sentence in sentences:
                file.write(sentence + '\n')
        print(f"Sentences successfully written to {file_path}")
    except Exception as e:
        print(f"An error occurred while writing to the file: {e}")

# Data preprocessing

# sentences folder 
#### This contains all the sentences extracted from data_1 and data_2 folders 

In [11]:
sentences_path = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/mobiletext/sentences'

In [12]:
all_sentences = extract_sentences_and_clean_them(sentences_path)

In [None]:
len(all_sentences)

15302165


In [None]:
all_sentences

['Is this reasonable variation',
 'And whats the easiest editing software',
 'Its useful life has been served so under the knife it went',
 'This makes no sence',
 'I got HF unit',
 'We went to pick up the outback last night and the salesmen was going over the warrantee information with my parents',
 'There is an option in Sync to turn autoplay off but it still plays on connect',
 'We shall see I suppose',
 'there was no noticeable wear on anything but for some reason it just wanted to short stroke after a few magazines when it warmed up',
 'ie better protected from dirt water and debris',
 'The Satellite radio is not so good but we all are aware of that issue',
 'Pardon me but I was going to move it to the correct thread',
 'Havent even tallied my putt count yet',
 'on a positive side though from the motor forward i have to say AWESOME You could also make a depron box with a piece of elastic to go around your head',
 'and will check ground tomorrow',
 'Thats where Id be worried about 

## Store the content 

In [13]:
output_file = '/Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/mobile_text.txt'
write_sentences_to_file(all_sentences, output_file)

Sentences successfully written to /Users/rosameliacarioni/University/MSc/1_year/4_period/language engineering/word_predictor/data/clean_data/mobile_text.txt
