In [1]:
import string
import os
from nltk.tokenize import TweetTokenizer
import os.path
import re

This notebook pre-processes the Twitter, blog, and news text datasets available [here on RPubs](https://rpubs.com/NAyako/1036093).

It generates a single `.txt` file containing the cleaned content, named `twitter.txt`.

# Methods

In [5]:
def clean_line(line):
    """
    Normalise one line of raw text into lower-case, space-separated tokens.

    Steps:
      1. Typographic apostrophes (’) are replaced by ASCII apostrophes (').
      2. A period between two single capital letters is removed so that
         abbreviations collapse ('U.S.' -> 'US.'); the trailing period is
         removed by step 3.
      3. Digits, underscores and every character that is not a word
         character, whitespace, apostrophe, '#' or '-' become a space.
      4. The text is tokenised with TweetTokenizer and lower-cased. Tokens
         that are bare ASCII punctuation (e.g. a stray '#' or '-') are
         dropped, and a lone apostrophe token is glued onto the word before it.

    Parameters
    ----------
    line : str
        Raw text; a trailing newline is fine.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces ('' if nothing survives).
    """
    tokenizer = TweetTokenizer()
    apostrophe = "'"
    # Punctuation that is allowed to survive inside the cleaned text.
    kept_punctuation = apostrophe + "#" + "-"

    # Normalise typographic apostrophes so contractions look the same everywhere.
    line = line.replace("’", apostrophe)

    # Collapse abbreviations by deleting a period that sits between two single
    # capital letters ('U.S.today' -> 'US.today').
    line = re.sub(r'(?<=\b[A-Z])\.(?=[A-Z]\b)', '', line)

    # Blank out everything except word characters, whitespace and the kept
    # punctuation; digits and '_' count as \w, so they are removed explicitly.
    # re.escape keeps '-' literal: written unescaped between '#' and "'" it is
    # read as the character range #-' and the hyphen itself gets stripped.
    line = re.sub(rf"[^\w\s{re.escape(kept_punctuation)}]|_|\d", ' ', line)

    pieces = []
    for token in tokenizer.tokenize(line):
        if token == apostrophe:
            # A free-standing apostrophe belongs to the previous word
            # ("tryin" + "'" -> "tryin'").
            if pieces:
                pieces[-1] += token
            else:
                pieces.append(token)
        elif token not in string.punctuation:
            word = token.lower().replace('.', '')
            if word:
                pieces.append(word)
    return ' '.join(pieces)


def process_files(file_or_dir, out_path="../data/twitter.txt"):
    """
    Clean a text file, or every file under a directory, line by line.

    Each input line is passed through `clean_line` and the result is
    appended to `out_path` as one output line.

    Parameters
    ----------
    file_or_dir : str
        Path to a single file, or to a directory that is walked recursively.
    out_path : str, optional
        File the cleaned lines are appended to. It is opened in append mode,
        so calling this function again on the same input adds the cleaned
        lines a second time; delete the file first for a fresh run.

    Notes
    -----
    Input is decoded as UTF-8 and undecodable bytes are silently dropped
    (errors='ignore'). Output is written as UTF-8.
    """
    if os.path.isdir(file_or_dir):
        # os.walk already descends into sub-directories, so each recursive
        # call below receives a plain file path.
        for root, _dirs, files in os.walk(file_or_dir):
            for file in files:
                process_files(os.path.join(root, file), out_path)
        return

    print('Processing:', file_or_dir)
    # Context managers close both handles even if cleaning a line raises.
    # Iterating the stream avoids loading the whole file into memory.
    with open(file_or_dir, mode='r', encoding='utf-8', errors='ignore') as stream, \
            open(out_path, mode='a', encoding='utf-8') as file_out:
        for line in stream:
            file_out.write(clean_line(line) + "\n")
    print('Done!')

**Testing the function with small test cases:**

In [6]:
# Smoke test: one string that exercises a trailing apostrophe, sentence
# punctuation, upper-case text, a dotted abbreviation ('U.S.') and an emoji.
s = 'tryin\' to dislodge the jerky. CHAT:test U.S.today 😆'
clean_line(s)

"tryin' to dislodge the jerky chat test us today"

## Process and clean the data into a txt file

In [8]:
# Clean every file found under ../data/twitter.
# NOTE: process_files opens ../data/twitter.txt in append mode, so re-running
# this cell adds the cleaned lines again; delete that file first for a fresh run.
file = "../data/twitter"
process_files(file)

Processing: ../data/twitter/en_US.blogs.txt
Done!
Processing: ../data/twitter/en_US.twitter.txt
Done!
Processing: ../data/twitter/en_US.news.txt
Done!
