# Data Cleaning

- [Step 1: Removing duplicates](#step-1-removing-duplicates)
- [Step 2: One-Hot-encoding labels](#step-2-one-hot-encoding-labels)
- [Step 3: Removing HTML, brackets & special characters](#step-3-removing-html-brackets--special-characters)
- [Step 4: Handling contractions](#step-4-handling-contractions)
- [Step 5: Applying spell check](#step-5-applying-spell-check)
- [Step 6: Converting to lowercase](#step-6-converting-to-lowercase)
- [Step 7: Train-Test split the data](#step-7-train-test-split-the-data)
- [Step 8: Saving preprocessed data](#step-8-saving-preprocessed-data)

In [None]:
# Imports

import re
import contractions

import pandas as pd

from symspellpy import SymSpell, Verbosity
from sklearn.model_selection import train_test_split

In [None]:
# Load the raw training reviews from CSV
data = pd.read_csv('../data/imdb_train.csv')

# Report the (rows, columns) shape of the loaded frame
print(data.shape)

# Preview the first three rows
data.head(n=3)

##### Step 1: Removing duplicates

In [None]:
# Select every row that exactly repeats an earlier row
duplicates = data.loc[data.duplicated()]

print('Duplicates found:', duplicates.shape[0])

In [None]:
# Drop repeated rows, keeping the first occurrence of each
data = data.drop_duplicates(keep='first')

print('Remaining reviews:', len(data))

##### Step 2: One-Hot-encoding labels

In [None]:
# Encode the string labels as integers: negative -> 0, positive -> 1.
# NOTE(review): with a dict mapper, pandas maps any other label value to NaN,
# so re-running this cell on already-encoded data would blank the column.
data['sentiment'] = data['sentiment'].map(
    {
        'negative': 0,
        'positive': 1,
    }
)

# Preview ten random rows to eyeball the encoded labels
data.sample(n=10)

##### Step 3: Removing HTML, brackets & special characters

In [None]:
# Removing HTML tags and links, then normalising punctuation, repeated characters and whitespace

def remove_html_links(review_text):
    """Strip markup and normalise noisy punctuation in a single review.

    Despite its name, the function does more than remove HTML and links: the
    substitutions below are applied in order, each on the previous result.

    Args:
        review_text: Raw review text as a ``str``.

    Returns:
        The cleaned review text.
    """
    # Replace every HTML tag (e.g. <br />) with a space
    text = re.sub(r'<.*?>', ' ', review_text)
    # Replace http(s) links with a space
    text = re.sub(r'http\S+', ' ', text)
    # Remove dots between single capital letters ('U.S.A.' -> 'USA.')
    text = re.sub(r'(?<=\b[A-Z])\.(?=[A-Z]\b)', '', text)
    # Remove parentheses that contain only digits, e.g. '(1999)'
    text = re.sub(r'\(\d+\)', '', text)
    # Remove parentheses whose content is only capitalized words, e.g. '(John Smith)'
    text = re.sub(r'\(([A-Z][a-z]*(?: [A-Z][a-z]*)*)\)', '', text)
    # Remove dots that sit between a letter and '!' or '?'
    text = re.sub(r'(?<=[a-zA-Z])\.+(?=[!?])', '', text)
    # Reduce each run of '!', '?' and '-' to its distinct characters, in order
    # of first appearance ('?!?!' -> '?!'); dict.fromkeys keeps that order
    text = re.sub(r'[!?-]+', lambda run: ''.join(dict.fromkeys(run.group(0))), text)
    # Collapse runs of three or more identical characters to exactly two
    # ('sooo' -> 'soo'). Digits are excluded so that numbers such as '2000'
    # or '10000' are not silently truncated.
    text = re.sub(r'([^\d\n])\1{2,}', r'\1\1', text)
    # Replace an '@' that follows a letter and is not followed by whitespace
    # with the letter 'a' ('cr@p' -> 'crap')
    text = re.sub(r'(?<=[a-zA-Z])@(?!\s)', 'a', text)
    # Replace '\', '/' and '>' with a space
    text = re.sub(r'[\\/>]', ' ', text)
    # Collapse any run of whitespace into a single space
    text = re.sub(r'\s+', ' ', text)
    # Replace ' & ' with ' and '
    cleaned_text = re.sub(r' \& ', ' and ', text)

    return cleaned_text

# Run the regex clean-up over every review and overwrite the 'review' column
data['review'] = data['review'].apply(remove_html_links)

In [None]:
# Replacing brackets (disabled: the function below is commented out and is never applied)

# def replace_brackets(text):
#     text = re.sub(r'[\[{]', '(', text)
#     text = re.sub(r'[\]}]', ')', text)
#     return text

##### Step 4: Handling contractions

In [None]:
def expand_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``."""
    expanded_text = contractions.fix(text)
    return expanded_text

# Expand contractions in every review and overwrite the 'review' column
data['review'] = data['review'].apply(expand_contractions)

##### Step 5: Applying spell check

In [None]:
# Spell checker allowing corrections up to edit distance 2, with prefix length 7
sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)

# Load the word-frequency dictionary: column 0 holds the term, column 1 its count.
# load_dictionary reports a missing file through its return value rather than
# by raising, so check it; otherwise spell checking would silently do nothing.
dictionary_path = '../data/frequency_dictionary_en_82_765.txt'
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f'SymSpell dictionary could not be loaded: {dictionary_path}')

# Punctuation marks that spell_check keeps in its output
preserve = {'.', '?', '!', ',', '-', ':', ';', '(', ')'}

# Tokens that spell_check never sends to the spell checker
skip_chars = {'I'}

def spell_check(text):
    """Spell-correct the words of ``text`` with SymSpell and rebuild the string.

    The text is split into word tokens and single non-space characters.
    Purely alphabetic tokens (except those in ``skip_chars``) are replaced by
    SymSpell's best suggestion; tokens containing digits are left untouched so
    that numbers are never rewritten into words. When the text is rebuilt,
    each word is preceded by a single space, punctuation listed in
    ``preserve`` is attached directly to the preceding token, and every other
    symbol is dropped.

    Args:
        text: Review text as a ``str``.

    Returns:
        The corrected text without leading or trailing whitespace.
    """
    pieces = []
    # Split into runs of word characters and single non-space symbols
    for token in re.findall(r'\w+|\S', text):
        if token in preserve:
            # Kept punctuation is glued to whatever precedes it
            pieces.append(token)
            continue
        if token.isalpha() and token not in skip_chars:
            # SymSpell matches case-sensitively, so look the word up in
            # lowercase; otherwise a correctly spelled capitalised word counts
            # as a misspelling and may be swapped for a more frequent neighbour.
            # NOTE(review): assumes the loaded frequency dictionary is all
            # lowercase - confirm against the dictionary file.
            suggestions = sym_spell.lookup(
                token.lower(), Verbosity.CLOSEST, max_edit_distance=2
            )
            if suggestions:
                token = suggestions[0].term
        if token.isalnum():
            pieces.append(' ' + token)
        # Anything else (other symbols, tokens containing '_') is dropped
    return ''.join(pieces).strip()

# Spell-check every review and overwrite the 'review' column
data['review'] = data['review'].apply(spell_check)

##### Step 6: Converting to lowercase

In [None]:
# Lowercase every review via the column's string accessor
data['review'] = data['review'].str.lower()

##### Step 7: Train-Test split the data

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
train_data, test_data = train_test_split(
    data,
    test_size=0.2,
    random_state=42,
)

##### Step 8: Saving preprocessed data

In [None]:
# Write both splits to CSV, leaving out the DataFrame index column
train_data.to_csv('../data/train_data.csv', index=False)
test_data.to_csv('../data/test_data.csv', index=False)