## Data Cleaning and Preprocessing

To clean and preprocess the data, I wrote a single function with the following capabilities:

- Stopword removal
- Lemmatization
- Lowercasing
- Punctuation cleaning
- Number cleaning

In [None]:
# Data preprocessing libraries
import pandas as pd
import numpy as np
import string
import re
import nltk
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Fetch the NLTK resources requested by this notebook: the stopword corpus,
# the punkt tokenizer models, and WordNet.
for resource in ('stopwords', 'punkt', 'wordnet'):
  nltk.download(resource)

# Individual cleaning functions

def remove_stopwords(text):
  """Remove English stopwords from ``text``.

  The text is tokenized with NLTK's ``word_tokenize`` and every token found
  in NLTK's English stopword list is dropped. Matching is case-insensitive,
  so capitalized stopwords such as "The" are removed as well; the casing of
  the tokens that are kept is left untouched.

  Args:
    text: The string to filter.

  Returns:
    The remaining tokens joined by single spaces.
  """
  stop_words = set(stopwords.words('english'))
  # The NLTK stopword list is lowercase, so compare on the lowercased token;
  # otherwise sentence-initial words like "The" would slip through.
  kept = [w for w in word_tokenize(text) if w.lower() not in stop_words]
  return " ".join(kept)

def lemmatize_text(text):
  """Lemmatize every token of ``text`` with NLTK's ``WordNetLemmatizer``.

  The text is split into sentences, each sentence into word tokens, and each
  token is passed through the lemmatizer.

  Args:
    text: The string to lemmatize.

  Returns:
    The lemmatized tokens of all sentences joined by single spaces.
  """
  wnl = WordNetLemmatizer()
  lemmas = [
      wnl.lemmatize(token)
      for sent in sent_tokenize(text)
      for token in word_tokenize(sent)
  ]
  return ' '.join(lemmas)

def lowercase_text(text):
  """Return ``text`` converted to lowercase."""
  lowered = text.lower()
  return lowered

def remove_punctuations(text):
  """Strip punctuation characters from ``text``.

  Removes every character in ``string.punctuation`` plus the typographic
  apostrophe and the ellipsis character, which that constant does not cover.

  Args:
    text: The string to clean.

  Returns:
    ``text`` with all of those characters deleted.
  """
  extra_marks = ['’', '…']  # punctuations not in string.punctuation
  unwanted = string.punctuation + ''.join(extra_marks)
  # A translation table that maps every unwanted character to "deleted"
  deletion_table = str.maketrans('', '', unwanted)
  return text.translate(deletion_table)

def remove_numbers(text):
  """Remove numbers from ``text``.

  First strips a leading enumeration prefix such as ``"12. "`` (digits, a
  dot, and the whitespace after it), then deletes every remaining ASCII
  digit.

  Args:
    text: The string to clean, or ``None``.

  Returns:
    The cleaned string, or ``None`` when ``text`` is ``None``.
  """
  if text is None:
    return text

  # The pattern is a regular expression, so it has to go through re.sub;
  # str.replace would look for the pattern text literally and never match.
  text = re.sub(r'^\d+\.\s+', '', text)
  return re.sub("[0-9]", '', text)

# Unified boolean controlled cleaning function
def clean_and_preprocess_data(text, lowercase=True, clean_stopwords=True, clean_punctuations=True, clean_links=True,
                              clean_emojis=True, clean_spaces=True, clean_numbers=True,  lemmatize=True):
  """Run the enabled cleaning steps on ``text`` and return the result.

  Steps are applied in this order: lowercasing, stopword removal,
  punctuation removal, number removal, lemmatization.

  Args:
    text: The string to clean.
    lowercase: Convert the text to lowercase.
    clean_stopwords: Remove English stopwords.
    clean_punctuations: Remove punctuation characters.
    clean_links: Accepted for interface compatibility; no link cleaning is
      implemented, so this flag currently has no effect.
    clean_emojis: Accepted for interface compatibility; no emoji cleaning is
      implemented, so this flag currently has no effect.
    clean_spaces: Accepted for interface compatibility; no whitespace
      cleaning is implemented, so this flag currently has no effect.
    clean_numbers: Remove numbers.
    lemmatize: Lemmatize the remaining tokens.

  Returns:
    The cleaned text.
  """
  # Lowercase first: the later steps compare tokens against lowercase word
  # lists, so capitalized tokens would otherwise pass through them untouched.
  if lowercase:
    text = lowercase_text(text)

  if clean_stopwords:
    text = remove_stopwords(text)

  if clean_punctuations:
    text = remove_punctuations(text)

  if clean_numbers:
    text = remove_numbers(text)

  if lemmatize:
    text = lemmatize_text(text)

  return text


## Preprocessing and cleaning the raw data

In [None]:
data = pd.read_csv("subtitles.csv")

# NOTE(review): every cleaning step that clean_and_preprocess_data implements
# is switched off in this call, and the flags left at their defaults
# (clean_links, clean_emojis, clean_spaces) are not acted on by the function,
# so the subtitles come back unchanged. Confirm which steps are intended.
data['subtitles'] = data['subtitles'].apply(lambda x: clean_and_preprocess_data(x, lemmatize=False, clean_numbers=False, clean_stopwords=False, clean_punctuations=False, lowercase=False))
# Preview the processed frame (the original referenced an undefined `text_df`)
print(data.head())

# Saving the preprocessed data into a .csv file
data.to_csv("subtitles_cleaned.csv")