In [1]:
# Imports

import re
from pathlib import Path

import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Data

In [2]:
# Relative path to the raw Reddit comments CSV.
REDDIT_DATA_PATH = "../data/raw/Reddit_Data.csv"

# Read the raw CSV into a DataFrame and preview its first rows.
df = pd.read_csv(filepath_or_buffer=REDDIT_DATA_PATH)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


# Preprocessing

In [3]:
# Drop rows with missing values and exact duplicate rows, then drop comments
# that are empty or whitespace-only. Each step produces a new frame instead of
# mutating in place, and the final .copy() makes `df` an independent frame so
# the column assignment in a later cell never writes into a selection of
# another frame.
df = df.dropna().drop_duplicates()
df = df[df["clean_comment"].str.strip() != ""].copy()

In [4]:
# Stopwords that are KEPT in the text (i.e. excluded from the removal list)
# because they are considered important for sentiment analysis.
stop_words_to_include = {"not", "but", "however", "no", "yet"}

# Lazily-filled cache for the resources shared by every preprocess_comment
# call. Re-running this cell resets it, so edits to stop_words_to_include are
# picked up on the next call.
_preprocess_resources = {}


def _get_preprocess_resources():
    """Return ``(stop_words, lemmatizer)``, building them on first use only."""
    if not _preprocess_resources:
        # Build both objects before storing either, so a failure while
        # loading one of them never leaves a half-filled cache behind.
        stop_words = frozenset(stopwords.words("english")) - stop_words_to_include
        lemmatizer = WordNetLemmatizer()
        _preprocess_resources["stop_words"] = stop_words
        _preprocess_resources["lemmatizer"] = lemmatizer
    return (
        _preprocess_resources["stop_words"],
        _preprocess_resources["lemmatizer"],
    )


def preprocess_comment(comment):
    """
    Normalize a single comment for sentiment analysis.

    The following steps are applied, in order:
        1) Converts the comment to lowercase,
        2) Strips the trailing and leading whitespaces,
        3) Replaces newline characters with spaces,
        4) Removes every character that is not a letter, a digit, whitespace
           or one of the punctuation marks ``! ? . ,``,
        5) Removes English stopwords, except the ones listed in
           ``stop_words_to_include``,
        6) Lemmatizes each remaining whitespace-separated token.

    Args:
        comment: The raw comment text (a ``str``).

    Returns:
        The preprocessed comment, with tokens joined by single spaces. The
        result is an empty string when no token survives.

    Raises:
        AttributeError: If ``comment`` is not a string (for example a NaN
            float), because ``.lower()`` is called on it directly.

    Note:
        Tokens are produced by splitting on whitespace only, so a word with
        punctuation attached (e.g. ``"the,"``) is not matched against the
        stopword list.
    """
    # The stopword set and the lemmatizer are invariant across calls; reuse
    # them instead of reloading the corpus and rebuilding the lemmatizer for
    # every comment.
    stop_words, lemmatizer = _get_preprocess_resources()

    comment = comment.lower().strip()
    comment = re.sub(r"\n", " ", comment)
    comment = re.sub(r"[^A-Za-z0-9\s!?.,]", "", comment)

    # Filter and lemmatize in a single pass over the tokens.
    return " ".join(
        lemmatizer.lemmatize(word)
        for word in comment.split()
        if word not in stop_words
    )

In [5]:
# Run every comment through preprocess_comment, overwriting the column with
# the normalized text, then preview the first rows.
df["clean_comment"] = df["clean_comment"].map(preprocess_comment)
df.head()

Unnamed: 0,clean_comment,category
0,family mormon never tried explain still stare ...,1
1,buddhism much lot compatible christianity espe...,1
2,seriously say thing first get complex explain ...,-1
3,learned want teach different focus goal not wr...,0
4,benefit may want read living buddha living chr...,1


In [6]:
PREPROCESSED_DATA_PATH = "../data/processed/reddit_preprocessed.csv"

# Make sure the target directory exists so the export does not fail with an
# OSError when it has not been created yet (e.g. on a fresh checkout).
Path(PREPROCESSED_DATA_PATH).parent.mkdir(parents=True, exist_ok=True)

# index=False: do not write the DataFrame index as an extra CSV column.
# NOTE(review): comments that became empty strings during preprocessing are
# written as empty fields, which pd.read_csv reads back as NaN by default;
# confirm that downstream consumers handle this.
df.to_csv(PREPROCESSED_DATA_PATH, index=False)