In [14]:
import sqlalchemy
import configparser
import pandas as pd
import numpy as np
import re
import nltk

In [2]:
config = configparser.ConfigParser()
config_file = '../config.ini'
config.read(config_file)
default = config['DEFAULT-SQLALCHEMY']
engine = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}@{2}/{3}'.
                                            format(default['DB_USER'], default['DB_PASSWORD'], 
                                                    default['DB_IP'], default['DB_DATABASE']))

In [3]:
query = """
select *
from
reddit_data
"""

df = pd.read_sql(query, engine)

df.head()

Unnamed: 0,title,selftext,category,subreddit,datetime
0,The reason I love alcohol at 18. It makes me f...,I,hot,mentalhealth\n,2023-10-08 07:21:22
1,Work-Related Anxiety/Depression,Lately I've been having some pretty serious an...,hot,mentalhealth\n,2023-10-08 05:30:05
2,Does anyone else feel like they don’t belong a...,It does not matter who is around me or who I a...,hot,mentalhealth\n,2023-10-08 03:31:24
3,Working in a supermarket is hell on earth.,I've worked in a supermarket for 4 years now. ...,hot,mentalhealth\n,2023-10-08 07:06:59
4,I feel like I’m not valid,TW: Mentions of abuse\n\nI don’t even remember...,hot,mentalhealth\n,2023-10-08 08:25:54


In [4]:
df = df[['title', 'selftext']]
df.head()

Unnamed: 0,title,selftext
0,The reason I love alcohol at 18. It makes me f...,I
1,Work-Related Anxiety/Depression,Lately I've been having some pretty serious an...
2,Does anyone else feel like they don’t belong a...,It does not matter who is around me or who I a...
3,Working in a supermarket is hell on earth.,I've worked in a supermarket for 4 years now. ...
4,I feel like I’m not valid,TW: Mentions of abuse\n\nI don’t even remember...


In [5]:
# remove numbers in dataframe

df = df.replace('\d+', '', regex=True)
df.head()

Unnamed: 0,title,selftext
0,The reason I love alcohol at . It makes me fee...,I
1,Work-Related Anxiety/Depression,Lately I've been having some pretty serious an...
2,Does anyone else feel like they don’t belong a...,It does not matter who is around me or who I a...
3,Working in a supermarket is hell on earth.,I've worked in a supermarket for years now. P...
4,I feel like I’m not valid,TW: Mentions of abuse\n\nI don’t even remember...


In [6]:
# remove punctuations in dataframe
# edit some punctuations to space instead

df = df.replace('[.,]+', '', regex=True)
df = df.replace('[/]+', ' ', regex=True)
df.head()

Unnamed: 0,title,selftext
0,The reason I love alcohol at It makes me feel...,I
1,Work-Related Anxiety Depression,Lately I've been having some pretty serious an...
2,Does anyone else feel like they don’t belong a...,It does not matter who is around me or who I a...
3,Working in a supermarket is hell on earth,I've worked in a supermarket for years now Pr...
4,I feel like I’m not valid,TW: Mentions of abuse\n\nI don’t even remember...


In [7]:
# change all to lowercase in dataframe

df = df.apply(lambda x: x.astype(str).str.lower())
df.head()

Unnamed: 0,title,selftext
0,the reason i love alcohol at it makes me feel...,i
1,work-related anxiety depression,lately i've been having some pretty serious an...
2,does anyone else feel like they don’t belong a...,it does not matter who is around me or who i a...
3,working in a supermarket is hell on earth,i've worked in a supermarket for years now pr...
4,i feel like i’m not valid,tw: mentions of abuse\n\ni don’t even remember...


In [8]:
# drop any values in the dataframe

print("Original Data: {}".format(df.shape))

df = df.dropna()

print("New Data: {}".format(df.shape))
df.head()

Original Data: (14805, 2)
New Data: (14805, 2)


Unnamed: 0,title,selftext
0,the reason i love alcohol at it makes me feel...,i
1,work-related anxiety depression,lately i've been having some pretty serious an...
2,does anyone else feel like they don’t belong a...,it does not matter who is around me or who i a...
3,working in a supermarket is hell on earth,i've worked in a supermarket for years now pr...
4,i feel like i’m not valid,tw: mentions of abuse\n\ni don’t even remember...


In [9]:
# combine selftext to title

df = df.stack().reset_index(level=1, drop=True).to_frame('title').reset_index(drop=True)
df = df.rename(columns={'title':'text'})
df

Unnamed: 0,text
0,the reason i love alcohol at it makes me feel...
1,i
2,work-related anxiety depression
3,lately i've been having some pretty serious an...
4,does anyone else feel like they don’t belong a...
...,...
29605,i (f) am in college i have a group of friends ...
29606,what should i do?
29607,please can somebody help me? i really don't kn...
29608,team co-captain


In [10]:
# return df with titile and selftext not single word

mask = (df['text'].str.len() == 1)
df = df[~mask].reset_index(drop=True)

print("New Data: {}".format(df.shape))
df.head()

New Data: (29587, 1)


Unnamed: 0,text
0,the reason i love alcohol at it makes me feel...
1,work-related anxiety depression
2,lately i've been having some pretty serious an...
3,does anyone else feel like they don’t belong a...
4,it does not matter who is around me or who i a...


In [11]:
# convert short forms to full terms e.g. can't --> cannot

def contraction(s):
    # s = re.sub(r"doesn't", "does not", s)
    # s = re.sub(r"don't", "do not", s)
    # s = re.sub(r"didn't", "did not", s)
    s = re.sub(r"won't", "will not", s)
    # s = re.sub(r"wouldn't", "would not", s)
    # s = re.sub(r"haven't", "have not", s)
    # s = re.sub(r"hasn't", "has not", s)
    # s = re.sub(r"hadn't", "had not", s)
    s = re.sub(r"shan't", "shall not", s)
    # s = re.sub(r"can't", "cannot", s)
    s = re.sub(r"\'re", " are", s)
    s = re.sub(r"\'s", " is", s)
    s = re.sub(r"\'ll", " will", s)
    s = re.sub(r"n't", " not", s)
    s = re.sub(r"\'ve", " have", s)
    s = re.sub(r"\'m", " am", s)
    
    return s

df['preprocessed-text'] = df['text'].apply(lambda x: contraction(x))

df

Unnamed: 0,text,preprocessed-text
0,the reason i love alcohol at it makes me feel...,the reason i love alcohol at it makes me feel...
1,work-related anxiety depression,work-related anxiety depression
2,lately i've been having some pretty serious an...,lately i have been having some pretty serious ...
3,does anyone else feel like they don’t belong a...,does anyone else feel like they don’t belong a...
4,it does not matter who is around me or who i a...,it does not matter who is around me or who i a...
...,...,...
29582,i (f) am in college i have a group of friends ...,i (f) am in college i have a group of friends ...
29583,what should i do?,what should i do?
29584,please can somebody help me? i really don't kn...,please can somebody help me? i really do not k...
29585,team co-captain,team co-captain


In [12]:
# remove non alpha characters

df['preprocessed-text'] = df['preprocessed-text'].replace('[^A-Za-z]+', ' ', regex=True)
df.head()

Unnamed: 0,text,preprocessed-text
0,the reason i love alcohol at it makes me feel...,the reason i love alcohol at it makes me feel ...
1,work-related anxiety depression,work related anxiety depression
2,lately i've been having some pretty serious an...,lately i have been having some pretty serious ...
3,does anyone else feel like they don’t belong a...,does anyone else feel like they don t belong a...
4,it does not matter who is around me or who i a...,it does not matter who is around me or who i a...


In [13]:
# remove extra spaces

df['preprocessed-text'] = df['preprocessed-text'].replace(' +', ' ', regex=True)
df.head()

Unnamed: 0,text,preprocessed-text
0,the reason i love alcohol at it makes me feel...,the reason i love alcohol at it makes me feel ...
1,work-related anxiety depression,work related anxiety depression
2,lately i've been having some pretty serious an...,lately i have been having some pretty serious ...
3,does anyone else feel like they don’t belong a...,does anyone else feel like they don t belong a...
4,it does not matter who is around me or who i a...,it does not matter who is around me or who i a...


In [16]:
# remove stopwords

from nltk.corpus import stopwords

stop_words = stopwords.words('english')
df['preprocessed-text'] = df['preprocessed-text'].apply(lambda x: ' '.join([x for x in x.split() if x not in stop_words]))

df.head()

Unnamed: 0,text,preprocessed-text
0,the reason i love alcohol at it makes me feel...,reason love alcohol makes feel numb numb real ...
1,work-related anxiety depression,work related anxiety depression
2,lately i've been having some pretty serious an...,lately pretty serious anxiety depression work ...
3,does anyone else feel like they don’t belong a...,anyone else feel like belong anywhere
4,it does not matter who is around me or who i a...,matter around total strangers people love alwa...


In [18]:
# lemmatization

from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

word_lemmatizer = WordNetLemmatizer()
df['preprocessed-text'] = df['preprocessed-text'].apply(lambda x: ' '.join(word_lemmatizer.lemmatize(word) for word in word_tokenize(x)))

df.head()

Unnamed: 0,text,preprocessed-text
0,the reason i love alcohol at it makes me feel...,reason love alcohol make feel numb numb real r...
1,work-related anxiety depression,work related anxiety depression
2,lately i've been having some pretty serious an...,lately pretty serious anxiety depression work ...
3,does anyone else feel like they don’t belong a...,anyone else feel like belong anywhere
4,it does not matter who is around me or who i a...,matter around total stranger people love alway...
