# Data Preprocessing

In [1]:
import os
import pandas as pd
import re

# Move from the notebook's folder up to the project root so that the relative
# "data/..." paths used below resolve.
# Guarded on the presence of the "data" directory so that re-running this cell
# (or starting the kernel already at the project root) does not climb one
# directory too far.
if not os.path.isdir("data"):
    os.chdir("..")

### Load dataset

In [2]:
# Read the raw counsel-chat table with the pyarrow Parquet reader and report
# its (rows, columns) shape.
# NOTE(review): the file carries a ".pkl" extension but is read as Parquet --
# confirm the on-disk format matches.
raw_dataset_path = "data/raw/counsel-chat.pkl"
counsel_chat = pd.read_parquet(raw_dataset_path, engine="pyarrow")
print(counsel_chat.shape)

(2775, 10)


In [3]:
# Preview the first five rows of the freshly loaded table.
counsel_chat.head(n=5)

Unnamed: 0,questionID,questionTitle,questionText,questionLink,topic,therapistInfo,therapistURL,answerText,upvotes,views
0,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,Jennifer MolinariHypnotherapist & Licensed Cou...,https://counselchat.com/therapists/jennifer-mo...,It is very common for people to have multiple ...,3,1971
1,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,"Jason Lynch, MS, LMHC, LCAC, ADSIndividual & C...",https://counselchat.com/therapists/jason-lynch...,"I've never heard of someone having ""too many i...",2,386
2,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,Shakeeta TorresFaith Based Mental Health Couns...,https://counselchat.com/therapists/shakeeta-to...,Absolutely not. I strongly recommending worki...,2,3071
3,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,"Noorayne ChevalierMA, RP, CCC, CCAC, LLP (Mich...",https://counselchat.com/therapists/noorayne-ch...,Let me start by saying there are never too man...,2,2643
4,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,https://counselchat.com/questions/do-i-have-to...,depression,"Toni Teixeira, LCSWYour road to healing begins...",https://counselchat.com/therapists/toni-teixei...,I just want to acknowledge you for the courage...,1,256


### Data cleaning

In [4]:
# Drop the link / therapist metadata columns; none of the later cells in this
# notebook read them.
unwanted_columns = ["questionLink", "therapistInfo", "therapistURL"]
counsel_chat = counsel_chat.drop(columns=unwanted_columns)
print(counsel_chat.shape)

(2775, 7)


In [5]:
# Discard every row that has a missing value in at least one column
# (pandas defaults, spelled out explicitly).
counsel_chat = counsel_chat.dropna(axis=0, how="any")
counsel_chat.shape

(2612, 7)

In [6]:
# Keep answers that have at least one upvote OR strictly more than 200 views
# (an answer with exactly 200 views and no upvotes is dropped).
has_upvotes = counsel_chat["upvotes"] > 0
has_many_views = counsel_chat["views"] > 200
counsel_chat = counsel_chat[has_upvotes | has_many_views]
print(counsel_chat.shape)

(1131, 7)


In [7]:
# Preview the first five rows after column/row cleaning and filtering.
counsel_chat.head(n=5)

Unnamed: 0,questionID,questionTitle,questionText,topic,answerText,upvotes,views
0,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,It is very common for people to have multiple ...,3,1971
1,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,"I've never heard of someone having ""too many i...",2,386
2,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,Absolutely not. I strongly recommending worki...,2,3071
3,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,Let me start by saying there are never too man...,2,2643
4,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,I just want to acknowledge you for the courage...,1,256


### Basic preprocessing

In [8]:
# check for issues in answerText:
# print every answer that contains a line break, preceded by its row label and
# followed by a rule of 100 dashes, so the affected rows can be inspected by eye.
for row_label, answer in counsel_chat["answerText"].items():
    if "\n" not in answer:
        continue
    print(row_label, answer, sep="\n", end="\n" + "-"*100 + "\n\n")

781
Basically, your response is the fight, flight or freeze reaction
which is hard wired into the human nervous system, specifically, the autonomic nervous system (called the autonomic response).  This system
is responsible for regulating the heart, digestion, respiratory rate
as well as other aspects which match the symptoms you describe.  It is largely an unconscious function, but
with training it can be managed such as a public speaking course in which you
prepare and even have some exposure to smaller experiences to help you prepare
and cope for larger presentations.  One of the greatest applications of
addressing this response is the NASA program which exposes the astronauts to situations
they will encounter while in space which will trigger the response so that when
it does happen (not if but when) that they are able to move through the
challenge with success.  You can learn
more about his by going to TED Talks “What I learned about going blind in space” It is
a truly inspiring l

In [9]:
# handle special cases (row labels are hard-coded for this specific dataset)
# NOTE(review): labels 1092 and 1526 were presumably picked from the inspection
# output above; the notebook does not record why -- confirm before reusing on
# another version of the data.

# Row 1092: collapse each run of 10 or more non-breaking spaces (\xa0) into the
# literal marker "<next-cell>".
answer_1092 = counsel_chat.loc[1092, "answerText"]
counsel_chat.loc[1092, "answerText"] = re.sub(r"\xa0{10,}", "<next-cell>", answer_1092)

# Row 1526: removed from the dataset entirely.
counsel_chat = counsel_chat.drop(index=[1526])
print(counsel_chat.shape)

(1130, 7)


In [10]:
# remove newline characters: swap every "\n" inside an answer for a single
# space (literal match, no regex needed)
counsel_chat["answerText"] = counsel_chat["answerText"].str.replace("\n", " ", regex=False)

In [11]:
# Sanity check: scan answerText again for line breaks. Any answer that still
# contains one is printed with its row label; no output means none remain.
still_multiline = (
    (row_label, answer)
    for row_label, answer in counsel_chat["answerText"].items()
    if "\n" in answer
)
for row_label, answer in still_multiline:
    print(row_label, answer, sep="\n", end="\n" + "-"*100 + "\n\n")

### Deduplication (1) - Aggregation of answers of duplicated questions

In [12]:
# aggregate answers of the same questions:
# group on the question-identifying columns and collect each group's answers
# into a Python list; upvotes/views are dropped first so only answerText is
# aggregated.
question_keys = ["questionID", "questionTitle", "questionText", "topic"]
answers_without_metrics = counsel_chat.drop(columns=["upvotes", "views"])
answers_per_question = answers_without_metrics.groupby(question_keys).agg(list)

# Move every grouping key except questionID back into regular columns, leaving
# questionID as the index.
counsel_chat_multiple_answers = answers_per_question.reset_index(level=question_keys[1:])
print(counsel_chat_multiple_answers.shape)

(557, 4)


In [13]:
# Preview the first five aggregated questions (answerText holds a list here).
counsel_chat_multiple_answers.head(n=5)

Unnamed: 0_level_0,questionTitle,questionText,topic,answerText
questionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,[It is very common for people to have multiple...
1,My apartment manager won't let me keep an emot...,I have been diagnosed with general anxiety and...,depression,[This can be a difficult situation. Typically...
2,I feel like my mother doesn't support me,My mother is combative with me when I say I do...,depression,[Do you live with your mom and have constant i...
3,Why do I feel like I don't belong anywhere?,There are many people willing to lovingly prov...,depression,[I truly understand what you are saying. I wan...
4,How can I help my girlfriend?,My girlfriend just quit drinking and she becam...,depression,[You're probably not going to like my answer.Y...


In [14]:
# join the list of answers with newline character:
# each question's answers become one string, one answer per line. Line breaks
# inside individual answers were replaced with spaces in an earlier cell, so
# "\n" only ever separates answers here.
joined_answers = counsel_chat_multiple_answers["answerText"].map("\n".join)
counsel_chat_multiple_answers["answerText"] = joined_answers
counsel_chat_multiple_answers.head()

Unnamed: 0_level_0,questionTitle,questionText,topic,answerText
questionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,It is very common for people to have multiple ...
1,My apartment manager won't let me keep an emot...,I have been diagnosed with general anxiety and...,depression,"This can be a difficult situation. Typically,..."
2,I feel like my mother doesn't support me,My mother is combative with me when I say I do...,depression,Do you live with your mom and have constant in...
3,Why do I feel like I don't belong anywhere?,There are many people willing to lovingly prov...,depression,I truly understand what you are saying. I want...
4,How can I help my girlfriend?,My girlfriend just quit drinking and she becam...,depression,You're probably not going to like my answer.Yo...


### Save dataset (1) - multiple answers

In [15]:
# Export of the multiple-answers dataset is currently disabled: the call below
# is commented out, so running this cell writes nothing. Uncomment it to write
# the frame (with its questionID index) to CSV.
# NOTE(review): assumes data/processed/ already exists -- confirm before enabling.
# counsel_chat_multiple_answers.to_csv("data/processed/counsel-chat-multiple-answers.csv")

### Deduplication (2) - keeping the best answer only

In [16]:
# Order rows so that, within each questionID (ascending), the answer with the
# most upvotes comes first, with ties broken by the higher view count.
sort_columns = ["questionID", "upvotes", "views"]
sort_ascending = [True, False, False]
counsel_chat = counsel_chat.sort_values(by=sort_columns, ascending=sort_ascending)
counsel_chat.head(10)

Unnamed: 0,questionID,questionTitle,questionText,topic,answerText,upvotes,views
0,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,It is very common for people to have multiple ...,3,1971
2,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,Absolutely not. I strongly recommending worki...,2,3071
3,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,Let me start by saying there are never too man...,2,2643
1,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,"I've never heard of someone having ""too many i...",2,386
9,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,You have been through so much and it sounds li...,1,1809
7,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,The most direct answer is no. I would venture ...,1,1064
8,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,It is never too late to get help and begin mak...,1,1024
10,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,"Absolutely not! In fact, most people have man...",1,774
11,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,This is a great question! I personally don't b...,1,438
5,0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,It's not really a question of whether you have...,1,435


In [17]:
# drop duplicates:
# keep only the first row per questionID -- after the preceding sort that is the
# top-ranked answer -- then use questionID as the index.
first_answer_per_question = counsel_chat.drop_duplicates(subset="questionID", keep="first")
counsel_chat = first_answer_per_question.set_index("questionID")
print(counsel_chat.shape)

(557, 6)


In [18]:
# Preview the first five questions with their single retained answer.
counsel_chat.head(n=5)

Unnamed: 0_level_0,questionTitle,questionText,topic,answerText,upvotes,views
questionID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0,Do I have too many issues for counseling?,I have so many issues to address. I have a his...,depression,It is very common for people to have multiple ...,3,1971
1,My apartment manager won't let me keep an emot...,I have been diagnosed with general anxiety and...,depression,"This can be a difficult situation. Typically,...",2,1026
2,I feel like my mother doesn't support me,My mother is combative with me when I say I do...,depression,Do you live with your mom and have constant in...,2,187
3,Why do I feel like I don't belong anywhere?,There are many people willing to lovingly prov...,depression,I truly understand what you are saying. I want...,1,62
4,How can I help my girlfriend?,My girlfriend just quit drinking and she becam...,depression,You're probably not going to like my answer.Yo...,3,824


### Save dataset (2) - best answer

In [19]:
# Export of the best-answer dataset is currently disabled: the call below is
# commented out, so running this cell writes nothing. Uncomment it to write the
# frame (with its questionID index) to CSV.
# NOTE(review): assumes data/processed/ already exists -- confirm before enabling.
# counsel_chat.to_csv("data/processed/counsel-chat-best-answer.csv")