# PsyVBot
---

### This is used to scape data from Reddit API and create a dataset

In [28]:
import os
import praw
import nltk
import string
import threading
import numpy as np
import pandas as pd
from dotenv import load_dotenv
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer

In [2]:
load_dotenv()

CLIENT_ID = os.getenv('CLIENT_ID')
SECRET_KEY = os.getenv('SECRET_KEY')

In [3]:
user_agent = "PsyVBot/0.1 by /u/_thaveesha_"
reddit = praw.Reddit(
    client_id = CLIENT_ID,
    client_secret = SECRET_KEY,
    user_agent = user_agent
)

In [9]:
# to store depression text data
depression_text = []
# to store anxiety text data
anxiety_text = []

def depression_data_loop():
    for submission in reddit.subreddit('depression').hot(limit=None):
        for comment in submission.comments:
            depression_text.append(comment.body)
            print('Length of depression_text list: ', len(depression_text), end='\n')
            df1 = pd.DataFrame(depression_text, columns=['text'])
            df1.to_csv('../datasets/depression_reddit_dataset.csv', header=False, encoding='utf-8', index=False)

def anxiety_data_loop():
    for submission in reddit.subreddit('Anxiety').hot(limit=None):
        for comment in submission.comments:
            anxiety_text.append(comment.body)
            print('Length of anxiety_text list: ', len(anxiety_text), end='\n') 
            df2 = pd.DataFrame(anxiety_text, columns=['text'])
            df2.to_csv('../datasets/anxiety_reddit_dataset.csv', header=False, encoding='utf-8', index=False)

if __name__ =="__main__":
    # creating thread
    t1 = threading.Thread(target=depression_data_loop)
    t2 = threading.Thread(target=anxiety_data_loop)
 
    # starting thread 1
    t1.start()
    # starting thread 2
    t2.start()

Length of anxiety_text list:  1
Length of anxiety_text list:  2
Length of anxiety_text list:  3
Length of anxiety_text list:  4
Length of anxiety_text list:  5
Length of anxiety_text list:  6
Length of anxiety_text list:  7
Length of anxiety_text list:  8
Length of anxiety_text list:  9
Length of anxiety_text list:  10
Length of anxiety_text list:  11
Length of anxiety_text list:  12
Length of anxiety_text list:  13
Length of anxiety_text list:  14
Length of anxiety_text list:  15
Length of anxiety_text list:  16
Length of anxiety_text list:  17
Length of anxiety_text list:  18
Length of anxiety_text list:  19
Length of anxiety_text list:  20
Length of anxiety_text list:  21
Length of anxiety_text list:  22
Length of anxiety_text list:  23
Length of anxiety_text list:  24
Length of depression_text list:  1
Length of depression_text list:  2
Length of depression_text list:  3
Length of depression_text list:  4
Length of depression_text list:  5
Length of depression_text list:  6
Length 

In [14]:
df1 = pd.read_csv('../datasets/depression_reddit_dataset.csv')
df1['type'] = 'depression'
df1.columns = ['text', 'type']
df1.tail()

Unnamed: 0,text,type
1541,"It depended for me. Sometimes a few days, some...",depression
1542,I have regular depression but I get reoccurrin...,depression
1543,Amm... It's supposed to come in episodes?\n\n\...,depression
1544,Trying another therapist is a very good idea. ...,depression
1545,Same life is weird lol,depression


In [17]:
df2 = pd.read_csv('../datasets/anxiety_reddit_dataset.csv')
df2['type'] = 'anxiety'
df2.columns = ['text', 'type']
df2.tail()

Unnamed: 0,text,type
2988,"Breathing exercises, 4-7-8 stopped them comple...",anxiety
2989,"Take my meds, listen to music, think about the...",anxiety
2990,Yea I do breathwork too!!! With a guided medit...,anxiety
2991,It's just your body still physically reacting ...,anxiety
2992,I wish you the best \n\nPlease report to us to...,anxiety


In [32]:
df = pd.concat([df1, df2], ignore_index=True)
df

Unnamed: 0,text,type
0,Same here. Everyday for so long now.,depression
1,I have an awesome friend who turns all my depr...,depression
2,Me too friend. Hang in there. Hopefully this f...,depression
3,The only time I left my room the past 6 months...,depression
4,I know it's only a small thing.. but even tho...,depression
...,...,...
4534,"Breathing exercises, 4-7-8 stopped them comple...",anxiety
4535,"Take my meds, listen to music, think about the...",anxiety
4536,Yea I do breathwork too!!! With a guided medit...,anxiety
4537,It's just your body still physically reacting ...,anxiety


In [33]:
df.to_csv('../datasets/reddit_dataset.csv', header=False, encoding='utf-8', index=False)
df = pd.read_csv('../datasets/reddit_dataset.csv')
df.columns = ['text', 'type']
df.tail()

Unnamed: 0,text,type
4533,"Breathing exercises, 4-7-8 stopped them comple...",anxiety
4534,"Take my meds, listen to music, think about the...",anxiety
4535,Yea I do breathwork too!!! With a guided medit...,anxiety
4536,It's just your body still physically reacting ...,anxiety
4537,I wish you the best \n\nPlease report to us to...,anxiety


In [34]:
def remove_punctuation(text):
    # check if the input is a string
    if isinstance(text, str):
        # create a translation table with punctuation marks mapped to None
        translator = str.maketrans('', '', string.punctuation)
        # remove punctuation using the translation table
        return text.translate(translator)
    else:
        # if the input is not a string, return the input unchanged
        return text

# apply the remove_punctuation function to the 'A' column
df['text'] = df['text'].apply(lambda x: remove_punctuation(x))
df.head()

Unnamed: 0,text,type
0,I have an awesome friend who turns all my depr...,depression
1,Me too friend Hang in there Hopefully this fee...,depression
2,The only time I left my room the past 6 months...,depression
3,I know its only a small thing but even though...,depression
4,I could have written this myself Wow,depression


In [22]:
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/thaveesha/Developer/projects/PsyVBot/nltk_data.
[nltk_data]     ..
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/thaveesha/Developer/projects/PsyVBot/nltk_data.
[nltk_data]     ..
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/thaveesha/Developer/projects/PsyVBot/nltk_data.
[nltk_data]     ..
[nltk_data]   Package punkt is already up-to-date!


True

In [35]:
# Tokenize sentences into words
df['text'] = df['text'].apply(lambda x: word_tokenize(x.lower()))

# Remove stop words
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: [word for word in x if word not in stop_words])

# Apply lemmatization
lemmatizer = WordNetLemmatizer()
df['text'] = df['text'].apply(lambda x: [lemmatizer.lemmatize(word) for word in x])

df.head()

Unnamed: 0,text,type
0,"[awesome, friend, turn, depression, trait, sel...",depression
1,"[friend, hang, hopefully, feeling, pass, soon]",depression
2,"[time, left, room, past, 6, month, trip, hospi...",depression
3,"[know, small, thing, even, though, youre, bed,...",depression
4,"[could, written, wow]",depression


In [36]:
df.to_csv('../datasets/cleaned_reddit_dataset.csv', header=False, encoding='utf-8', index=False)
df = pd.read_csv('../datasets/cleaned_reddit_dataset.csv')
df.columns = ['text', 'type']
df.tail()

Unnamed: 0,text,type
4532,"['breathing', 'exercise', '478', 'stopped', 'c...",anxiety
4533,"['take', 'med', 'listen', 'music', 'think', 'f...",anxiety
4534,"['yea', 'breathwork', 'guided', 'meditation', ...",anxiety
4535,"['body', 'still', 'physically', 'reacting', 'm...",anxiety
4536,"['wish', 'best', 'please', 'report', 'u', 'tom...",anxiety
