## Reddit API
- In the following document we will try to extract reddit post:
    - As a first approach we will collect data marked by keyboards relatedt to adhd found in related subreddits
    - To improve our model we can modify our model to identify by it's own (using old data for example) the subreddits that could be intereesting 
    to scrap.


    General composition of a post:
    -  id: The post’s ID
    -  title: The post’s title
    -  text: The post’s text
    -  author: The post’s author
    -  created_utc: The post’s creation time in UTC
    -  score: The post’s score
    -  num_comments: The number of comments on the post
    -  permalink: The post’s permalink

## Connections


In [1]:
import os
from dotenv import load_dotenv
import praw

#### Connection to the Reddit APi

In [9]:

# Load environment variables from .env file
load_dotenv()

try:
    reddit = praw.Reddit(
        client_id=os.getenv("REDDIT_CLIENT_ID"),
        client_secret=os.getenv("REDDIT_CLIENT_SECRET"),
        user_agent=os.getenv("REDDIT_USER_AGENT"),
        username=os.getenv("REDDIT_USERNAME"),
        password=os.getenv("REDDIT_PASSWORD")
    )
    print(f"Connected! Logged in as: {reddit.user.me()}")
except Exception as e:
    print("An error occurred:", e)

Connected! Logged in as: ProfessorMiddle1326


#### Connect to Mongdb

In [31]:
from pymongo import MongoClient

# Load environment variables from .env file
mongo='127.0.0.1'

try:
    # Connect to MongoDB
    myclient = MongoClient(
                        "mongodb://"+mongo+":27017/") #Mongo URI format
    db=myclient['Ingestion_db']
    
    print("Connected to MongoDB successfully!")
except Exception as e:
    print("An error occurred while connecting to MongoDB:", e)

Connected to MongoDB successfully!


 #### Connect to reddis
 

In [17]:
import redis

r = redis.Redis(host='127.0.0.1', port=6379, db=0)

In [36]:
r.delete('reddit_posts')

1

In [32]:
query=db.reddit_ingestion.find({},{'_id':0,'id':1})
result=list(query)
print(result)

[{'id': '1et3kj0'}, {'id': '1gk5ftv'}, {'id': '1f0k9en'}, {'id': '1g7tbb2'}, {'id': '1dy8uqs'}, {'id': '1g65j41'}, {'id': '1g4vvkg'}, {'id': '1f5ioxz'}, {'id': '1grrk8s'}, {'id': '1bmo2m1'}, {'id': '1h2qod8'}, {'id': '1ew84dl'}, {'id': '1ejdhxs'}, {'id': '1bumqny'}, {'id': '1fhwhju'}, {'id': '1dfw2rj'}, {'id': '1cznhgt'}, {'id': '1eiiiwb'}, {'id': '1f554ps'}, {'id': '1e656lw'}, {'id': '1eo6cxc'}, {'id': '1eu7kfx'}, {'id': '1b1ckqt'}, {'id': '1egxdv1'}, {'id': '1680tdo'}, {'id': '1fjvqvb'}, {'id': '16bnu2r'}, {'id': '1f7sxm9'}, {'id': '1ervz6i'}, {'id': '16ys17h'}, {'id': '1f17b8g'}, {'id': '1e3qk7c'}, {'id': '1awdptq'}, {'id': '1fmq2s8'}, {'id': '1c2b9vo'}, {'id': '1gwse30'}, {'id': '1fqb1v9'}, {'id': 'y6b6mq'}, {'id': '1gqjflv'}, {'id': '1dewz72'}, {'id': '1gfyu6g'}, {'id': '11mane9'}, {'id': '1gwdmji'}, {'id': '1elh37m'}, {'id': '1dnreeb'}, {'id': '1gw1cs3'}, {'id': '1dfci9p'}, {'id': '1crvslx'}, {'id': '1ffgv0r'}, {'id': '15s869r'}, {'id': '1g6pw2m'}, {'id': '1cbehdl'}, {'id': '14df

In [33]:
for el in result:
    if(r.sadd('reddit_posts', el['id'])):
        continue        
    else:
        print("Post already exists in redis",el['id'])
        continue

In [39]:
r.smembers('reddit_posts')

{b'1gveo42',
 b'1d4k78a',
 b'12mz3xk',
 b'109tre3',
 b'1h2lyyt',
 b'1h4wx4r',
 b'zu7nap',
 b'1gzb42w',
 b'1g31rzj',
 b'1gzjmu4',
 b'1drg1pb',
 b'1gb2yxz',
 b'jih256',
 b'1gxil6l',
 b'1fzge8j',
 b'p2kquy',
 b'1gxias3',
 b'1gec6qo',
 b'1bcwuax',
 b'wjyza4',
 b'14477j9',
 b'1aoaxkn',
 b'1gvk4fv',
 b'1gy8052',
 b'1fzw5oz',
 b'1gs9jq9',
 b'gamgj3',
 b'1dsscp8',
 b'1h3e55z',
 b'155ots7',
 b'1cpjpop',
 b'1h3ojhd',
 b'w50kn2',
 b'1g0qxbc',
 b'1fof11t',
 b'11hok97',
 b'1f8bw9j',
 b'1gmftgl',
 b'hr6rs9',
 b'1gi34dy',
 b'1gr9sn7',
 b'1gq8auw',
 b'192emfj',
 b'1gzjn03',
 b'1gul3y4',
 b'1gy3qfc',
 b'1g89ykl',
 b'1h4esu2',
 b'1gir5by',
 b'1ekkv9p',
 b'mf1699',
 b'1e97bnk',
 b'1gyz5aj',
 b'12j3j3j',
 b'17gnogz',
 b'1bahdwc',
 b'1elr34l',
 b'18wb4q0',
 b'1btdusj',
 b'1fupcaz',
 b'1f6kx90',
 b'1fpb03s',
 b'ojma3v',
 b'1gvwydk',
 b'1bpyeps',
 b'1e2o2do',
 b'1gz4a98',
 b'ko1pmt',
 b'1g1ckey',
 b'1et3kj0',
 b'1fm857i',
 b'1gnroj4',
 b'1g83rze',
 b'ery5ye',
 b'1gun9zs',
 b'1ctm7hy',
 b'1h1g4bd',
 b'1h4zdec

In [14]:
print(r.type('reddit_posts'))  # Output: set, string, list, hash, etc.
list_elements = r.lrange('reddit_posts', 0, -1)
print("List of elements in the key 'reddit_posts':", list_elements)


b'string'


ResponseError: WRONGTYPE Operation against a key holding the wrong kind of value

## Scrap Data Using Research on reddit

#### Scrapp usging the search bar in reddit (search using the keywords):
- Define the keywords for research 
["adhd", "diagnose","energy", "brain", "test", "distracted", "forgetful", "doctor","work","task","disord","struggl","focu","dysfunct","forgot","lazi","prescrib","medic","medicin","pill","self diagnosis","self medication"]
- Different sorting technics ["relevance", "hot", "top", "new", "comments"]


In [20]:
Querykeywords=["adhd", "diagnose","energy", "brain", "test", "distracted", "forgetful", "doctor"
                  ,"work","task","disord","struggl","focu","dysfunct","forgot","lazi","prescrib","medic","medicin","pill","self diagnosis","self medication"]
sortingTechniques=["relevance", "hot", "top", "new", "comments"]


In [18]:
import datetime

def get_month(datetime):
    return datetime.month

def get_year(datetime):
    return datetime.year

def get_utc_time(timestamp):
    # Convert timestamp to UTC datetime
    utc_time = datetime.datetime.utcfromtimestamp(timestamp)
    return utc_time

In [21]:
import pandas as pd
# Subreddit to target
subreddit_name = "ADHD"
subreddit = reddit.subreddit(subreddit_name)
posts = []

#Querykeywords=["self diagnosis","self medication"]

for keyword in Querykeywords:
    for sorting in sortingTechniques:
        print("Searching for keyword:", keyword, "using sorting technique:", sorting)
        for post in subreddit.search(query=keyword,sort=sorting,syntax='cloudsearch',time_filter='all',limit=10000):# 'hot', 'new', or 'top' post    
            datecreated=get_utc_time(post.created_utc)
            year=datecreated.year
            if(year>2019) and (r.sadd('reddit_posts', post.id)):
                posts.append({
                "id":post.id,
                "title": post.title,
                "author": str(post.author),
                "score": post.score,
                "num_comments": post.num_comments,
                "upvote_ratio": post.upvote_ratio,
                "url": post.url,
                "subreddit": post.subreddit.display_name,
                "created_at": post.created_utc,
                "self_text": post.selftext, 
                "searchQuery":keyword,  
                "augmented":0
            })
        
db.reddit_posts.insert_many(posts)
          
            
    
    



Searching for keyword: adhd using sorting technique: relevance


  utc_time = datetime.datetime.utcfromtimestamp(timestamp)


Searching for keyword: adhd using sorting technique: hot
Searching for keyword: adhd using sorting technique: top
Searching for keyword: adhd using sorting technique: new
Searching for keyword: adhd using sorting technique: comments
Searching for keyword: diagnose using sorting technique: relevance
Searching for keyword: diagnose using sorting technique: hot
Searching for keyword: diagnose using sorting technique: top
Searching for keyword: diagnose using sorting technique: new
Searching for keyword: diagnose using sorting technique: comments
Searching for keyword: energy using sorting technique: relevance
Searching for keyword: energy using sorting technique: hot
Searching for keyword: energy using sorting technique: top
Searching for keyword: energy using sorting technique: new
Searching for keyword: energy using sorting technique: comments
Searching for keyword: brain using sorting technique: relevance
Searching for keyword: brain using sorting technique: hot
Searching for keyword: 

InsertManyResult([ObjectId('6759b2886ff094aec1f11277'), ObjectId('6759b2886ff094aec1f11278'), ObjectId('6759b2886ff094aec1f11279'), ObjectId('6759b2886ff094aec1f1127a'), ObjectId('6759b2886ff094aec1f1127b'), ObjectId('6759b2886ff094aec1f1127c'), ObjectId('6759b2886ff094aec1f1127d'), ObjectId('6759b2886ff094aec1f1127e'), ObjectId('6759b2886ff094aec1f1127f'), ObjectId('6759b2886ff094aec1f11280'), ObjectId('6759b2886ff094aec1f11281'), ObjectId('6759b2886ff094aec1f11282'), ObjectId('6759b2886ff094aec1f11283'), ObjectId('6759b2886ff094aec1f11284'), ObjectId('6759b2886ff094aec1f11285'), ObjectId('6759b2886ff094aec1f11286'), ObjectId('6759b2886ff094aec1f11287'), ObjectId('6759b2886ff094aec1f11288'), ObjectId('6759b2886ff094aec1f11289'), ObjectId('6759b2886ff094aec1f1128a'), ObjectId('6759b2886ff094aec1f1128b'), ObjectId('6759b2886ff094aec1f1128c'), ObjectId('6759b2886ff094aec1f1128d'), ObjectId('6759b2886ff094aec1f1128e'), ObjectId('6759b2886ff094aec1f1128f'), ObjectId('6759b2886ff094aec1f112

In [41]:
# Fetch all documents from the 'reddit_posts' collection
all_documents = db.reddit_staging.find({},{'_id':0,'Gender':0,'Mention of Solutions':0,
                                           'Personal_Experience':0,'Self-Diagnosis':0,
                                           'Self-Medication':0,'Sentiment':0,'Topic':0,'augmented':0})

# Convert the documents to a list and print them
documents_list = list(all_documents)
print(documents_list[0])

{'id': '1et3kj0', 'title': 'Diagnosed with Inattentive ADHD at 31. Explains so many things from my childhood.', 'author': 'amadnomad', 'score': 392, 'num_comments': 107, 'upvote_ratio': 0.98, 'url': 'https://www.reddit.com/r/ADHD/comments/1et3kj0/diagnosed_with_inattentive_adhd_at_31_explains_so/', 'subreddit': 'ADHD', 'created_at': 1723748809.0, 'self_text': "Please go out and get tested if you are still on the fence. I always assumed ADHD was only hyperactive. A lot of concerns about day dreaming, zoning out and inattentiveness came into play during my consult. I didn't even consider my lack of sleep being tied to ADHD. But now that I have a diagnoses, it explains quite a bit from my past. I wasn't just lazy and disorganized. \n\n  \nAgain, please go get tested if you suspect anything.", 'searchQuery': 'adhd'}


In [42]:
# Remove the 'augmented' column from each JSON object in the list
for document in documents_list:
    document['staged']=1

# Verify the change
print(documents_list[0])

{'id': '1et3kj0', 'title': 'Diagnosed with Inattentive ADHD at 31. Explains so many things from my childhood.', 'author': 'amadnomad', 'score': 392, 'num_comments': 107, 'upvote_ratio': 0.98, 'url': 'https://www.reddit.com/r/ADHD/comments/1et3kj0/diagnosed_with_inattentive_adhd_at_31_explains_so/', 'subreddit': 'ADHD', 'created_at': 1723748809.0, 'self_text': "Please go out and get tested if you are still on the fence. I always assumed ADHD was only hyperactive. A lot of concerns about day dreaming, zoning out and inattentiveness came into play during my consult. I didn't even consider my lack of sleep being tied to ADHD. But now that I have a diagnoses, it explains quite a bit from my past. I wasn't just lazy and disorganized. \n\n  \nAgain, please go get tested if you suspect anything.", 'searchQuery': 'adhd', 'staged': 1}


In [26]:
db_ingest=myclient['Ingestion_db']

In [43]:
try:
    db_ingest.reddit_ingestion.insert_many(documents_list)
except Exception as e:
    print("An error occurred while inserting documents:", e)

In [32]:
db_stage=myclient['Staging_db']

In [30]:
db=myclient['reddit']

In [36]:
reddits_post_augmented=db.reddit_posts.find({},{'_id':0})

In [37]:
reddits_post_augmented_list = list(reddits_post_augmented)

for document in reddits_post_augmented_list:
    document['augmented']=0
    
    



In [39]:
len(reddits_post_augmented_list)

9588

In [40]:
db_stage.reddit_llm.insert_many(reddits_post_augmented_list)

InsertManyResult([ObjectId('6780f8344eb12a9ee92cab7e'), ObjectId('6780f8344eb12a9ee92cab7f'), ObjectId('6780f8344eb12a9ee92cab80'), ObjectId('6780f8344eb12a9ee92cab81'), ObjectId('6780f8344eb12a9ee92cab82'), ObjectId('6780f8344eb12a9ee92cab83'), ObjectId('6780f8344eb12a9ee92cab84'), ObjectId('6780f8344eb12a9ee92cab85'), ObjectId('6780f8344eb12a9ee92cab86'), ObjectId('6780f8344eb12a9ee92cab87'), ObjectId('6780f8344eb12a9ee92cab88'), ObjectId('6780f8344eb12a9ee92cab89'), ObjectId('6780f8344eb12a9ee92cab8a'), ObjectId('6780f8344eb12a9ee92cab8b'), ObjectId('6780f8344eb12a9ee92cab8c'), ObjectId('6780f8344eb12a9ee92cab8d'), ObjectId('6780f8344eb12a9ee92cab8e'), ObjectId('6780f8344eb12a9ee92cab8f'), ObjectId('6780f8344eb12a9ee92cab90'), ObjectId('6780f8344eb12a9ee92cab91'), ObjectId('6780f8344eb12a9ee92cab92'), ObjectId('6780f8344eb12a9ee92cab93'), ObjectId('6780f8344eb12a9ee92cab94'), ObjectId('6780f8344eb12a9ee92cab95'), ObjectId('6780f8344eb12a9ee92cab96'), ObjectId('6780f8344eb12a9ee92cab

In [15]:
query=db_ingest.reddit_ingestion.find({'staged':-1},{'_id':0,'id':1})
liste=[]
for document in query:
    liste.append(document['id'])
print(liste)

['1eo6cxc', '1f7sxm9', '1cu4hku', '1e1tp5j', '197lq81', '1gfbg21', '1dczf6c', '18fybh1', '1cq6nu1', '1b7bpq4', '1er0m8a', '15t6ubb', '12ajtwk', '19f8tw3', 'uu7el5', '12kntyg', 'y8bucp', '1eagqfk', '10sbhvi', '1gj5fc7', 'yvi6iy', '1ghlv8b', '13akjsa', '16840d5', '1be9bbt', '1ftdncb', 'oo0mck', '1dkbsm0', '1dwhg9r', 'zy0r3a', '1gp7sj7', '10g22pl', '1b0qgdr', 'r64p2l', '1fy2p0y', 'zl0q28', 'okswlm', 'xd0mo6', 'xnrvxq', '18u1sam', '1eg7dxd', 'ho6bmu', 'k5gs84', 'sx7hov', '1egm6vn', '1c5rxei', '1gp3cjb', '152csat', '1f268er', '17qvupr', '186r040', '1es8xam', '1cggq6w', '16e7urw', '1gne2nn', '15520ax', '1h4wp08', '1h4plib', '1h4wvto', '1h4r01s', '1h4ouzf', '1h449ad', '1h4esu2', '1h3ztjt', '1h40gpy', '1h4n61t', '1h4kwuw', '1h3pl9r', '1h4lcyg', '1h3xhzt', '1h4n9th', '1h32k6f', '1h3teyb', '1h47qku', '1h4k3at', '1h2ard0', '1h2w9nk', '1h2mul3', '1h38fha', '1h25ii2', '1h2fdis', '1h1pvl0', '1h1ylms', '1h1ml4z', '1h29qmd', '1h23tpi', '1h18p3j', '1h2a4tm', '1h1sy2i', '1h16a19', '1h1dkzj', '1h24rb3', 

In [27]:
for id in liste:
    db_ingest.reddit_ingestion.update_one({'id':id},{'$set':{'staged':1}})

In [28]:
query=db_ingest.reddit_ingestion.find({'staged':0}).limit(5)

In [29]:
liste=[]
for document in query:
    liste.append(document)
print(liste)


[{'_id': ObjectId('6780f8724eb12a9ee92cd367'), 'id': 'k8fl7d', 'title': "I love how so many of these posts are long walls of text yet most of us don't have the attention span to get through a paragraph", 'author': 'statusconference', 'score': 4341, 'num_comments': 246, 'upvote_ratio': 0.99, 'url': 'https://www.reddit.com/r/ADHD/comments/k8fl7d/i_love_how_so_many_of_these_posts_are_long_walls/', 'subreddit': 'ADHD', 'created_at': 1607343305.0, 'self_text': 'Bolding your text and spacing things out make it easier for your readers! \n\nPutting a TL;DR summary at the start or end of your post is also a big plus.', 'searchQuery': 'adhd', 'staged': 0}, {'_id': ObjectId('6780f8724eb12a9ee92cd3ef'), 'id': '1h4i8ka', 'title': "I've completely screwed myself", 'author': 'Strange_Edge', 'score': 3, 'num_comments': 8, 'upvote_ratio': 1.0, 'url': 'https://www.reddit.com/r/ADHD/comments/1h4i8ka/ive_completely_screwed_myself/', 'subreddit': 'ADHD', 'created_at': 1733099722.0, 'self_text': "I'm in a c

In [33]:
query=db_stage.reddit_llm.find({'staged':0},{'_id':0,'id':1})
liste=[]
for document in query:
    liste.append(document['id'])
print(len(liste))

203


In [35]:
for id in liste:
    db_stage.reddit_llm.update_many({'id': id}, {'$unset': {'staged': 1}})