# Import your Libraries

In [9]:
# Scraping Reddit with Python
import praw
# Make easy to read
from rich import print

# Load credentials with python-dotenv

In [None]:
import os
from dotenv import load_dotenv

# Merge the key/value pairs from the local .env file into the process environment.
load_dotenv()

# Each constant is None when the matching variable is not set.
# Hugging Face Hub token.
HUGGINGFACE_TOKEN = os.environ.get("HUGGINGFACE_TOKEN")
# Reddit application credentials.
REDDIT_CLIENT_ID = os.environ.get("REDDIT_CLIENT_ID")
REDDIT_CLIENT_SECRET = os.environ.get("REDDIT_CLIENT_SECRET")
# Reddit account used to log in.
REDDIT_USERNAME = os.environ.get("REDDIT_USERNAME")
REDDIT_PASSWORD = os.environ.get("REDDIT_PASSWORD")
# User-agent string sent with each request.
REDDIT_USER_AGENT = os.environ.get("REDDIT_USER_AGENT")

# Call the Reddit API
1. User-Agent (string) - A short description of what the bot does. For example: "Python:Sentdex Sentiment Analysis v0.1 (by /u/sentdex) , scraped_utd_reddit_bot"
2. Client-ID (string) - 14 character key that is generated when you create an app. For example: "Jxk7Jxk7Jxk7Jxk"

In [10]:
# Gather the credentials read from the environment, then build the PRAW client from them.
reddit_credentials = {
    "client_id": REDDIT_CLIENT_ID,
    "client_secret": REDDIT_CLIENT_SECRET,
    "username": REDDIT_USERNAME,
    "password": REDDIT_PASSWORD,
    "user_agent": REDDIT_USER_AGENT,
}
reddits = praw.Reddit(**reddit_credentials)

# Select a subreddit

In [11]:
# Community to scrape; the resulting handle is reused by the cells below.
SUBREDDIT_NAME = "utdallas"
subreddit = reddits.subreddit(SUBREDDIT_NAME)

# Get the hot posts

In [12]:
# Request the three current "hot" submissions; show what kind of object comes back.
hot_utd = subreddit.hot(limit=3)
print(f"The data type of hot_utd is {type(hot_utd)}")

# Walk the listing and show each title.
print("Top 3 posts on r/utdallas")
for post in hot_utd:
    post_title = post.title
    print(post_title)

# Get all of the submissions from r/utdallas
1. Get each Reddit post
    1. Comments (I have to go through the comment forest)
        1. Get the id of each comment, which is a reply to the original post
        2. Go through the comment forest and get the id of each comment

In [31]:
def _author_name(item) -> str:
    """Return ``item.author.name``, or ``"unknown"`` when the author is missing or has no name."""
    author = getattr(item, "author", None)
    return getattr(author, "name", "unknown")


def _comment_record(comment) -> dict:
    """Build the ``body`` / ``author`` / ``score`` record for one comment or reply."""
    return {
        "body": comment.body,
        "author": _author_name(comment),
        "score": comment.score,
    }


def _reply_records(replies) -> dict:
    """Map each direct reply's id to its record, skipping replies that lack an expected attribute."""
    records = {}
    for reply in replies:
        try:
            records[reply.id] = _comment_record(reply)
        except AttributeError as error:
            # One malformed reply must not discard its siblings: report it and move on.
            print(f"The author of the reply is {getattr(reply, 'author', None)}")
            print(error)
    return records


def get_reddit_data(subreddit, subreddit_limit: int = 10, comment_limit: int = 10):
    """
    Collect the all-time top submissions of a subreddit together with their comments.

    Parameters
    ----------
    subreddit:
        Object exposing ``top(limit=..., time_filter=...)`` that yields submissions
        (here, the handle returned by ``reddits.subreddit(...)``).
    subreddit_limit:
        Maximum number of submissions to request from ``subreddit.top``.
    comment_limit:
        Value forwarded to ``submission.comments.replace_more(limit=...)``.

    Returns
    -------
    tuple
        ``(all_element_dict, titles, self_texts)`` where

        * ``all_element_dict`` maps each submission id to a dict with the keys
          ``title``, ``selftext``, ``author``, ``num_comments``, ``permalink``,
          ``url`` and ``comments``. ``comments`` maps each comment id (from the
          flattened ``submission.comments.list()``) to ``body``, ``author``,
          ``score`` and, when the comment has direct replies, ``replies``
          (reply id -> ``body`` / ``author`` / ``score``).
        * ``titles`` and ``self_texts`` are parallel lists with one entry per
          submission actually returned (no ``None`` padding when fewer than
          ``subreddit_limit`` submissions exist).

    Notes
    -----
    A missing author is stored as ``"unknown"``. A comment or reply that raises
    ``AttributeError`` is reported with ``print`` and skipped; the remaining
    comments are still processed.
    """
    all_element_dict = {}
    # Grown per submission so a short listing (or limit=None) leaves no padding.
    titles = []
    self_texts = []

    for submission in subreddit.top(limit=subreddit_limit, time_filter="all"):
        submission_id = submission.id
        titles.append(submission.title)
        self_texts.append(submission.selftext)

        comments = {}
        all_element_dict[submission_id] = {
            "title": submission.title,
            "selftext": submission.selftext,
            "author": _author_name(submission),
            "num_comments": submission.num_comments,
            "permalink": submission.permalink,
            "url": submission.url,
            "comments": comments,
        }

        # Resolve the comment forest before flattening it.
        submission.comments.replace_more(limit=comment_limit)
        for comment in submission.comments.list():
            try:
                comment_id = comment.id
                record = _comment_record(comment)
                replies = comment.replies
                # The "replies" key is only present when there is at least one reply.
                if replies is not None and len(replies) > 0:
                    record["replies"] = _reply_records(replies)
            except AttributeError as error:
                # Bind the exception here: an `except ... as` name from another
                # handler is unbound once that handler finishes.
                print(f"The author of the comment is {getattr(comment, 'author', None)}")
                print(error)
                continue
            comments[comment_id] = record

    return all_element_dict, titles, self_texts
# Scrape up to 100 top submissions; comment_limit is forwarded to replace_more for each one.
all_element_dict, titles, self_texts = get_reddit_data(
    subreddit=subreddit,
    subreddit_limit=100,
    comment_limit=100,
)
#print(all_element_dict)
    

# Save the dictionary to a JSON file

In [32]:
# Convert the dictionary to a json files names utd_reddit.json
import json
# Report how many submissions were collected.
print(f"The size of the dictionary is {len(all_element_dict)}")

# Serialize the nested submission dictionary to utd_reddit.json.
with open("utd_reddit.json", mode="w") as json_file:
    json.dump(all_element_dict, json_file)


# Convert the lists into a pandas DataFrame

In [33]:
## Convert the two list into a panda dataframe
import pandas as pd
# One column per list: each row pairs a submission title with its self-text.
submission_columns = {"title": titles, "selftext": self_texts}
df = pd.DataFrame(submission_columns)

# Show the first rows as a quick check.
preview = df.head()
print(preview)

# Convert the pandas DataFrame to a Hugging Face Dataset and push it to the Hub

In [None]:
from datasets import Dataset
# Wrap the title/selftext DataFrame in a Hugging Face Dataset.
datatset_pd = Dataset.from_pandas(df)
# Publish it to the Hub. `Dataset` has no `upload_dataset` method; `push_to_hub`
# is the upload call (the same one used for the JSON-based dataset later on).
datatset_pd.push_to_hub("utd_reddit_pd", token=HUGGINGFACE_TOKEN)

# Load the JSON file into a DatasetDict (from the Hugging Face `datasets` library)

In [34]:
from datasets import load_dataset

# Read the saved JSON file with the generic "json" dataset builder.
datatset = load_dataset(
    path="json",
    data_files="utd_reddit.json",
)

Using custom data configuration default-185618a055c924a1


Downloading and preparing dataset json/default to /home/null/.cache/huggingface/datasets/json/default-185618a055c924a1/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 1868.29it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 197.26it/s]
                                                            

Dataset json downloaded and prepared to /home/null/.cache/huggingface/datasets/json/default-185618a055c924a1/0.0.0/0f7e3662623656454fcd2b650f34e886a7db4b9104504885bd462096cc7a9f51. Subsequent calls will reuse this data.


100%|██████████| 1/1 [00:01<00:00,  1.15s/it]


# Upload the Dataset to Hugging Face Hub

In [35]:
# Publish the JSON-derived dataset under the "utd_reddit" repository name.
datatset.push_to_hub(
    repo_id="utd_reddit",
    token=HUGGINGFACE_TOKEN,
)

Pushing split train to the Hub.
Pushing dataset shards to the dataset hub: 100%|██████████| 1/1 [00:04<00:00,  4.64s/it]


FileNotFoundError: https://huggingface.co/datasets/Rami/utd_reddit/resolve/main/data/train-00000-of-00001-5025aa51fdffbdb4.parquet