# Acquisition of Data from Reddit
### Using the web crawling method (Reddit API via PRAW)

In [27]:
# Import Libraries
# PRAW is a Python wrapper for the Reddit API
# pandas is used to build the DataFrames
# datetime converts the posts' Unix timestamps into dates

import datetime as dt
import os

import pandas as pd
import praw

In [28]:
# Authentication against the Reddit API
# The client id and secret are read from the environment so that no secret is
# stored in the notebook. Export REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET
# before starting the kernel; a missing variable raises KeyError right here.
# NOTE(review): a key pair that was ever committed in plain text should be
# treated as leaked and revoked in the Reddit app settings.

reddit = praw.Reddit(client_id=os.environ["REDDIT_CLIENT_ID"],
                     client_secret=os.environ["REDDIT_CLIENT_SECRET"],
                     user_agent="DataMining")

In [29]:
# Popular laptop-brand subreddits selected for scraping

# 1) r/SuggestALaptop
# 2) r/mac
# 3) r/Dell
# 4) r/ASUS
# 5) r/thinkpad
# 6) r/AcerOfficial

## Scraping from subreddits

### Subreddits under hot tag

In [30]:
ml_subreddit = reddit.subreddit('SuggestALaptop')
# One row per submission in the subreddit's "hot" listing; the values are
# listed in the same order as the column names below.
posts1 = pd.DataFrame(
    [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in ml_subreddit.hot(limit=2000)
    ],
    columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'],
)
print(posts1.shape)

(988, 8)


In [31]:
ml_subreddit = reddit.subreddit('mac')
# One row per submission in the subreddit's "hot" listing; the values are
# listed in the same order as the column names below.
posts2 = pd.DataFrame(
    [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in ml_subreddit.hot(limit=2000)
    ],
    columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'],
)
print(posts2.shape)

(1000, 8)


In [32]:
ml_subreddit = reddit.subreddit('Dell')
# One row per submission in the subreddit's "hot" listing; the values are
# listed in the same order as the column names below.
posts3 = pd.DataFrame(
    [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in ml_subreddit.hot(limit=2000)
    ],
    columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'],
)
print(posts3.shape)

(997, 8)


In [33]:
ml_subreddit = reddit.subreddit('ASUS')
# One row per submission in the subreddit's "hot" listing; the values are
# listed in the same order as the column names below.
posts4 = pd.DataFrame(
    [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in ml_subreddit.hot(limit=2000)
    ],
    columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'],
)
print(posts4.shape)

(1000, 8)


In [34]:
ml_subreddit = reddit.subreddit('thinkpad')
# One row per submission in the subreddit's "hot" listing; the values are
# listed in the same order as the column names below.
posts5 = pd.DataFrame(
    [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in ml_subreddit.hot(limit=2000)
    ],
    columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'],
)
print(posts5.shape)

(1000, 8)


In [35]:
ml_subreddit = reddit.subreddit('AcerOfficial')
# One row per submission in the subreddit's "hot" listing; the values are
# listed in the same order as the column names below.
posts6 = pd.DataFrame(
    [
        [post.title, post.score, post.id, post.subreddit, post.url,
         post.num_comments, post.selftext, post.created]
        for post in ml_subreddit.hot(limit=1000)
    ],
    columns=['title', 'score', 'id', 'subreddit', 'url', 'num_comments', 'body', 'created'],
)
print(posts6.shape)

(994, 8)


In [36]:
# Merging the six per-subreddit dataframes into one dataset.
# ignore_index=True gives the result a fresh 0..n-1 index. Without it every
# input frame contributes its own 0-based labels, the merged index contains
# duplicates, and a label lookup such as df_merged.loc[0] returns several rows.

df_merged = pd.concat([posts1, posts2, posts3, posts4, posts5, posts6], ignore_index=True)
df_merged.shape

(5979, 8)

In [37]:
# Fixing the date column

import datetime as dt

def get_date(created):
    """Convert a Unix epoch timestamp into a naive ``datetime``.

    Parameters
    ----------
    created : int or float
        Seconds since the Unix epoch.

    Returns
    -------
    datetime.datetime
        Naive datetime expressed in the local timezone of the machine that
        runs the notebook (``datetime.fromtimestamp`` without a ``tz``).
    """
    to_local_datetime = dt.datetime.fromtimestamp
    return to_local_datetime(created)

# Convert every epoch value in 'created' into a datetime.
created_epochs = df_merged["created"]
_timestamp = created_epochs.apply(get_date)

# Work on a copy so df_merged itself is left without the extra column.
df_merged_hottag = df_merged.copy()
df_merged_hottag["timestamp"] = _timestamp
df_merged_hottag.head(3)

Unnamed: 0,title,score,id,subreddit,url,num_comments,body,created,timestamp
0,Ignore Private Messages Suggesting Laptops,45,g94qma,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,0,The primary aim of r/SuggestALaptop is to prov...,1588036000.0,2020-04-28 09:06:12
1,/R/SuggestALaptop Stress Test Project! Submit ...,42,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP th...,1578295000.0,2020-01-06 15:23:07
2,"I need a laptop for gaming, coding, video edit...",16,gqt1pd,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,15,\*\*LAPTOP QUESTIONNAIRE\*\*\n\n&#x200B;\n\n\*...,1590510000.0,2020-05-27 00:20:20


In [38]:
# Strip leading/trailing whitespace from string values, replace each remaining
# newline with a space, then upper-case the column names.

edge_whitespace = {r'\s+$': '', r'^\s+': ''}
df_merged_hottag = df_merged_hottag.replace(edge_whitespace, regex=True)
df_merged_hottag = df_merged_hottag.replace(r'\n',  ' ', regex=True)
df_merged_hottag.columns = [column.upper() for column in df_merged_hottag.columns]
print(df_merged_hottag.shape)

(5979, 9)


In [39]:
# Write the cleaned posts to CSV with a header row and without the index.
# The 'utf-8-sig' codec puts a byte-order mark at the start of the file.

df_merged_hottag.to_csv(
    'reddit_encoded.csv',
    encoding='utf-8-sig',
    header=True,
    index=False,
)

## Scraping threads under the subreddit topics

### Threads under subreddit hot tag

In [40]:
# Collecting the top-level comments of every scraped post as [post id, comment text]

L = []
for post_id in df_merged_hottag['ID']:
    thread = reddit.submission(id=post_id)

    # limit=0 removes the "load more comments" placeholders instead of
    # fetching them, so only already-loaded comments are iterated below.
    thread.comments.replace_more(limit=0)
    L.extend([post_id, comment.body] for comment in thread.comments)

In [41]:
# Turn the collected [post id, comment text] pairs into a two-column frame.
comment_columns = ['ID', 'COMMENTS']
L = pd.DataFrame(data=L, columns=comment_columns)
print(L.shape)

(14526, 2)


In [42]:
# Building the comments dataframe (ID, COMMENTS, SUBREDDIT): each comment gets
# the subreddit of its post through an inner join on the post ID.

df2 = df_merged_hottag.loc[:, ['ID', 'SUBREDDIT']]
df_comments = L.merge(df2, how='inner', on='ID')
print(df_comments.shape)
df_comments.head(3)

(14526, 3)


Unnamed: 0,ID,COMMENTS,SUBREDDIT
0,ekkvox,If you have questions our [Discord Server](ht...,SuggestALaptop
1,ekkvox,Can anyone help me out with purchasing a lapto...,SuggestALaptop
2,gqt1pd,Ask chewy,SuggestALaptop


In [43]:
# Strip leading/trailing whitespace from string values, then replace each
# remaining newline with a space.

edge_whitespace = {r'\s+$': '', r'^\s+': ''}
df_comments = df_comments.replace(edge_whitespace, regex=True)
df_comments = df_comments.replace(r'\n',  ' ', regex=True)
print(df_comments.shape)

(14526, 3)


In [45]:
# Write the cleaned comments to CSV with a header row and without the index.
# The 'utf-8-sig' codec puts a byte-order mark at the start of the file.

df_comments.to_csv(
    'comments_encoded.csv',
    encoding='utf-8-sig',
    header=True,
    index=False,
)