# Subreddit CSV creator

This notebook creates a CSV of subreddits with the required fields. The only filter applied is on size: subreddits with more than 2000 subscribers are kept.

Get the original csv file from here: https://www.kaggle.com/rayraegah/subreddits

In [1]:
from pathlib import Path
import pandas as pd
import numpy as np
from tqdm import tqdm_notebook as tqdm
from reddit_crawler import get_default_reddit_inst

In [2]:
# Load the raw subreddit dump and make the subscriber counts usable as integers:
# values that cannot be parsed as numbers are coerced to NaN and then counted as 0.
csv_file = Path('/scratch/arka/Ark_git_files/subr/subreddits_public.csv')
sub_red_info = pd.read_csv(csv_file)
sub_red_info['subscribers_count'] = (
    pd.to_numeric(sub_red_info['subscribers_count'], errors='coerce')
    .fillna(0)
    .astype(np.int_)
)

In [3]:
# Keep only subreddits with strictly more subscribers than this threshold.
min_sub_count = 2000
is_big_enough = sub_red_info['subscribers_count'] > min_sub_count
sub_red_info = sub_red_info[is_big_enough]
del is_big_enough  # temporary mask; do not leave it in the notebook namespace

In [4]:
# Client object used for every subreddit lookup below. The factory is imported
# from `reddit_crawler`; presumably it returns a configured PRAW `Reddit`
# instance — confirm against that module.
red_crawl = get_default_reddit_inst()

In [5]:
# Preview the frame after the subscriber-count filter.
sub_red_info.head()

Unnamed: 0,base10_id,base36_reddit_id,creation_epoch,subreddit_name,subscribers_count
0,40718.0,t5_vf2,1137700000.0,nsfw,936786
3,95487.0,t5_21of,1140336000.0,olympics,397323
4,96552.0,t5_22i0,1140417000.0,de,108579
5,96554.0,t5_22i2,1140417000.0,es,10734
12,98766.0,t5_247i,1140584000.0,ru,3177


In [6]:
# Exploratory probe: take a handle on one subreddit ('news') to inspect which
# attributes the client exposes.
x1 = red_crawl.subreddit('news')

In [7]:
# Check that the subreddit object exposes `allow_videogifs`, one of the
# attributes read for every subreddit in the enrichment loop.
x1.allow_videogifs

False

In [8]:
# Target column layout: the columns already in the frame, followed by the
# metadata columns that will be filled in from the Reddit client.
header_list = [
    *sub_red_info.columns,
    'allw_imgs',
    'allw_vids',
    'allw_gifs',
    'over18',
    'public_description',
]

In [9]:
# Show the target column order.
header_list

['base10_id',
 'base36_reddit_id',
 'creation_epoch',
 'subreddit_name',
 'subscribers_count',
 'allw_imgs',
 'allw_vids',
 'allw_gifs',
 'over18',
 'public_description']

In [10]:
# Conform the frame to `header_list`; columns that do not exist yet are created
# and filled with NaN.
sub_red_info = sub_red_info.reindex(header_list, axis='columns')

In [11]:
# Enrich every subreddit with metadata read from the Reddit client.
#
# Values are collected per subreddit and written to the frame as whole columns
# after the loop. This avoids mutating the frame while iterating over it and
# avoids writing bool/str values row-by-row into the NaN (float) columns, which
# relies on silent dtype upcasting.
#
# NOTE: the frame is only updated once the loop finishes; if the cell is
# interrupted, the values gathered so far remain available in `fetched`.

# Columns populated here, in the order their values are collected below.
api_cols = ['allw_imgs', 'allw_vids', 'allw_gifs', 'over18', 'public_description']
# Permissive defaults recorded when a subreddit's metadata cannot be read;
# its public description is left missing.
fallback_vals = (True, True, True, False, np.nan)

i = 0               # number of subreddits whose metadata could not be read
pain_list = []      # names of those subreddits, in the order encountered
pain_reasons = {}   # subreddit name -> repr() of the exception that was raised
fetched = []        # one tuple of `api_cols` values per row of `sub_red_info`
for name in tqdm(sub_red_info.subreddit_name, total=len(sub_red_info)):
    t = red_crawl.subreddit(name)
    try:
        fetched.append((
            t.allow_images,
            t.allow_videos,
            t.allow_videogifs,
            t.over18,
            t.public_description,
        ))
    except Exception as e:
        # Best-effort crawl: keep going, but remember which subreddit failed and why.
        i += 1
        pain_list.append(name)
        pain_reasons[name] = repr(e)
        print(i, name, 'woops')
        fetched.append(fallback_vals)

# Transpose the per-row tuples into per-column sequences and assign each column once.
for col, values in zip(api_cols, zip(*fetched)):
    sub_red_info[col] = list(values)


HBox(children=(IntProgress(value=0, max=21767), HTML(value='')))

1 911truth woops
2 smart woops
3 hack woops
4 justice woops
5 linkedin woops
6 Whores woops
7 blackpower woops
8 PhilosophyOfTech woops
9 dragonball woops
10 UniversityofReddit woops
11 amateur_fuck_videos woops
12 ass_cleavage woops
13 surfinggirls woops
14 holocaust woops
15 truebestof woops
16 merica woops
17 sfw_wtf woops
18 ubisoft woops
19 Gone_Mild woops
20 watchpeopledie woops
21 LaughYouUpvote woops
22 SleepingBeauties woops
23 TheAngieCompetition woops
24 Humiliation woops
25 voyeurism woops
26 TheRedPill woops
27 meinmyplace woops
28 milliondollarextreme woops
29 Hentai_Porn woops
30 Handjob_Porn woops
31 Ebony_Porn woops
32 Celebrity_Porn woops
33 Blowjob_Porn woops
34 Anal_Porn woops
35 FULLCOMMUNISM woops
36 Rai woops
37 BestOfCamGirls woops
38 VoyeurSex woops
39 badmathematics woops
40 bigtities woops
41 treatemright woops
42 crowdgroped woops
43 bootyhadmelike woops
44 NSFW__Lesbian woops
45 DirtyKikFriends woops
46 NSFW_SexToday woops
47 Ultraleft woops
48 boobsorkitte

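The exceptions have been ignored: subreddits whose metadata could not be fetched (listed above) are kept with permissive defaults (images, videos and GIFs allowed, not over-18) and no public description.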

The rules can be added later after pruning the dataset

In [12]:
# Preview the frame with the added metadata columns.
sub_red_info.head()

Unnamed: 0,base10_id,base36_reddit_id,creation_epoch,subreddit_name,subscribers_count,allw_imgs,allw_vids,allw_gifs,over18,public_description
0,40718.0,t5_vf2,1137700000.0,nsfw,936786,True,False,True,True,##Not Safe For Work\n\n^(For the porn lover in...
3,95487.0,t5_21of,1140336000.0,olympics,397323,True,True,True,False,Links and discussion about the Olympics and Pa...
4,96552.0,t5_22i0,1140417000.0,de,108579,True,True,True,False,"Das Sammelbecken für alle Deutschsprachler, ha..."
5,96554.0,t5_22i2,1140417000.0,es,10734,True,True,True,False,Reddit en Español para los españoles de España...
12,98766.0,t5_247i,1140584000.0,ru,3177,True,True,True,False,Реддит по-русски! Реддит для русскоговорящих п...


In [13]:
# Write the full table (including public descriptions) with a header row and
# without the index column.
sub_red_info.to_csv(
    './req_subreddits.csv',
    header=True,
    index=False,
)

In [14]:
# Show the column names again for reference when choosing the columns to keep.
header_list

['base10_id',
 'base36_reddit_id',
 'creation_epoch',
 'subreddit_name',
 'subscribers_count',
 'allw_imgs',
 'allw_vids',
 'allw_gifs',
 'over18',
 'public_description']

In [15]:
# Same table without the free-text `public_description` column.
sub_red_info_without_pub_desc = sub_red_info.loc[:, [
    'base10_id',
    'base36_reddit_id',
    'creation_epoch',
    'subreddit_name',
    'subscribers_count',
    'allw_imgs',
    'allw_vids',
    'allw_gifs',
    'over18',
]]

In [17]:
# Write the description-free table with a header row and without the index column.
sub_red_info_without_pub_desc.to_csv(
    './req_subr_without_pub_desc.csv',
    header=True,
    index=False,
)