## Set-up

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
import pandas as pd
import numpy as np

# Import NLP libraries
import re
import nltk
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('words')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True

In [None]:
# Load dataframe

path = '/content/drive/Shareddrives/CMPE295-TeamEquality/RQAR Code/2. Data Cleaning/comments.csv'
comments_df = pd.read_csv(path)
comments_df = comments_df.drop_duplicates()

path = '/content/drive/Shareddrives/CMPE295-TeamEquality/RQAR Code/2. Data Cleaning/submissions.csv'
submissions_df = pd.read_csv(path)
submissions_df = submissions_df.drop_duplicates()

## Clean/Fetch API

In [None]:
def normalize_text(text):
        toko_tokenizer = ToktokTokenizer()
        wordnet_lemmatizer = WordNetLemmatizer()
        puncts = ['/', ',', '.', '"', ':', ')', '(', '-', '!', '?', '|', ';', '$', '&', '/', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•',  '~', '@', '£', 
         '·', '_', '{', '}', '©', '^', '®', '`',  '<', '→', '°', '€', '™', '›',  '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', 
         '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', 
         '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', 
         '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√', ]

        def clean_text(text):
            text = str(text)
            text = text.replace('\n', '')
            text = text.replace('\r', '')
            text = text.replace('/', ' ')
            for punct in puncts:
                if punct in text:
                    text = text.replace(punct, '')
            return text.lower()

        def remove_duplicates(text):
            text = text.split(" ")
            for i in range(0, len(text)):
                text[i] = "".join(text[i])
            UniqW = Counter(text)
            text = " ".join(UniqW.keys())
            return text

        def clean_numbers(text):
            if bool(re.search(r'\d', text)):
                text = re.sub('[0-9]{5,}', '#####', text)
                text = re.sub('[0-9]{4}', '####', text)
                text = re.sub('[0-9]{3}', '###', text)
                text = re.sub('[0-9]{2}', '##', text)
            return text

        contraction_dict = {"ain't": "is not", "aren't": "are not","can't": "cannot", "'cause": "because", "could've": "could have", "couldn't": "could not", "didn't": "did not",  "doesn't": "does not", "don't": "do not", "hadn't": "had not", "hasn't": "has not", "haven't": "have not", "he'd": "he would","he'll": "he will", "he's": "he is", "how'd": "how did", "how'd'y": "how do you", "how'll": "how will", "how's": "how is",  "I'd": "I would", "I'd've": "I would have", "I'll": "I will", "I'll've": "I will have","I'm": "I am", "I've": "I have", "i'd": "i would", "i'd've": "i would have", "i'll": "i will",  "i'll've": "i will have","i'm": "i am", "i've": "i have", "isn't": "is not", "it'd": "it would", "it'd've": "it would have", "it'll": "it will", "it'll've": "it will have","it's": "it is", "let's": "let us", "ma'am": "madam", "mayn't": "may not", "might've": "might have","mightn't": "might not","mightn't've": "might not have", "must've": "must have", "mustn't": "must not", "mustn't've": "must not have", "needn't": "need not", "needn't've": "need not have","o'clock": "of the clock", "oughtn't": "ought not", "oughtn't've": "ought not have", "shan't": "shall not", "sha'n't": "shall not", "shan't've": "shall not have", "she'd": "she would", "she'd've": "she would have", "she'll": "she will", "she'll've": "she will have", "she's": "she is", "should've": "should have", "shouldn't": "should not", "shouldn't've": "should not have", "so've": "so have","so's": "so as", "this's": "this is","that'd": "that would", "that'd've": "that would have", "that's": "that is", "there'd": "there would", "there'd've": "there would have", "there's": "there is", "here's": "here is","they'd": "they would", "they'd've": "they would have", "they'll": "they will", "they'll've": "they will have", "they're": "they are", "they've": "they have", "to've": "to have", "wasn't": "was not", "we'd": "we would", "we'd've": "we would have", "we'll": "we will", "we'll've": "we will have", "we're": "we are", "we've": "we have", "weren't": "were not", "what'll": "what will", "what'll've": "what will have", "what're": "what are",  "what's": "what is", "what've": "what have", "when's": "when is", "when've": "when have", "where'd": "where did", "where's": "where is", "where've": "where have", "who'll": "who will", "who'll've": "who will have", "who's": "who is", "who've": "who have", "why's": "why is", "why've": "why have", "will've": "will have", "won't": "will not", "won't've": "will not have", "would've": "would have", "wouldn't": "would not", "wouldn't've": "would not have", "y'all": "you all", "y'all'd": "you all would","y'all'd've": "you all would have","y'all're": "you all are","y'all've": "you all have","you'd": "you would", "you'd've": "you would have", "you'll": "you will", "you'll've": "you will have", "you're": "you are", "you've": "you have"}

        def _get_contractions(contraction_dict):
            contraction_re = re.compile('(%s)' % '|'.join(contraction_dict.keys()))
            return contraction_dict, contraction_re

        contractions, contractions_re = _get_contractions(contraction_dict)

        def replace_contractions(text):
            def replace(match):
                return contractions[match.group(0)]
            return contractions_re.sub(replace, text)

        def remove_stopwords(text, is_lower_case=True):
            stop_words = stopwords.words('english')
            stop_words.extend(['subreddit', 'subreddits', 'reddit', 'sub', 'nan']) # Remove reddit related words
            stop_words.extend(['question','like', 'post', 'find', 'finding', 'help', 'want', 'look', 'ask', 'people', 'something', 'thing', 'community', 'talk']) # Remove helper words
            stop_words.extend(['http', 'com', 'ww']) # Remove link
            tokens = toko_tokenizer.tokenize(text)
            tokens = [token.strip() for token in tokens]
            if is_lower_case:
                filtered_tokens = [token for token in tokens if token not in stop_words]
            else:
                filtered_tokens = [token for token in tokens if token.lower() not in stop_words]
            filtered_text = ' '.join(filtered_tokens)    
            return filtered_text

        def lemmatizer(text):
            tokens = toko_tokenizer.tokenize(text)
            tokens = [token.strip() for token in tokens]
            tokens = [wordnet_lemmatizer.lemmatize(token) for token in tokens]
            return ' '.join(tokens)

        def trim_text(text):
            tokens = toko_tokenizer.tokenize(text)
            tokens = [token.strip() for token in tokens]
            return ' '.join(tokens)

        def remove_non_english(text):
            words = set(nltk.corpus.words.words())
            text = " ".join(w for w in nltk.wordpunct_tokenize(text) if w.lower() in words or not w.isalpha())
            return text

        text_norm = clean_text(text)
        text_norm = remove_duplicates(text_norm)
        text_norm = clean_numbers(text_norm)
        text_norm = replace_contractions(text_norm)
        text_norm = remove_stopwords(text_norm)
        text_norm = lemmatizer(text_norm)
        text_norm = trim_text(text_norm)
        text_norm = remove_non_english(text_norm)
        return text_norm

In [None]:
def subreddit_fetch(df):
    # Lowercase body text
    new_df = df.copy()
    new_df["body"] = new_df["body"].str.lower()

    # Drop comment if comment was written by a bot (ie AutoModerator)
    new_df = new_df[new_df.author != "AutoModerator"]

    # Drop comment if it does not contain r/...
    new_df = new_df[new_df["body"].str.contains("r/")]

    # Extract r/... answers
    new_df["suggested_subreddits"] = new_df["body"].str.extractall('(r/[a-z0-9]+)').groupby(level=0)[0].apply(set).apply(list)
    new_df = new_df[new_df["suggested_subreddits"].notnull()]
    new_df = new_df.explode("suggested_subreddits")

    # Drop duplicate recommendation in each question
    new_df = new_df.drop_duplicates(subset=['submission_id','suggested_subreddits'], keep='first').reset_index(drop=True)

    return new_df

### User Subreddit Answer

In [None]:
subred_df = subreddit_fetch(comments_df)

In [None]:
subred_df

Unnamed: 0,subreddit,id,submission_id,parent_id,body,author,permalink,upvotes,created_utc,suggested_subreddits
0,findareddit,giiua6s,ksr76x,t3_ksr76x,r/mathhelp,SequoiaBoi,https://www.reddit.com/r/findareddit/comments/...,2,1610094328,r/mathhelp
1,findareddit,gihrvbm,ksrhx2,t3_ksrhx2,"i wouldn’t say pointless, but r/isladeoro is a...",HomeBoundBinkie,https://www.reddit.com/r/findareddit/comments/...,1,1610069880,r/isladeoro
2,findareddit,gihosmw,ksrltv,t3_ksrltv,r/piracy\n\nread the megathread/wiki,concerned_citizen_3,https://www.reddit.com/r/findareddit/comments/...,1,1610068274,r/piracy
3,findareddit,gik9ihm,ksrrjl,t3_ksrrjl,r/healthover30,princesskeestrr,https://www.reddit.com/r/findareddit/comments/...,2,1610128022,r/healthover30
4,findareddit,gij6j0t,ksrrjl,t1_gii7k3a,r/nostupidquestions,Helix_One,https://www.reddit.com/r/findareddit/comments/...,3,1610106364,r/nostupidquestions
...,...,...,...,...,...,...,...,...,...,...
102195,findareddit,ggzqius,kk0ds6,t3_kk0ds6,"there is /r/fitness, but for whatever it's wor...",over_clox,https://www.reddit.com/r/findareddit/comments/...,2,1608909573,r/fitness
102196,findareddit,gh0f0ve,kk0qqv,t3_kk0qqv,r/iamverysmart might work,EskilPotet,https://www.reddit.com/r/findareddit/comments/...,1,1608926049,r/iamverysmart
102197,findareddit,ggzyhav,kk0sf0,t3_kk0sf0,r/tipofmytongue and r/namethatsong\n\nmake sur...,001Guy001,https://www.reddit.com/r/findareddit/comments/...,1,1608915278,r/tipofmytongue
102198,findareddit,ggzyhav,kk0sf0,t3_kk0sf0,r/tipofmytongue and r/namethatsong\n\nmake sur...,001Guy001,https://www.reddit.com/r/findareddit/comments/...,1,1608915278,r/namethatsong


### User Questions

In [None]:
# Combine title and selftext to get key vocab for user question
submissions_df['question_description']= submissions_df["title"].astype(str) +" "+ submissions_df["selftext"].astype(str)
submissions_df['cleaned_question_description'] = submissions_df['question_description'].apply(lambda x: normalize_text(x))

submissions_df

Unnamed: 0,subreddit,id,title,selftext,full_link,link_flair_text,upvotes,num_comments,created_utc,question_description,cleaned_question_description
0,findareddit,ksr76x,A sub to ask a more advanced real life math/3d...,At my place on employment we are making tunnel...,https://www.reddit.com/r/findareddit/comments/...,Unanswered,2,2,1610066796,A sub to ask a more advanced real life math/3d...,advanced real life math 3d geometry place empl...
1,findareddit,ksrhx2,A subreddit where people do basically pointles...,kind of like r/theydidthemath but for things b...,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,1,1610067779,A subreddit where people do basically pointles...,basically pointless thing pretty cool kind r b...
2,findareddit,ksrltv,Subreddit for ripping/bootleg/cracked mac prog...,,https://www.reddit.com/r/findareddit/comments/...,,1,4,1610068141,Subreddit for ripping/bootleg/cracked mac prog...,ripping bootleg cracked mac program
3,findareddit,ksrpse,A subreddit specifically to discuss possible c...,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,2,1610068510,A subreddit specifically to discuss possible c...,specifically discus possible covid symptom off...
4,findareddit,ksrrjl,Reddit for asking specific questions?,Like “why does x happen when I do y?” Or “is t...,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,7,1610068648,Reddit for asking specific questions? Like “wh...,specific question x happen thing normal
...,...,...,...,...,...,...,...,...,...,...,...
52034,findareddit,kk0qqv,"A place for posting absurd comments made by ""w...",,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,1,1608910191,"A place for posting absurd comments made by ""w...",place posting absurd comment made woke
52035,findareddit,kk0sf0,Subreddit where people help you to find music,Have this song going on around my head for a w...,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,2,1608910381,Subreddit where people help you to find music ...,music song going around head need
52036,findareddit,kk0z8i,Anti Christmas,Any reddits for people who don’t like Christma...,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,1,1608911121,Anti Christmas Any reddits for people who don’...,anti dont feel freak need relate someone thanks
52037,findareddit,kk1djz,Looking for a sub for general lung health. The...,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,3,1608912643,Looking for a sub for general lung health. The...,looking general lung health one keep lung heal...


In [None]:
# Drop incomplete question
submissions_df[submissions_df['cleaned_question_description'] == '']

Unnamed: 0,subreddit,id,title,selftext,full_link,link_flair_text,upvotes,num_comments,created_utc,question_description,cleaned_question_description
57,findareddit,kt5f6m,What is a subreddit to help you find websites?,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,1,1610121933,What is a subreddit to help you find websites?...,
435,findareddit,kobjxd,Any subreddit for online freakouts?,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,1,1609508993,Any subreddit for online freakouts? nan,
539,findareddit,kp62nw,A subreddit to talk about tv repairing,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,1,1609622612,A subreddit to talk about tv repairing nan,
745,findareddit,kpz6dg,Parlerwatch but for thedonald.,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,1,1609727670,Parlerwatch but for thedonald. nan,
1247,findareddit,l3p147,A findareddit for porn,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,3,1611450162,A findareddit for porn nan,
...,...,...,...,...,...,...,...,...,...,...,...
50881,findareddit,khi53a,Video-enabled subreddits?,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,1,1608559900,Video-enabled subreddits? nan,
50922,findareddit,kgi455,Is there a subreddit where i can just talk to ...,,https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,2,1608418156,Is there a subreddit where i can just talk to ...,
51279,findareddit,kg7d65,Subreddit for flamethrower?,,https://www.reddit.com/r/findareddit/comments/...,,2,6,1608382968,Subreddit for flamethrower? nan,
51360,findareddit,kf4xqs,Pro-suicide subreddit?,[deleted],https://www.reddit.com/r/findareddit/comments/...,Unanswered,1,2,1608235171,Pro-suicide subreddit? [deleted],


In [None]:
submissions_df = submissions_df[submissions_df['cleaned_question_description'] != '']

## QA Join

In [None]:
df_QA = submissions_df.merge(subred_df, left_on='id', right_on='submission_id', how='inner')
df_QA = df_QA[['id_x', 'title', 'cleaned_question_description', 'id_y', 'upvotes_y', 'suggested_subreddits']]
df_QA = df_QA.rename(columns={"id_x": "question_id", 
                      "title": "question",
                      "cleaned_question_description": "question_vocab", 
                      "id_y":"comment_id",
                      "upvotes_y":"comment_upvotes"})
df_QA = df_QA.sort_values(['question_id', 'comment_upvotes'], ascending=[True, False]) # Sort subreddits based on upvotes
df_QA = df_QA.drop_duplicates().reset_index(drop=True)
df_QA

Unnamed: 0,question_id,question,question_vocab,comment_id,comment_upvotes,suggested_subreddits
0,eib8be,Looking for a subreddit about redundancy (sort...,looking redundancy sort place good idealess wo...,fcp3r91,2,r/antiwork
1,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/kidscrafts
2,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/toddlers
3,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/kidsafevideos
4,eibbys,A subreddit where people tell stories in the c...,tell story comment word looking kind r instead...,fcq40v6,1,r/onewordeach
...,...,...,...,...,...,...
101577,o0oakk,A subreddit to make actual online friends?,make actual friend title,h1wc8ld,2,r/penpals
101578,o0oakk,A subreddit to make actual online friends?,make actual friend title,h1xrbe6,1,r/makenewfriendshere
101579,o0ox9h,What sub for questions about terms and conditions,question term condition video make directed di...,h21ibzt,1,r/legaladviceofftopic
101580,o0pibq,A sub where people can draw plans for my garden,draw plan garden landscape rubbish thinking id...,h1xwdcc,2,r/howto


Example question

In [None]:
df_QA_merged = df_QA.groupby(['question_id'])['suggested_subreddits'].sum().reset_index()
df_QA_merged[df_QA_merged['question_id'] == 'fqwr63']

Unnamed: 0,question_id,suggested_subreddits
9831,fqwr63,r/bulliesgetknockedoutr/bullybackfirer/pettyre...


In [None]:
df_QA[df_QA['question_id'] == 'fqwr63']

Unnamed: 0,question_id,question,question_vocab,comment_id,comment_upvotes,suggested_subreddits
24127,fqwr63,Looking for a sub reddit where bullies get des...,looking bully get,flslqjs,2,r/bulliesgetknockedout
24128,fqwr63,Looking for a sub reddit where bullies get des...,looking bully get,flsm0zk,2,r/bullybackfire
24129,fqwr63,Looking for a sub reddit where bullies get des...,looking bully get,fltykhx,1,r/pettyrevenge
24130,fqwr63,Looking for a sub reddit where bullies get des...,looking bully get,fltykhx,1,r/nuclearrevenge
24131,fqwr63,Looking for a sub reddit where bullies get des...,looking bully get,fltykhx,1,r/prorevenge
24132,fqwr63,Looking for a sub reddit where bullies get des...,looking bully get,flsqr38,1,r/thebullwins


### Subreddit Overview


In [None]:
subreddits = df_QA['suggested_subreddits'].unique()
print("Number of Unique Subreddits:", len(subreddits))

# Subreddits suggested more than once
popular_subreddit = df_QA[df_QA.duplicated(['suggested_subreddits'], keep=False)]
print("Number of Subreddits Recommended More than once:", len(popular_subreddit['suggested_subreddits'].unique()))

Number of Unique Subreddits: 17571
Number of Subreddits Recommended More than once: 7174


Example Subreddit for different questions

In [None]:
df_QA[df_QA['suggested_subreddits'] == 'r/antiwork']

Unnamed: 0,question_id,question,question_vocab,comment_id,comment_upvotes,suggested_subreddits
0,eib8be,Looking for a subreddit about redundancy (sort...,looking redundancy sort place good idealess wo...,fcp3r91,2,r/antiwork
910,ejelux,Where can I post picture about a bad work envi...,picture bad work environment picture know,fcx8zua,1,r/antiwork
3156,emesp1,"I'm lazy and I hate jobs, and I dreaded the id...",lazy hate job idea job long got one useless de...,fdojmd2,13,r/antiwork
18458,fc7gd7,Is there a sub where I can complain about my job?,complain job,fj9ge6q,2,r/antiwork
19450,febh14,Subreddit that makes fun of job postings that ...,make fun job posting ton qualification low pay...,fjn2jzc,15,r/antiwork
21904,fkrmzc,A subreddit where people ask help with / share...,share success story fighting large corporation...,fkup0u6,2,r/antiwork
40073,gy6sc5,Is there a subreddit about people who protest ...,protest mindless culture live r neutral standp...,ft8rn5w,3,r/antiwork
43198,hf1h1k,Any subreddits dedicated to fucking off and st...,starting new life leave country drop sub could...,fvwrhlr,2,r/antiwork
44577,hjj2ka,A sub where I can complain about my shitty job?,complain job,fwnahrn,1,r/antiwork
46753,hqhuut,Is there a reddit for people who take COVID-19...,take covid ## seriously need vent quarantine s...,fxza2ws,3,r/antiwork


### Export full dataset

In [None]:
path = '/content/drive/Shareddrives/CMPE295-TeamEquality/RQAR Code/2. Data Cleaning/data/full_QA_pair.csv'
df_QA.to_csv(path, index=False)

### Train Test Split

In [None]:
# Keeping subreddit with the most upvotes
# df_QA_subset = df_QA.drop_duplicates(subset=['question_id'], keep='first').reset_index(drop=True)
# df_QA_subset

Unnamed: 0,question_id,question,question_vocab,comment_id,comment_upvotes,suggested_subreddits
0,eib8be,Looking for a subreddit about redundancy (sort...,looking redundancy sort place good idealess wo...,fcp3r91,2,r/antiwork
1,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/kidscrafts
2,eibbys,A subreddit where people tell stories in the c...,tell story comment word looking kind r instead...,fcq40v6,1,r/onewordeach
3,eibcq9,I need a subreddit where I can ask legal quest...,need legal question regarding tech general one...,fcq3vhk,1,r/entrepreneur
4,eibd7q,Is there a subreddit for finding people to pla...,finding play game stuff,fcq3qfg,1,r/gamerpals
...,...,...,...,...,...,...
42326,o0na8i,A sub to answer my questions about customizing...,answer question jacket looking forward getting...,h1wg35a,22,r/battlejackets
42327,o0o8q8,"A sub for creative writing, lyrics, poetry?",creative writing lyric poetry,h1y1jk5,2,r/writingprompts
42328,o0oakk,A subreddit to make actual online friends?,make actual friend title,h1wc8ld,2,r/penpals
42329,o0ox9h,What sub for questions about terms and conditions,question term condition video make directed di...,h21ibzt,1,r/legaladviceofftopic


In [None]:
# from sklearn.model_selection import train_test_split
# df_QA_train, df_QA_test = train_test_split(df_QA_subset, test_size=0.2, random_state=0)

# path = '/content/drive/Shareddrives/CMPE295-TeamEquality/RQAR Code/2. Data Cleaning/data/train_QA_pair.csv'
# df_QA_train.to_csv(path, index=False)

# path = '/content/drive/Shareddrives/CMPE295-TeamEquality/RQAR Code/2. Data Cleaning/data/test_QA_pair.csv'
# df_QA_test.to_csv(path, index=False)

# Full Dataframe

In [None]:
path = '/content/drive/Shareddrives/CMPE295-TeamEquality/RQAR Code/2. Data Cleaning/data/full_QA_pair.csv'
full_df_QA = pd.read_csv(path)
full_df_QA

Unnamed: 0,question_id,question,question_vocab,comment_id,comment_upvotes,suggested_subreddits
0,eib8be,Looking for a subreddit about redundancy (sort...,looking redundancy sort place good idealess wo...,fcp3r91,2,r/antiwork
1,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/kidscrafts
2,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/toddlers
3,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/kidsafevideos
4,eibbys,A subreddit where people tell stories in the c...,tell story comment word looking kind r instead...,fcq40v6,1,r/onewordeach
...,...,...,...,...,...,...
101577,o0oakk,A subreddit to make actual online friends?,make actual friend title,h1wc8ld,2,r/penpals
101578,o0oakk,A subreddit to make actual online friends?,make actual friend title,h1xrbe6,1,r/makenewfriendshere
101579,o0ox9h,What sub for questions about terms and conditions,question term condition video make directed di...,h21ibzt,1,r/legaladviceofftopic
101580,o0pibq,A sub where people can draw plans for my garden,draw plan garden landscape rubbish thinking id...,h1xwdcc,2,r/howto


In [None]:
import random

question_id = full_df_QA['question_id'].tolist()
question_id = list(np.unique(question_id))

test_size = 0.2
test_question_id = random.sample(question_id, round(len(question_id)*test_size))
train_question_id = list(set(question_id) - set(test_question_id))

df_QA_train = full_df_QA[full_df_QA['question_id'].isin(train_question_id)].reset_index(drop=True)
df_QA_test = full_df_QA[full_df_QA['question_id'].isin(test_question_id)].reset_index(drop=True)

Training size: 80552
Validation size: 21030


In [None]:
print("Training size:", len(train_question_id))
print("Testing size:", len(test_question_id))

Training size: 33865
Validation size: 8466


In [None]:
df_QA_train

Unnamed: 0,question_id,question,question_vocab,comment_id,comment_upvotes,suggested_subreddits
0,eib8be,Looking for a subreddit about redundancy (sort...,looking redundancy sort place good idealess wo...,fcp3r91,2,r/antiwork
1,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/kidscrafts
2,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/toddlers
3,eibax6,ideas for playing/teaching toddlers,idea teaching toddler 3 year old brother alway...,fcq4bew,1,r/kidsafevideos
4,eibcq9,I need a subreddit where I can ask legal quest...,need legal question regarding tech general one...,fcq3vhk,1,r/entrepreneur
...,...,...,...,...,...,...
80547,o0oakk,A subreddit to make actual online friends?,make actual friend title,h1wc8ld,2,r/penpals
80548,o0oakk,A subreddit to make actual online friends?,make actual friend title,h1xrbe6,1,r/makenewfriendshere
80549,o0ox9h,What sub for questions about terms and conditions,question term condition video make directed di...,h21ibzt,1,r/legaladviceofftopic
80550,o0pibq,A sub where people can draw plans for my garden,draw plan garden landscape rubbish thinking id...,h1xwdcc,2,r/howto


In [None]:
df_QA_test

Unnamed: 0,question_id,question,question_vocab,comment_id,comment_upvotes,suggested_subreddits
0,eibbys,A subreddit where people tell stories in the c...,tell story comment word looking kind r instead...,fcq40v6,1,r/onewordeach
1,eibd7q,Is there a subreddit for finding people to pla...,finding play game stuff,fcq3qfg,1,r/gamerpals
2,eibd7q,Is there a subreddit for finding people to pla...,finding play game stuff,fcq3qfg,1,r/playdate
3,eibd7q,Is there a subreddit for finding people to pla...,finding play game stuff,fcq3qfg,1,r/lfg
4,eibfq2,Subreddit for being the one without a child,one without child f looking experience couple ...,fcoqos0,4,r/childfree
...,...,...,...,...,...,...
21025,o0ht2p,A subreddit for stoned thoughts?,stoned thought shower tend make sense least in...,h1vbdja,7,r/stonerengineering
21026,o0ht2p,A subreddit for stoned thoughts?,stoned thought shower tend make sense least in...,h1vbdja,7,r/highdeas
21027,o0ht2p,A subreddit for stoned thoughts?,stoned thought shower tend make sense least in...,h1vr7w5,3,r/stonerphilosophy
21028,o0ht2p,A subreddit for stoned thoughts?,stoned thought shower tend make sense least in...,h1v15cr,1,r/highideas


In [None]:
path = '/content/drive/Shareddrives/CMPE295-TeamEquality/RQAR Code/2. Data Cleaning/data/train_QA_pair.csv'
df_QA_train.to_csv(path, index=False)

path = '/content/drive/Shareddrives/CMPE295-TeamEquality/RQAR Code/2. Data Cleaning/data/test_QA_pair.csv'
df_QA_test.to_csv(path, index=False)