In [1]:
import pandas as pd
from datetime import datetime
import numpy as np

In [2]:
# Change ConvoKit's .json format to .csv for faster load-in
#askmen_df = pd.read_json('.../convokit/conversations.json').T
#askwomen_df = pd.read_json('.../convokit/askwomen_conversations.json').T
#askmen_df.to_csv("askmen_convokit.csv", index=False)
#askwomen_df.to_csv("askwomen_convokit.csv", index=False)

In [3]:
# SQL query used to pull the r/AskMen posts from the fh-bigquery reddit_posts tables on Google BigQuery
#SELECT title, num_comments, selftext, created_utc, domain, subreddit, stickied, score, ups, downs, permalink
#FROM `fh-bigquery.reddit_posts.20*` 
#WHERE lower(subreddit)="askmen"
#ORDER BY created_utc;

In [4]:
# ConvoKit (ck) exports
ck_askmen = pd.read_csv('askmen_convokit.csv')
ck_askwomen = pd.read_csv('askwomen_convokit.csv')

# BigQuery (bq) exports; 'created_utc' is renamed to 'timestamp' on load so
# the column name matches the one used by the ck frames
bq_askmen = pd.read_csv('fh_bq_askmen.csv').rename(columns={'created_utc': 'timestamp'})
bq_askwomen = pd.read_csv('fh_bq_askwomen.csv').rename(columns={'created_utc': 'timestamp'})

In [5]:
df_names = ['ck_askmen', 'ck_askwomen', 'bq_askmen', 'bq_askwomen']
dfs = [ck_askmen, ck_askwomen, bq_askmen, bq_askwomen]
# report the row count of every raw frame next to its label
for name, frame in zip(df_names, dfs):
    print(name, "length:", len(frame))

ck_askmen length: 318805
ck_askwomen length: 306189
bq_askmen length: 318890
bq_askwomen length: 252547


In [6]:
# preview the first two rows of the ConvoKit and BigQuery r/AskMen frames
display(ck_askmen.head(2), bq_askmen.head(2))

Unnamed: 0,title,num_comments,domain,timestamp,subreddit,gilded,stickied,author_flair_text
0,is being a gamer girl a turn off?,7,self.AskMen,1325461225,AskMen,-1,False,
1,"of reddit, would you rather date a nerdy ""ok"" ...",0,self.AskMen,1325439243,AskMen,-1,False,


Unnamed: 0,title,num_comments,selftext,timestamp,domain,subreddit,stickied,score,ups,downs,permalink
0,Do I send him a gift?,1,[removed],1448928617,self.AskMen,AskMen,False,1,1.0,0.0,/r/AskMen/comments/3uxdhc/do_i_send_him_a_gift/
1,What's on your bucket list?,50,"I'm 20, and have decided that since I don't ha...",1448928624,self.AskMen,AskMen,False,14,14.0,0.0,/r/AskMen/comments/3uxdhu/whats_on_your_bucket...


In [7]:
# get time range of datasets
# Labels and frames are paired with zip() so the printout cannot drift out of
# step with a hand-maintained counter.
for name, df in zip(df_names, [ck_askmen, ck_askwomen, bq_askmen, bq_askwomen]):
    # Vectorised min/max instead of iterating the whole column in Python;
    # cast to plain int so the tuple prints as bare numbers.
    earliest = int(df['timestamp'].min())
    latest = int(df['timestamp'].max())
    print(name, "time range:", (earliest, latest))
    # The timestamps are epoch seconds; show both bounds as UTC calendar dates.
    # pd.to_datetime replaces datetime.utcfromtimestamp, which is deprecated.
    print(pd.to_datetime(earliest, unit='s').strftime('%Y-%m-%d'),
          pd.to_datetime(latest, unit='s').strftime('%Y-%m-%d'))

ck_askmen time range: (1283149520, 1541029966)
2010-08-30 2018-10-31
ck_askwomen time range: (1279330223, 1541030167)
2010-07-17 2018-10-31
bq_askmen time range: (1448928617, 1567295869)
2015-12-01 2019-08-31
bq_askwomen time range: (1448928517, 1567295937)
2015-12-01 2019-08-31


In [8]:
# filter convokit data to the period before the bq data starts; drop unneeded columns
# The cut-off for each subreddit is the earliest BigQuery timestamp (1448928617
# for AskMen and 1448928517 for AskWomen in the run shown above), derived from
# the data rather than hard-coded.
bq_askmen_start = bq_askmen['timestamp'].min()
bq_askwomen_start = bq_askwomen['timestamp'].min()

# .drop() without inplace=True returns a new frame, so the filtered slice is
# never mutated and pandas does not raise SettingWithCopyWarning.
unneeded_cols = ['author_flair_text', 'gilded']
ck_askmen_cut = ck_askmen[ck_askmen['timestamp'] < bq_askmen_start].drop(columns=unneeded_cols)
ck_askwomen_cut = ck_askwomen[ck_askwomen['timestamp'] < bq_askwomen_start].drop(columns=unneeded_cols)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [9]:
# compare the trimmed ConvoKit frame with the BigQuery frame before concatenating
display(ck_askmen_cut.head(2), bq_askmen.head(2))

Unnamed: 0,title,num_comments,domain,timestamp,subreddit,stickied
0,is being a gamer girl a turn off?,7,self.AskMen,1325461225,AskMen,False
1,"of reddit, would you rather date a nerdy ""ok"" ...",0,self.AskMen,1325439243,AskMen,False


Unnamed: 0,title,num_comments,selftext,timestamp,domain,subreddit,stickied,score,ups,downs,permalink
0,Do I send him a gift?,1,[removed],1448928617,self.AskMen,AskMen,False,1,1.0,0.0,/r/AskMen/comments/3uxdhc/do_i_send_him_a_gift/
1,What's on your bucket list?,50,"I'm 20, and have decided that since I don't ha...",1448928624,self.AskMen,AskMen,False,14,14.0,0.0,/r/AskMen/comments/3uxdhu/whats_on_your_bucket...


In [10]:
# concat the datasets to get final r/ask... dataset
askmen_df = pd.concat([ck_askmen_cut, bq_askmen], ignore_index=True)
askwomen_df = pd.concat([ck_askwomen_cut, bq_askwomen], ignore_index=True)
# Add a 'datetime' column holding the UTC time of each post as a
# 'YYYY-MM-DD HH:MM:SS' string (a formatted string, not a datetime object).
# The conversion is vectorised instead of calling the deprecated
# datetime.utcfromtimestamp once per row; astype('int64') mirrors the int(x)
# cast of the row-wise version.
for frame in (askmen_df, askwomen_df):
    epoch_seconds = frame['timestamp'].astype('int64')
    frame['datetime'] = pd.to_datetime(epoch_seconds, unit='s').dt.strftime('%Y-%m-%d %H:%M:%S')

In [11]:
# write both merged frames to disk (row index omitted)
for frame, out_path in ((askmen_df, "askmen.csv"), (askwomen_df, "askwomen.csv")):
    frame.to_csv(out_path, index=False)

## Cleaning

In [12]:
# Reload the merged datasets written above.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk, which avoids the mixed-type DtypeWarning that the
# default chunked parser raises on these files.
askmen_df = pd.read_csv('askmen.csv', low_memory=False)
askwomen_df = pd.read_csv('askwomen.csv', low_memory=False)

  exec(code_obj, self.user_global_ns, self.user_ns)


In [13]:
# restrict each frame to rows whose 'domain' is the subreddit's own self.* value
askmen_df = askmen_df.loc[askmen_df['domain'].eq('self.AskMen')]
askwomen_df = askwomen_df.loc[askwomen_df['domain'].eq('self.AskWomen')]

In [14]:
# list the titles of all stickied r/AskMen posts, one per line
for title in askmen_df.loc[askmen_df['stickied'] == True, 'title']:
    print(title)

Net Neutrality Bullshit and Why It Matters
MOD POST: Asking Questions 101
Men of AskMen, when was the last time you self checked for testicular cancer?
Halloween Costume Thread
Do you care about net neutrality? Why or why not?
WHAT DO I WANT FOR CHRISTMAS? NO GIFT POSTS
What are your New Year's Resolutions
How do you plan to fuck up your 2018 before it begins?
REMINDER: Please do a search, read the rules, and check out the FAQs before posting
FAQ Friday: Do you think you’re attractive/unattractive?
Mod Post: Reminder to Report Content You Think Doesn't Belong Here
MOD POST: Self-pitying validation comments will result in a temp ban
FAQ Week: How Do I Make Friends?
FAQ Week: How did you know someone was "The One", and how did you move on from them if it didn't work out?
FAQ Week: How do I approach a guy/let him know I'm into him/how do I know if a guy is into me?
FAQ Week: What quality is attractive/what small thing turns you on/what makes you feel sexy?


In [15]:
# list the titles of all stickied r/AskWomen posts, one per line
for title in askwomen_df.loc[askwomen_df['stickied'] == True, 'title']:
    print(title)

omg THE PROM! Megathread
[Mod post] A note to our users in regards to our policy on inclusive question phrasing (and majority demographics)
�� Halloween/Dia de Muertos MEGATHREAD ��
[Megathread]What's your New Year's Resolution?
The PROM megathread!
Mother's Day Megathread
�� 2017 Halloween Megathread ��
It is once again the time of year for the AskWomen Winter Holiday Megathread!
2017 Best of Ask Women Nominations
Valentines Day Mega thread! Check in here for all things gifts, food, and plans for Valentine's Day.
�� October Holiday Megathread: Your center for all discussions about Halloween, Dia De Muertos, and any other holidays you will be celebrating in the month of October ��
Community Survey Reminder
Last Chance to Participate in the AskWomen Community Survey! Closing Soon!
Whats your favorite podcast/what podcasts are you listening to?
Casual Convo Fridays.


In [21]:
# filter out off-domain posts, moderator posts, megathreads and posts with too few comments
def remove_spam_and_mod_posts(df, domain, min_comments=3):
    """Return a filtered copy of ``df`` without mod posts, megathreads and low-engagement posts.

    The filters run in this order, and the remaining row count is printed
    after each stage:

    1. keep rows whose 'domain' equals ``domain``;
    2. drop rows whose lower-cased title contains 'reminder', 'mod' or
       'survey', then drop stickied rows whose title has no '?';
    3. drop rows whose lower-cased title contains 'megathread';
    4. keep rows with strictly more than ``min_comments`` comments.

    Rows with a missing title never match a title pattern, so they are only
    removed by the stickied, domain or comment-count rules.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'domain', 'title', 'stickied' and
        'num_comments'. It is not modified.
    domain : str
        Value the 'domain' column must equal, e.g. 'self.AskMen'.
    min_comments : int, default 3
        Rows are kept only when ``num_comments > min_comments``.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the surviving rows (original index preserved),
        safe to modify without affecting ``df``.
    """
    total = len(df)
    print("starting:", total)

    # filter 'domain' column to only be self.AskWomen or self.AskMen
    new_df = df[df['domain'] == domain]
    print('after domain filter:', len(new_df))

    # remove mod posts
    # NOTE(review): this is a substring match, so 'mod' also hits titles such as
    # 'modern' or 'model' -- confirm that this is intended.
    keywords = ['reminder', 'mod', 'survey']
    # na=False keeps the mask strictly boolean when a title is missing;
    # otherwise `~` fails on the resulting NaN entries.
    is_mod_title = new_df['title'].str.lower().str.contains('|'.join(keywords), na=False)
    new_df = new_df[~is_mod_title]
    # pinned questionless posts are mods; '?' is matched literally (regex=False)
    has_question = new_df['title'].str.contains('?', regex=False, na=False)
    new_df = new_df[has_question | (new_df['stickied'] == False)]
    print('after mod filter:', len(new_df))

    # remove megathreads
    is_megathread = new_df['title'].str.lower().str.contains('megathread', na=False)
    new_df = new_df[~is_megathread]
    print('after megathread filter:', len(new_df))

    # remove low-engagement posts
    new_df = new_df[new_df['num_comments'] > min_comments]
    print('after engagement filter:', len(new_df))

    removed = total - len(new_df)
    # guard against an empty input frame, which would otherwise divide by zero
    removed_share = removed / total if total else 0.0
    print("Total: Removed", removed, "posts out of", total,
          "or", removed_share)
    # hand back a real copy so callers can mutate the result without
    # triggering SettingWithCopyWarning
    return new_df.copy()


In [22]:
# run the cleaning filters on r/AskMen and preview the first rows
askmen = remove_spam_and_mod_posts(df=askmen_df, domain='self.AskMen')
askmen.head()

starting: 435089
after domain filter: 435089
after mod filter: 433114
after megathread filter: 433103
after engagement filter: 229752
Total: Removed 205337 posts out of 435089 or 0.47194252210467286


Unnamed: 0,title,num_comments,domain,timestamp,subreddit,stickied,selftext,score,ups,downs,permalink,datetime
0,is being a gamer girl a turn off?,7,self.AskMen,1325461225,AskMen,False,,,,,,2012-01-01 23:40:25
3,Would you date a young single mom?,70,self.AskMen,1325482277,AskMen,False,,,,,,2012-01-02 05:31:17
4,Spitting. Why?,60,self.AskMen,1325474315,AskMen,False,,,,,,2012-01-02 03:18:35
5,"Another weight question for y'all, please!",23,self.AskMen,1325614638,AskMen,False,,,,,,2012-01-03 18:17:18
6,Any former incels or love-shy guys have some a...,10,self.AskMen,1325550588,AskMen,False,,,,,,2012-01-03 00:29:48


In [29]:
# Share of all r/AskMen posts removed by the engagement filter alone:
# (rows after megathread filter - rows after engagement filter) / starting rows.
# The three counts are copied by hand from the remove_spam_and_mod_posts printout.
(433103 - 229752)/435089

0.4673779387665512

In [30]:
# Share of all r/AskWomen posts removed by the engagement filter alone:
# (rows after megathread filter - rows after engagement filter) / starting rows.
# The three counts are copied by hand from the remove_spam_and_mod_posts printout.
(389361 - 211826) / 391113

0.45392252366963

In [25]:
# run the cleaning filters on r/AskWomen and preview the first rows
askwomen = remove_spam_and_mod_posts(df=askwomen_df, domain='self.AskWomen')
askwomen.head()

starting: 391113
after domain filter: 391113
after mod filter: 389386
after megathread filter: 389361
after engagement filter: 211826
Total: Removed 179287 posts out of 391113 or 0.45840204749011154


Unnamed: 0,title,num_comments,domain,timestamp,subreddit,stickied,selftext,score,ups,downs,permalink,datetime
1,Women of Reddit: What do you find is the most...,51,self.AskWomen,1279330346,AskWomen,False,,,,,,2010-07-17 01:32:26
2,"Dear Women of Reddit, why when you do an AMA d...",30,self.AskWomen,1279357183,AskWomen,False,,,,,,2010-07-17 08:59:43
3,Askwomen: What do you think when you're lookin...,53,self.AskWomen,1279361828,AskWomen,False,,,,,,2010-07-17 10:17:08
4,"Dear ladies, How wet is it normally down there?",30,self.AskWomen,1279371894,AskWomen,False,,,,,,2010-07-17 13:04:54
5,"Do you dress to impress other women, men, or j...",25,self.AskWomen,1279575306,AskWomen,False,,,,,,2010-07-19 21:35:06


In [37]:
# row counts of the cleaned r/AskMen and r/AskWomen frames
askmen.shape[0], askwomen.shape[0]

(229752, 211826)

In [16]:
def replace_url(df, col, replacement=''):
    """Substitute every http/https/ftp URL found in the text column ``col``.

    With the default ``replacement=''`` the URLs are removed from the text.
    Pass ``replacement='URL'`` to leave a placeholder token instead.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column. It is modified in place.
    col : str
        Name of a string column of ``df``. Missing values stay missing.
    replacement : str, default ''
        Text written in place of each matched URL. It is passed to a regex
        substitution, so backslash sequences are interpreted.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in, with ``col`` overwritten.
    """
    url_pattern = r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
    df[col] = df[col].str.replace(url_pattern, replacement, regex=True)
    return df

In [38]:
# Drop the columns that are not kept in the cleaned output.
# The frames are reassigned instead of using inplace=True, so the filtered
# frames are never mutated (no SettingWithCopyWarning). errors='ignore' lets
# the cell be re-run after the columns are already gone.
cols_to_drop = ['domain', 'subreddit', 'stickied', 'permalink']
askmen = askmen.drop(columns=cols_to_drop, errors='ignore')
askwomen = askwomen.drop(columns=cols_to_drop, errors='ignore')

In [39]:
# inspect the cleaned r/AskMen frame after the column drop
askmen

Unnamed: 0,title,num_comments,timestamp,selftext,score,ups,downs,datetime
0,is being a gamer girl a turn off?,7,1325461225,,,,,2012-01-01 23:40:25
3,Would you date a young single mom?,70,1325482277,,,,,2012-01-02 05:31:17
4,Spitting. Why?,60,1325474315,,,,,2012-01-02 03:18:35
5,"Another weight question for y'all, please!",23,1325614638,,,,,2012-01-03 18:17:18
6,Any former incels or love-shy guys have some a...,10,1325550588,,,,,2012-01-03 00:29:48
...,...,...,...,...,...,...,...,...
435464,Is it common for male friends to watch porn to...,40,1567292102,,0.0,,,2019-08-31 22:55:02
435467,How do you make your girl scream in pleasure w...,16,1567292425,[deleted],0.0,,,2019-08-31 23:00:25
435477,What is your go to place of solitude where you...,35,1567294483,,9.0,,,2019-08-31 23:34:43
435478,What do you do to make your girl moan in pleas...,38,1567294512,,4.0,,,2019-08-31 23:35:12


In [40]:
# persist both cleaned frames as csv files (row index omitted) for faster load in
for frame, out_path in ((askmen, "cleaned_askmen.csv"), (askwomen, "cleaned_askwomen.csv")):
    frame.to_csv(out_path, index=False)