In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from jupyterthemes import jtplot
jtplot.style()

In [3]:
# extract the data
df = pd.read_csv('https://query.data.world/s/htrdsouy327xqa4w457qx6k6sjtj6r')

In [4]:
# examine the data
df.head()

Unnamed: 0,type,id,subreddit.id,subreddit.name,subreddit.nsfw,created_utc,permalink,domain,url,selftext,title,score
0,post,ftbp1i,2qh72,jokes,False,1585785543,https://old.reddit.com/r/Jokes/comments/ftbp1i...,self.jokes,,My corona is covered with foreskin so it is no...,I am soooo glad I'm not circumcised!,2
1,post,ftboup,2qh72,jokes,False,1585785522,https://old.reddit.com/r/Jokes/comments/ftboup...,self.jokes,,It's called Google Sheets.,Did you know Google now has a platform for rec...,9
2,post,ftbopj,2qh72,jokes,False,1585785508,https://old.reddit.com/r/Jokes/comments/ftbopj...,self.jokes,,The vacuum doesn't snore after sex.\n\n&amp;#x...,What is the difference between my wife and my ...,15
3,post,ftbnxh,2qh72,jokes,False,1585785428,https://old.reddit.com/r/Jokes/comments/ftbnxh...,self.jokes,,[removed],My last joke for now.,9
4,post,ftbjpg,2qh72,jokes,False,1585785009,https://old.reddit.com/r/Jokes/comments/ftbjpg...,self.jokes,,[removed],The Nintendo 64 turns 18 this week...,134


In [5]:
df.shape

(1000000, 12)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000000 entries, 0 to 999999
Data columns (total 12 columns):
 #   Column          Non-Null Count    Dtype 
---  ------          --------------    ----- 
 0   type            1000000 non-null  object
 1   id              1000000 non-null  object
 2   subreddit.id    1000000 non-null  object
 3   subreddit.name  1000000 non-null  object
 4   subreddit.nsfw  1000000 non-null  bool  
 5   created_utc     1000000 non-null  int64 
 6   permalink       1000000 non-null  object
 7   domain          1000000 non-null  object
 8   url             4472 non-null     object
 9   selftext        995525 non-null   object
 10  title           1000000 non-null  object
 11  score           1000000 non-null  int64 
dtypes: bool(1), int64(2), object(9)
memory usage: 84.9+ MB


In [40]:
# boolean masks

# where selftext isnull
no_punchline = df['selftext'].isnull()
# where title or post were removed by reddit user/mod
removed = (df['selftext'] == '[removed]') | (df['title'] == '[removed]')
# where tile or post were deleted by the original poster
deleted = (df['selftext'] == '[deleted]') | (df['title'] == '[deleted]')
# where upvotes is 0 (reddit automatically provides user 1 upvote)
at_0 = df['score'] == 0
# where upvotes is 1
at_1 = df['score'] == 1
# where upvotes > 1 less than or equal to 10
over_1_to_10 = (df['score'] > 1) & (df['score'] <= 10)
# where upvotes >10 but less than or equal to 50
over_10_to_50 = (df['score'] > 10) & (df['score'] <= 50)
# where joke has at least fifty upvotes but less than or equal to 500
over_50_to_500 = (df['score'] > 50) & (df['score'] <= 500)
# where joke has at least 500 upvotes but less than or equal to 1000
over_500_to_1000 = (df['score'] > 500) & (df['score'] <= 1000)
# where joke has more than 1000 or but less than or equal to 9000
over_1000_to_9000 = (df['score'] > 1000) & (df['score'] <= 9000)
# joke is over 9000
over_9000 = df['score'] > 9000

In [8]:
# get entries that aren't removed or deleted
df_cleaned = df[~removed][~deleted]
df_cleaned.shape

  df_cleaned = df[~removed][~deleted]


(578634, 12)

In [37]:
# making unix created_times into readable timestamps
def make_tz_timestamp(unix_ts, tz='US/Eastern'):
    from datetime import datetime
    import pytz
    timezone = pytz.timezone(tz)
    # convert unix utc to timestamp utc
    ts = datetime.utcfromtimestamp(unix_ts)
    # return is to that it is tz-aware
    #return timezone.localize(ts)
    return ts

In [38]:
df_cleaned['created_at'] = df_cleaned['created_utc'].apply(lambda x: make_tz_timestamp(x))

In [39]:
# check time span of jokes
start_date, end_date = min(df_cleaned['created_at']), max(df_cleaned['created_at'])
print(start_date)
print(end_date)
elapsed_time = end_date - start_date
print('time elapsed: {}'.format(elapsed_time))

2015-03-26 19:05:21
2020-04-01 23:59:03
time elapsed: 1833 days 04:53:42


In [43]:
# filter dfs and check their sizes
bad_jokes_df = df_cleaned[at_0]
print('bad: ', len(bad_jokes_df))
ignored_jokes_df = df_cleaned[at_1]
print('ignored: ', len(ignored_jokes_df))
poor_jokes_df = df_cleaned[over_1_to_10]
print('poor: ', len(poor_jokes_df))
mediocre_jokes_df = df_cleaned[over_10_to_50]
print('mediocre: ', len(mediocre_jokes_df))
average_jokes_df = df_cleaned[over_50_to_500]
print('average: ', len(average_jokes_df))
good_jokes_df = df_cleaned[over_500_to_1000]
print('good: ', len(good_jokes_df))
great_jokes_df = df_cleaned[over_1000_to_9000]
print('great: ', len(great_jokes_df))
best_jokes_df = df_cleaned[over_9000]
print('best: ', len(best_jokes_df))

  bad_jokes_df = df_cleaned[at_0]
  ignored_jokes_df = df_cleaned[at_1]
  poor_jokes_df = df_cleaned[over_1_to_10]


bad:  117358
ignored:  57405
poor:  219853


  mediocre_jokes_df = df_cleaned[over_10_to_50]
  average_jokes_df = df_cleaned[over_50_to_500]
  good_jokes_df = df_cleaned[over_500_to_1000]


mediocre:  107003
average:  57381
good:  6786
great:  9117
best:  3731


  great_jokes_df = df_cleaned[over_1000_to_9000]
  best_jokes_df = df_cleaned[over_9000]


In [42]:
len(bad_jokes_df)

117358

In [None]:
len()