# Libraries and settings

In [4]:
import pandas as pd
import datetime as dt
import time

#for mySQL connection
import mysql
import mysql.connector
import pymysql
from sqlalchemy import create_engine

#import pushshift API to get comments from Reddit
from pmaw import PushshiftAPI

#for envorinment variables
import os

#regex for removing links
import re

#for nltk, removing everything but letters
from nltk.tokenize import word_tokenize

#for detecting gibberish(non-english words)
from gibberish_detector import detector
Detector = detector.create_from_model('Gibberish-Detector\gibberish-detector.model')

#for removing contractions (e.g. don't -> do not)
import contractions

#for removal of accents
import unicodedata

#for spacy/lemmatization
import spacy
nlp = spacy.load('en_core_web_sm', disable = ['parser','ner'])


In [5]:
#load cython
%load_ext cython
#increase number of displayed columns to max
pd.set_option('display.max_columns', None)

The cython extension is already loaded. To reload it, use:
  %reload_ext cython


```
#for PRAW - currently not used
import praw
#the following were set using the following guide - https://github.com/reddit-archive/reddit/wiki/OAuth2
r_client_id = os.environ['reddit_client_id']
r_secret_key = os.environ['reddit_secret_key']
r_username = os.environ['reddit_username']
r_password = os.environ['reddit_password']
r_app_id = 'Sub_specific_corpus'
app_version = 'v0.1'
python_version = !python -V
r_user_agent = '{}:{}:{} (by u/{})'.format(python_version, r_app_id, app_version, r_username)
#import logging #not implemented - https://praw.readthedocs.io/en/stable/getting_started/logging.html
```

# SQL database creation

In [6]:
def connect_to_SQL():
    return mysql.connector.connect(
        host='localhost',
        user='root',
        passwd='20RUNstackHost',
    )

In [7]:
def connect_to_db(db_name):
    return mysql.connector.connect(
        host='localhost',
        user='root',
        passwd='20RUNstackHost',
        database=db_name
    )

In [8]:
def check_if_db_exists(cursor, db_name):
    cursor.execute("SHOW DATABASES LIKE '{}'".format(db_name))
    return bool(cursor.fetchone())

In [9]:
def check_if_table_exists(cursor, table_name):
    cursor.execute("SHOW TABLES LIKE '{}'".format(table_name))
    return bool(cursor.fetchone())

In [10]:
def add_db(cursor, db_name):
    cursor.execute("CREATE DATABASE {}".format(db_name))

In [11]:
def add_comments_table(cursor):
    cursor.execute("CREATE TABLE comments (author VARCHAR(40), author_fullname VARCHAR(255),body TEXT, created_utc VARCHAR(255), id VARCHAR(20) PRIMARY KEY, is_submitter VARCHAR(255), link_id VARCHAR(255), parent_id VARCHAR(255), permalink VARCHAR(255), retrieved_on VARCHAR(30), score INT, subreddit VARCHAR(255), subreddit_id VARCHAR(255))")

In [12]:
def add_table_to_db(db_name, table):
    cnx_SQL = connect_to_SQL()
    cursor_SQL = cnx_SQL.cursor()
    if not check_if_db_exists(cursor_SQL, db_name):
        add_db(cursor_SQL, db_name)
        print('Database {} created'.format(db_name))
    cnx_SQL.commit()
    cursor_SQL.close()
    cnx_SQL.close()
    cnx = connect_to_db(db_name)
    cursor = cnx.cursor()
    if not check_if_table_exists(cursor, table):
        if table == 'comments':
            add_comments_table(cursor)
            print('Table {} created'.format(table))
    cnx.commit()
    cursor.close()
    cnx.close()

## Reddit API (PRAW) - Not used yet


```
#lets see if we can use the reddit API
reddit = praw.Reddit(
    client_id=r_client_id,
    client_secret=r_secret_key,
    user_agent=r_user_agent,
    username=r_username,
    password=r_password    
)
subreddit = reddit.subreddit('askreddit')
subreddit.comments(limit=100)
for i in subreddit.comments(limit=10):
    print(i.body)
```

## PushShift

In [13]:
#initialize the pushshift pmaw API
api = PushshiftAPI()

In [14]:
def clean_comments_df(df):
    '''
    Cleans the comments dataframe by removing columns that are not needed
    '''
    return df.drop(columns=['all_awardings', 'associated_award', 'author_flair_background_color', 'author_flair_css_class','author_flair_richtext', 'author_flair_template_id', 'author_flair_text',\
        'author_flair_text_color','author_flair_type', 'author_patreon_flair', 'author_premium', 'awarders', 'collapsed_because_crowd_control', 'comment_type', 'gildings','locked','no_follow',\
        'send_replies','stickied','top_awarded_type', 'total_awards_received', 'treatment_tags', 'archived', 'body_sha1', 'can_gild',  'collapsed', 'collapsed_reason', 'controversiality',\
        'distinguished','gilded','score_hidden','subreddit_name_prefixed','subreddit_type','author_cakeday','unrepliable_reason', 'collapsed_reason_code', 'retrieved_utc', 'edited'], axis=1, errors='ignore')       

In [15]:
def get_pushshift_comments(subreddit, before, after, limit):
    '''
    Gets comments from the pushshift API
    '''
    return api.search_comments(subreddit=subreddit, metadata=True, before=before, after=after, limit=limit)

In [16]:
def from_pushshift_to_df(comments):
    '''
    Converts the comments from the pushshift API to a dataframe
    '''
    comment_list = [c for c in comments]
    comments_df = pd.DataFrame(comment_list)
    return comments_df

In [17]:
def send_df_to_db(df, db_name, table):
    '''
    Sends the dataframe to the database
    '''
    add_table_to_db(db_name, table)
    engine = create_engine('mysql+pymysql://root:20RUNstackHost@localhost/{}'.format(db_name))
    df.to_sql(table, con = engine, if_exists='append', index=False)

In [18]:
def get_comments(subreddit, before, after, limit=None):
    '''
    Gets comments from the pushshift API and sends them to the database
    '''
    comments = get_pushshift_comments(subreddit, before, after, limit)
    comments_df = from_pushshift_to_df(comments)
    comments_df = clean_comments_df(comments_df)
    send_df_to_db(comments_df, subreddit, 'comments')

### records of pulling from push-shift

```
#getting all the data at once is a bad idea lets build a loop that does it 5000 dt units at a time
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
while start_date<end_date:
    get_comments('askreddit', end_date, end_date-5000)
    end_date-=5000

#ran it 500 minutes, got around 2 million comments, moved on to the next subreddit
#ending end_date = 1640203500 (2021-12-22-8-05-00)
```

```
#getting all the data at once is a bad idea lets build a loop that does it 10000 dt units at a time
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = 1638904006
count = 0

while start_date<end_date:
    get_comments('askmen', end_date, end_date-10000)
    end_date-=10000
    count += 1
    print("{} iterations last end_date ={}".format(count, end_date))
#ran it for 623 min, got around 2.5 mil comments, moved on to the next subreddit
#ending end_date = 1626774006, (2021-07-20-9-40-06)
```

```
#getting all the data at once is a bad idea lets build a loop that does it 20000 dt units at a time
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
count = 0

while start_date<end_date:
    get_comments('askwomen', end_date, end_date-20000)
    end_date-=20000
    count += 1
    print("{} iterations last end_date ={}".format(count, end_date))

#ran it for 485min, got 1534382 comments, moved on to the next subreddit
#ending end_date = 1609484000 #APROX Jan 1st 2021 ran to completion 
```

```
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
count = 0

while start_date<end_date:
    get_comments('dota2', end_date, end_date-20000)
    end_date-=20000
    count += 1
    print("{} iterations, last end_date ={}".format(count, end_date))
#ran it to completion in 518 min, got 1740932 comments
#ending end_date = 1609484000 #APROX Jan 1st 2021 ran to completion
```

```
#getting all the data at once is a bad idea lets build a loop that does it 10000 dt units at a time
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = 1618374000
count = 0

while start_date<end_date:
    get_comments('askscience', end_date, end_date-86400)
    end_date-=86400
    count += 1
    print("{} iterations, last end_date ={}".format(count, end_date))
#ran it three times, 
#stopped it at 2105 iterations, 430 min, 217326 records?, seemed low last end_date =1619974000 to increase step size from 10k to 50k
#stopped it again at 32 iterations, 7 min, 230515 records, and decided to just increase step size to 1 day. end_date = 1618374000
#ran to completion 3rd time 103 iterations, 23 min, 294504 records, end_date = 1609474800
```

```
#getting all the data at once is a bad idea lets build a loop that does it 86400 dt units at a time (1 day)
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
count = 0

while start_date<end_date:
    get_comments('politics', end_date, end_date-86400)
    end_date-=86400
    count += 1
    print("{} iterations, last end_date ={}".format(count, end_date))
#stopped it after 751 min, 94 iterations, end_date = 1632902400, total records 2,863,608
```

```
#getting all the data at once is a bad idea lets build a loop that does it 86400 dt units at a time (1 day)
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
count = 0

while start_date<end_date:
    get_comments('TwoXChromosomes', end_date, end_date-86400)
    end_date-=86400
    count += 1
    print("{} iterations, last end_date ={}".format(count, end_date))
#ran for 410 min to completion, got 1,619,521 comments, last end_date =1609488000
```

```
#getting all the data at once is a bad idea lets build a loop that does it 86400 dt units at a time (1 day)
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = 1629878400
count = 0

while start_date<end_date:
    get_comments('canada', end_date, end_date-86400)
    end_date-=86400
    count += 1
    print("{} iterations, last end_date ={}".format(count, end_date))

#ran it a few time, stopped twice without request, X iter, 41 iterations
#280 min on last one, 2,129,337 comments 
```

```
#getting all the data at once is a bad idea lets build a loop that does it 86400 dt units at a time (1 day)
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
count = 0

while start_date<end_date:
    get_comments('onguardforthee', end_date, end_date-86400)
    end_date-=86400
    count += 1
    print("{} iterations, last end_date ={}".format(count, end_date))

#ran to completion, 141 min, 412,382 comments
```

```
#getting all the data at once is a bad idea lets build a loop that does it 86400 dt units at a time (1 day)
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
count = 0

while start_date<end_date:
    get_comments('trollxchromosomes', end_date, end_date-86400)
    end_date-=86400
    count += 1
    print("{} iterations, last end_date ={}".format(count, end_date))

#ran to completion in 78 min, got 186,416 comments
```

In [26]:
#getting all the data at once is a bad idea lets build a loop that does it 86400 dt units at a time (1 day)
#original_start_date = int(dt.datetime(2021, 1, 1, 0, 0).timestamp())
#original_end_date = int(dt.datetime(2022, 1, 1, 0, 0).timestamp())
start_date = 1619078398
end_date = 1638864059
count = 0

while start_date<end_date:
    get_comments('christianity', end_date, end_date-86400)
    end_date-=86400
    count += 1
    print("{} iterations, last end_date ={}".format(count, end_date))

#ran to 1639209659

# Creation of corpus

## Import data from SQL

In [41]:
def pull_comments_from_SQL(subreddit):
    engine = create_engine('mysql+pymysql://root:20RUNstackHost@localhost/{}'.format(subreddit))
    df = pd.read_sql_table('comments', con=engine)
    return df

## Cleaning

In [42]:
authors_to_remove = ['AutoModerator', 'dota2_responses_bot', 'HCE_Replacement_Bot', 'Kevin_Garnett_Bot', 'Rangers_Bot', 'DropBox_Bot', 'Website_Mirror_Bot', 'Metric_System_Bot', \
    'Fedora-Tip-Bot', 'Some_Bot', 'Brigade_Bot', 'Link_Correction_Bot', 'Porygon-Bot', 'KarmaConspiracy_Bot', 'SWTOR_Helper_Bot', 'annoying_yes_bot', 'wtf_content_bot', 'Insane_Photo_Bot', \
    'Antiracism_Bot', 'qznc_bot', 'mma_gif_bot', 'QUICHE-BOT', 'bRMT_Bot', 'hockey_gif_bot', 'nba_gif_bot', 'gifster_bot', 'imirror_bot', 'okc_rating_bot', 'tennis_gif_bot', 'nfl_gif_bot', \
    'CPTModBot', 'LocationBot', 'CreepySmileBot', 'FriendSafariBot', 'WritingPromptsBot', 'CreepierSmileBot', 'IAgreeBot', 'Cakeday-Bot', 'Meta_Bot', 'HockeyGT_Bot', 'soccer_gif_bot', \
    'gunners_gif_bot', 'xkcd_number_bot', 'GWHistoryBot', 'PokemonFlairBot', 'ChristianityBot', 'cRedditBot', 'StreetFightMirrorBot', 'FedoraTipAutoBot', 'UnobtaniumTipBot', 'astro-bot', \
    'TipMoonBot', 'PlaylisterBot', 'Wiki_Bot', 'fedora_tip_bot', 'GunnersGifsBot', 'PGN-Bot', 'GunnitBot', 'havoc_bot', 'Relevant_News_Bot', 'gfy_bot', 'RealtechPostBot', 'imgurHostBot', \
    'Gatherer_bot', 'JumpToBot', 'DeltaBot', 'Nazeem_Bot', 'PhoenixBot', 'AtheismModBot', 'IsItDownBot', 'malo_the_bot', 'RFootballBot', 'KSPortBot', 'Makes_Small_Text_Bot', 'CompileBot', \
    'SakuraiBot', 'asmrspambot', 'SurveyOfRedditBot', 'RfreebandzBOT', 'rule_bot', 'xkcdcomic_bot', 'PloungeMafiaVoteBot', 'PoliticBot', 'Dickish_Bot_Bot', 'SuchModBot', 'MultiFunctionBot', \
    'CasualMetricBot', 'xkcd_bot', 'VerseBot', 'BeetusBot', 'GameDealsBot', 'BadLinguisticsBot', 'rhiever-bot', 'gfycat-bot-sucksdick', 'chromabot', 'Readdit_Bot', 'wooshbot', \
    'disapprovalbot', 'request_bot', 'define_bot', 'dogetipbot', 'techobot', 'CaptionBot', 'rightsbot', 'colorcodebot', 'roger_bot', 'ADHDbot', 'hearing-aid_bot', 'WikipediaCitationBot', \
    'PonyTipBot', 'fact_check_bot', 'rusetipbot', 'test_bot0x00', 'classybot', 'NFLVideoBot', 'MAGNIFIER_BOT', 'WordCloudBot2', 'JotBot', 'WeeaBot', 'raddit-bot', 'comment_copier_bot', \
    'coinflipbot', 'VideoLinkBot', 'new_eden_news_bot', 'hwsbot', 'UrbanDicBot', 'hearingaid_bot', 'thankyoubot', 'GeekWhackBot', 'ExmoBot', 'CHART_BOT', 'tips_bot', 'GATSBOT', 'allinonebot', \
    'moderator-bot', 'rnfl_robot', 'StackBot', 'GooglePlusBot', 'hit_bot', 'randnumbot', 'CAH_BLACK_BOT', 'CalvinBot', 'DogeTipStatsBot', 'autourbanbot', 'GabenCoinTipBot', 'Definition_Bot', \
    'redditbots', 'redditreviewbot', 'bot', 'autowikibot', 'golferbot', 'topredditbot', 'c5bot', 'jerkbot-3hunna', 'gracefulclaritybot', 'valkyribot', 'gracefulcharitybot', 'ddlbot', \
    'NoSobStoryBot2', 'bitofnewsbot', 'conspirobot', 'tipmoonbot1', 'd3posterbot', 'serendipitybot', 'gabentipbot', 'givesafuckbot', 'SakuraiBot_test', 'ttumblrbots', 'haiku_robot', \
    'tipmoonbot2', 'MarciBestGirl', 'comfort_bot_1962', 'WaterIsWetBot', 'VRCbot', 'Prof_Acorn']

In [43]:
def remove_unwanted_authors(df):
    '''
    Removes all comments made by authors in the list "authors_to_remove" from the dataframe
    '''
    return df[~df['author'].isin(authors_to_remove)]

In [44]:
def remove_removed_and_deleted_comments(df):
    '''
    Removes all removed and deleted comments from the dataframe
    '''
    df = df[df.body != '[removed]']
    return df[df.body != '[deleted]']


In [45]:
def remove_negative_score_comments(df):
    '''
    Removes all comments with a negative score from the dataframe
    '''
    return df[df.score > 0]

In [46]:
def remove_comments_with_body_len_less_than(df, truncate_len=20):
    '''
    Removes all comments with a body length less than 20 (def) from the dataframe
    '''
    return df[df.body.str.len() > truncate_len]

In [47]:
def remove_new_line_characters(df):
    '''
    Replaces all new line characters in the body of the dataframe with a space
    '''
    return df.replace(r'\n', ' ', regex=True)

In [48]:
def lower_case_body(df):
    '''
    Converts all body text to lower case
    '''
    df.body = df.body.str.lower()
    return df

In [49]:
def remove_links(string):
    '''
    Removes all links from the string
    '''
    pattern = re.compile('\[(.*?)\]\(.*?\)')
    
    return re.sub(pattern, '', string)

In [50]:
def remove_link_stragglers(df):
    '''
    Removes all link stragglers from the body of the dataframe in a gready way (drops all str containing http)
    '''
    return df[~df.body.str.contains('http')]


In [51]:
def clean_contractions(df):
    '''
    Removes all contractions from the dataframe
    '''
    df.body = df.body.apply(lambda x: contractions.fix(x))
    return df

In [52]:
def detect_gibberish(df):
    '''
    Detects gibberish in the body of the comments
    '''
    df['gibberish'] = df.body.apply(lambda x: Detector.is_gibberish(x))
    #drop rows where gibberish is detected
    df = df[df.gibberish == False]
    return df

In [53]:
def remove_punct_and_non_alphabetic_caracters(string):
    '''
    Removes all punctuation and non alphabetic characters from the string
    '''
    words = word_tokenize(string)
    words = [word for word in words if word.isalpha()]
    string = ' '.join(words)
    return string

In [54]:
def remove_accented_chars(string):
    '''
    Removes all accented characters from the string
    '''
    return unicodedata.normalize('NFKD', string).encode('ascii', 'ignore').decode('utf-8', 'ignore')

In [55]:
def lemmatize_with_spacy(string):
    '''
    Lemmatizes the body of the comments with spacy
    '''
    doc = nlp(string)
    return " ".join([token.lemma_ for token in doc])

In [56]:
def time_elapsed(start_time):
    return str(((time.time() - start_time) / 60).__round__(2)) + ' minutes elapsed total\n'

In [57]:
def num_of_comments_removed(df, last_shape):
    return 'Number of comments removed: ' + str(last_shape - df.shape[0]) + '\n'

In [58]:
def comments_remaining_percentage(df, start_shape):
    return 'Percentage of comments remaining: ' + str(((df.shape[0] / start_shape) * 100).__round__(2)) + '%\n Total comments remaining: ' + str(df.shape[0]) + '\n'

In [59]:
def time_spent_on_step(last_step_time):
    return 'Time spent on last step: ' + str(((time.time() - last_step_time) / 60).__round__(2)) + ' minutes ' + str(((time.time()-last_step_time) % 60).__round__(2)) + ' seconds\n'

In [60]:
def clean_df_column(df, truncate_at=20, no_bots=True, no_blanks=True, filter_neg_score=True, no_newline_char=True, no_links=True, lower=True, \
    no_accents=True, no_contractions=True, no_gibberish=True, no_punct_and_numbers=True, truncate=True, lemmatize=True):

    #set up the start time and shape of the dataframe
    start_shape = df.shape[0]
    start_time = time.time()
    last_shape = start_shape

    print('Starting cleaning process... \n Original number of comments: ' + str(start_shape) + '\n')
    
    if no_bots:
        last_step_time = time.time()
        df = remove_unwanted_authors(df)
        print('Bye bots o/\n', num_of_comments_removed(df, last_shape), time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]

    
    if no_blanks:
        last_step_time = time.time()
        df = remove_removed_and_deleted_comments(df)
        print('Removed & Deleted\n', num_of_comments_removed(df, last_shape), time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]

    if filter_neg_score:
        last_step_time = time.time()
        df = remove_negative_score_comments(df)
        print('Remove Sub 1 scores\n', num_of_comments_removed(df, last_shape), time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]

    if no_newline_char:
        last_step_time = time.time()
        df = remove_new_line_characters(df)
        print('Remove the new line character(s) \n', time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]
    
    if no_links:
        last_step_time = time.time()
        df.body = df.body.apply(lambda x: remove_links(x))
        print('Remove links in comments \n', time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        df = remove_link_stragglers(df)
        print('Remove straggler links \n',  num_of_comments_removed(df, last_shape), time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]
    
    if lower:
        last_step_time = time.time()
        df = lower_case_body(df)
        print('lower_case_body \n', time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]
    
    if no_accents:
        last_step_time = time.time()
        df.body = df.body.apply(lambda x: remove_accented_chars(x))
        print('remove_accented_chars \n', time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]
    
    if no_contractions:
        last_step_time = time.time()
        df = clean_contractions(df)
        print('clean_contractions:\n', time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]
    
    if no_gibberish:
        last_step_time = time.time()
        df['gibberish'] = 0
        df = detect_gibberish(df)
        print('Del gibberish:\n', num_of_comments_removed(df, last_shape), time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]
    
    if no_punct_and_numbers:
        last_step_time = time.time()
        df.body = df.body.apply(lambda x: remove_punct_and_non_alphabetic_caracters(x))
        print('Remove punctuation & non alphabetic characters:\n', time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]

    if truncate:
        last_step_time = time.time()
        df = remove_comments_with_body_len_less_than(df, truncate_at)
        print(f'Remove comments with # characters < {truncate_at}\n', num_of_comments_removed(df, last_shape), time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))
        last_shape = df.shape[0]

    if lemmatize:
        last_step_time = time.time()
        lem_count = 0
        df.body = df.body.apply(lambda x: lemmatize_with_spacy(x))
        print('finally done\n', time_elapsed(start_time), time_spent_on_step(last_step_time), comments_remaining_percentage(df, start_shape))

    return df

## Saving

In [64]:
def save_clean_comments(df, subreddit):
    '''
    Saves the cleaned comments to a csv file
    ''' 
    current_time = dt.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    df.to_csv(f'Generated_Data/{subreddit}_clean_comments_{current_time}.csv', index=False)
    print(f'Saved cleaned comments to {subreddit}_clean_comments_{current_time}.csv')

In [65]:
def save_corpus(df, subreddit):
    '''
    Saves the corpus to a csv file
    '''
    #current time in YYYY-MM-DD-HH-MM-SS format
    current_time = dt.datetime.now().strftime("%Y-%m-%d-%H-%M-%S")
    df = df.body
    df.to_csv(f'Generated_Data/{subreddit}_corpus_{current_time}.csv', index=False)
    print(f'Corpus saved to {subreddit}_corpus_{current_time}.csv')

## All together now

In [63]:
def create_corpus(subreddit, truncate_at=20):
    '''
    Creates a corpus from a subreddit
    does not pull comments from reddit live
    all comments are pulled from the SQL database corresponding to the subreddit
    '''
    df = pull_comments_from_SQL(subreddit)
    df = clean_df_column(df, truncate_at=truncate_at)
    save_clean_comments(df, subreddit)
    save_corpus(df, subreddit)

### records of corpus generation

```

create_corpus('trollxchromosomes', truncate_at=20)
#completed in 19 minutes, %82 of comments remaining


create_corpus('onguardforthee', truncate_at=20)
#  34.06 minutes elapsed total
#  Time spent on last step: 30.11 minutes 6.34 seconds
#  Percentage of comments remaining: 73.8206323263382%
#  Total comments remaining: 304,423


create_corpus('askscience', truncate_at=30)
#  19.6 minutes elapsed total
#  Time spent on last step: 17.21 minutes 12.35 seconds
#  Percentage of comments remaining: 45.87917311819194%
#  Total comments remaining: 135,116
#  45% of comments were removed/deleted, highest by far


create_corpus('atheism', truncate_at=30)
# 108.27 minutes elapsed total
# Time spent on last step: 95.37 minutes 22.13 seconds
# Percentage of comments remaining: 76.63061839754937%
# Total comments remaining: 948,097


create_corpus('dota2', truncate_at=30)
#  106.29 minutes elapsed total
#  Time spent on last step: 94.73 minutes 43.92 seconds
#  Percentage of comments remaining: 65.54133073549112%
#  Total comments remaining: 1,141,030


create_corpus('askreddit', truncate_at=40)
#  116.96 minutes elapsed total
#  Time spent on last step: 102.06 minutes 3.77 seconds
#  Percentage of comments remaining: 55.71227948081808%
#  Total comments remaining: 1,159,182


create_corpus('twoxchromosomes', truncate_at=40)
#  142.96 minutes elapsed total
#  Time spent on last step: 126.23 minutes 14.06 seconds
#  Percentage of comments remaining: 61.41970372721317%
#  Total comments remaining: 994,705
#  ~400k comments removed by moderator/deleted


create_corpus('askwomen', truncate_at=40)
# 154.01 minutes elapsed total
# Time spent on last step: 134.61 minutes 36.65 seconds
# Percentage of comments remaining: 69.02238164941976%
# Total comments remaining: 1,059,067


create_corpus('canada', truncate_at=40)
#  211.27 minutes elapsed total
#  Time spent on step: 191.95 minutes 56.79 seconds
#  Percentage of comments remaining: 62.72534596449505%
#  Total comments remaining: 1,335,634


create_corpus('askmen', truncate_at=40)
# 190.8 minutes elapsed total
# Time spent on last step: 167.27 minutes 15.92 seconds
# Percentage of comments remaining: 60.26%
# Total comments remaining: 1642984
# Saved cleaned comments to askmen_clean_comments_2022-01-21-22-28-23.csv
# Corpus saved to askmen_corpus_2022-01-21-22-28-50.csv

create_corpus('politics', truncate_at=40)
#  245.88 minutes elapsed total
#  Time spent on last step: 220.65 minutes 39.28 seconds
#  Percentage of comments remaining: 62.15%
#  Total comments remaining: 1779612
# Saved cleaned comments to politics_clean_comments_2022-01-22-02-38-15.csv
# Corpus saved to politics_corpus_2022-01-22-02-38-49.csv

create_corpus('christianity', truncate_at=30)
 198.54 minutes elapsed total
 Time spent on last step: 174.79 minutes 47.32 seconds
 Percentage of comments remaining: 72.23%
 Total comments remaining: 1286950
Saved cleaned comments to christianity_clean_comments_2022-01-22-08-01-48.csv
Corpus saved to christianity_corpus_2022-01-22-08-02-14.csv

```