# 1 Set Up

In [14]:
# 1.1 Imports
import requests
import json
import time
import pandas as pd
import numpy as np
from nltk import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
import re

In [20]:
# Natural Language Processing libraries, initiations and functions
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from nltk.stem import WordNetLemmatizer
from nltk import word_tokenize
from nltk.corpus import stopwords
from sklearn.decomposition import LatentDirichletAllocation
import re # Delete this if scraping in same notebook
#import TextBlob

In [50]:
# 1.2 This is a function that scrapes a subreddit and turns it into a pandas dataframe. 
def fun_scrape_reddit(the_subreddit, pages = 40, verbose = 1):
    all_posts = []
    first_url = 'http://www.reddit.com/r/' + the_subreddit + '.json'
    url = first_url
    list_dicts = []
    
    # Scraping:
    for round in range(pages):
        res = requests.get(url, headers = {'User-agent':'Electronic Goddess'})
        data = res.json()
        list_posts = data['data']['children']
        
        for i in range(len(list_posts)):
            index_dictionary = {
                    'Name'      : list_posts[i]['data']['name'],
                    'Title'     : list_posts[i]['data']['title'],
                    'Selftext'  : list_posts[i]['data']['selftext'],
                    'Subreddit' : list_posts[i]['data']['subreddit']
                }
            list_dicts.append(index_dictionary)
            
        after = data['data']['after']
        if type(after) == type(None):
            print('Done!')
            break
        else:
            url = first_url +'?after=' + after
            if verbose == 1:
                print('Round: '+ str(round + 1))
            time.sleep(1)

    return pd.DataFrame(list_dicts, columns = ['Name','Title','Selftext','Subreddit'])

In [51]:
df_gaslighting = fun_scrape_reddit('gaslighting', verbose = 0)

Done!


In [52]:
df_gaslighting

Unnamed: 0,Name,Title,Selftext,Subreddit
0,t3_9iblkv,"""Gaslighting"": One of the Most Dangerous Forms...",,gaslighting
1,t3_gad1g1,I think it may be time to let go....,"I've been hearing this term ""gaslighting"" a lo...",gaslighting
2,t3_gaho7j,Thinking about making a rule for the subreddit.,None of us here are therapists or otherwise qu...,gaslighting
3,t3_ga6w7c,Did she gaslight me?,"It was a month, only a month and it was the mo...",gaslighting
4,t3_ga6lb9,"Doctor Gaslighting his ex, also a doctor. ""Cun...",,gaslighting
...,...,...,...,...
378,t3_5qkd8a,What we talk about when we talk about Donald T...,,gaslighting
379,t3_5pl6oi,Is this gaslighting or not?,,gaslighting
380,t3_5pi4fl,Alternative Facts,,gaslighting
381,t3_5hmgtv,Gaslighting to Deflect from Unethical Behavior,When your partner gaslights you to deflect fro...,gaslighting


In [5]:
# 1.3 Test Scraper
def fun_test_scrape(the_subreddit):
    url = ('http://www.reddit.com/r/' + 
        the_subreddit + '.json')
    res = requests.get(url, 
        headers = {'User-agent':'Electronic Goddess'})
    data = res.json()
    list_of_posts = data['data']['children'][0]
    return list_of_posts

#fun_test_scrape('AskReddit')

# 2 Scraping

In [54]:
df_fds = fun_scrape_reddit('FemaleDatingStrategy',verbose=0)

Done!


In [13]:
df_AskReddit

Unnamed: 0,Name,Title,Selftext,Subreddit
0,t3_g6ydky,COVID-19 [Megathread] Week of April 23-April 29,Currently a pandemic called [COVID-19](https:/...,AskReddit
1,t3_gaarw1,What is a movie you find terrible but critics ...,,AskReddit
2,t3_gabx37,"Redditors who have actully gotten married, how...",,AskReddit
3,t3_ga9q25,"""winning an argument against a genius is hard,...",,AskReddit
4,t3_gacbsj,"Breaking News, an A-list celebrity has been ar...",,AskReddit
...,...,...,...,...
987,t3_gagkvg,The newest patch notes (May 2020) for Earth ha...,,AskReddit
988,t3_gagktb,What is the most important show in television ...,,AskReddit
989,t3_gagkt1,Why are you using reddit? How did you find red...,,AskReddit
990,t3_gagks6,What is the weirdest roommate that you ever ha...,,AskReddit


## 3 Cleaning

In [None]:
# Filling Nulls
les_or_inc.fillna('', inplace=True)

# Combining the title and selftext columns
les_or_inc['all_text'] = les_or_inc['title'] + ' ' + les_or_inc['selftext']

# Resetting the Index
les_or_inc.reset_index(inplace=True)

## 3.2 Count Vectorizing

In [179]:
# Instantiations of the tokenizer, lemmatizer and Count Vectorizer (with preprocessor)
tokenizer = RegexpTokenizer(r'\w+')
lemmatizer = WordNetLemmatizer()
def preprocess(text):
    text = re.sub(r'[^a-zA-Z]',' ', text.lower())
    tokens = word_tokenize(text)
    lemmer = WordNetLemmatizer()
    stop_words = stopwords.words("english")
    return " ".join([lemmer.lemmatize(word
                        ) for word in tokens if len(word) > 1 and not word in stop_words])
cvec = CountVectorizer(analyzer     = "word",
                       tokenizer    = tokenizer.tokenize,
                       preprocessor = preprocess,
                       stop_words   = 'english',
                       min_df       = 2)

In [66]:
# Creating Cvec DataFrame of both forums
df_words = pd.DataFrame(cvec.fit_transform(df_fds['Title'] + ' ' + df_fds['Selftext']).todense(), 
                        columns=cvec.get_feature_names())

  'stop_words.' % sorted(inconsistent))


In [67]:
df_words

Unnamed: 0,aa,abandon,abandoned,ability,able,abortion,abroad,absence,absolute,absolutely,...,younger,youtu,youtube,youtuber,youtubers,yr,zero,zone,zoom,zvm
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,1,0,0,5,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
827,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
828,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
829,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
830,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 4 E. D. A.

In [183]:
# LDA
def LDA(input_item, num_topics = 3, num_words = 5, pre_cveced = False):
    # Cols are the words. Rows are the topics
    topic_lists = []
    lda = LatentDirichletAllocation(n_components=num_topics, learning_method='online')
    
    if  pre_cveced == False: # For inserting a column and automatically cvecing things in function.
        lda.fit(cvec.fit_transform(input_item))
    elif pre_cveced == True: # For inserting a pre-cveced dataframe.
        lda.fit(input_item)
    else:
        print("pre_cveced only takes True or False")
        return
    for ix, topic in enumerate(lda.components_):
        topic_lists += [[cvec.get_feature_names()[i] for i 
                         in lda.components_[ix].argsort()[:-num_words - 1:-1]]]

    return pd.DataFrame(topic_lists, 
                        columns=[ 'Word_'  + str(i) for i in range(1, num_words +1 )],
                        index = [ 'Topic_' + str(i) for i in range(1, num_topics + 1)])

In [185]:
LDA(df_fds['Title'] + df_fds['Selftext'], num_topics= 5, num_words=10)

Unnamed: 0,Word_1,Word_2,Word_3,Word_4,Word_5,Word_6,Word_7,Word_8,Word_9,Word_10
Topic_1,scrote,endorse,gay,content,yes,relevant,sub,receipt,artist,fds
Topic_2,time,like,relationship,men,want,know,woman,man,year,life
Topic_3,like,woman,guy,men,want,feel,friend,sex,think,know
Topic_4,amp,http,png,lvm,age,webp,preview,redd,format,width
Topic_5,woman,men,amp,sociopath,beware,abuse,power,male,consciousness,sexuality


# 5 Laboratory

In [15]:
# Sentament
def Sentamentize(text):
    return TextBlob(str(text)).sentiment.polarity

In [None]:
# Most Common Words
def most_common_words(cveced_df):
    return cveced_df.sum().sort_values()[::-1]

In [None]:
# Correlations between two DFs
# Listing the correlations to the two data frames.
# 1 = represents coming from 1st subreddit.
# 0 = represents coming from 2nd subreddit.
def correlation_of_words(cvec_dfs, target_title = "is_first_df")
    return df_words.corr().sort_values([target_title])[target_title]
# df_corrs.tail(20)[18::-1]

## 5.2 Experiment: Allowing LDA function to be pre-cveced and in a DataFrame

In [183]:
# Original LDA
def LDA(input_item, num_topics = 3, num_words = 5, pre_cveced = False):
    # Cols are the words. Rows are the topics
    topic_lists = []
    lda = LatentDirichletAllocation(n_components=num_topics, learning_method='online')
    
    if  pre_cveced == False: # For inserting a column and automatically cvecing things in function.
        lda.fit(cvec.fit_transform(input_item))
    elif pre_cveced == True: # For inserting a pre-cveced dataframe.
        lda.fit(input_item)
    else:
        print("pre_cveced only takes True or False")
        return
    for ix, topic in enumerate(lda.components_):
        topic_lists += [[cvec.get_feature_names()[i] for i 
                         in lda.components_[ix].argsort()[:-num_words - 1:-1]]]

    return pd.DataFrame(topic_lists, 
                        columns=[ 'Word_' + str(i) for i in range(1, num_words+1)],
                        index = [ 'Topic_' + str(i) for i in range(1, num_topics + 1)])

In [180]:
# Starting work on allowing function to be pre-cveced and in a DataFrame
num_topics = 3
    # Cols are the words. Rows are the topics
topic_lists = []
lda = LatentDirichletAllocation(n_components=5, learning_method='online')
lda.fit(cvec.fit_transform((df_Coronavirus['Title'] + df_Coronavirus['Selftext'])))

for ix, topic in enumerate(lda.components_):
    topic_lists += [[cvec.get_feature_names()[i] for i in lda.components_[ix].argsort()[:-5 - 1:-1]]]

pd.DataFrame(topic_lists)#, columns=[ 'word_' + str(i) for i # Comment this out for ease.
                                   # in range(1, 6)], index=range(1, num_topics + 1))

  'stop_words.' % sorted(inconsistent))


Unnamed: 0,0,1,2,3,4
0,study,government,free,sars,say
1,coronavirus,covid,test,testing,u
2,coronavirus,covid,death,case,new
3,coronavirus,vaccine,reopen,stay,business
4,coronavirus,covid,pandemic,virus,mask


## 5.1 Experiment: Jargon Filter

In [100]:
from nltk.corpus import words

Note: Cut out words used in AskReddit Sub? (If "girl" has been used once or more in Ask, then cut it out of the Question DF)

In [90]:
# Returns correlated words vs Ask
test_value = df_fds
#print(test_value.head())
def fun_(val_in, df_ctrl = df_AskReddit):
    val_in['is_target'] = 1
    df_ctrl['is_target']= 0
    df_jargon = pd.concat([ val_in.drop('Subreddit', axis=1),
                            df_ctrl.drop('Subreddit', axis=1)],
                            sort=True).reset_index()
    
    df_words = pd.DataFrame(cvec.fit_transform(
        df_jargon['Title'] + ' ' + df_jargon['Selftext']).todense(), 
        columns=cvec.get_feature_names())
    df_words['is_target'] = df_jargon['is_target']
    df_corrs = df_words.corr().sort_values(['is_target'])['is_target']
    return df_corrs.drop('is_target')
test = fun_(test_value)

In [91]:
test.sort_values(ascending= False).head(20)

want       0.300836
know       0.298594
like       0.294275
dating     0.263467
men        0.261495
guy        0.259197
time       0.255947
really     0.251466
think      0.245828
thought    0.228016
woman      0.226616
year       0.222725
man        0.219374
way        0.217354
feel       0.216444
going      0.214858
need       0.213034
make       0.212173
say        0.211214
got        0.211015
Name: is_target, dtype: float64

In [92]:
# to get jargon
test_value = df_fds
#print(test_value.head())
def fun_(val_in, df_ctrl = df_AskReddit):
    val_in['is_target'] = 1
    df_ctrl['is_target']= 0
    df_jargon = pd.concat([ val_in.drop('Subreddit', axis=1),
                            df_ctrl.drop('Subreddit', axis=1)],
                            sort=True).reset_index()
    
    df_words = pd.DataFrame(cvec.fit_transform(
        df_jargon['Title'] + ' ' + df_jargon['Selftext']).todense(), 
        columns=cvec.get_feature_names())
    df_words['is_target'] = df_jargon['is_target']
    return df_words
test_df = fun_(test_value)

In [97]:
mask = test_df['is_target'] == 0
mask_words = test_df[mask].sum() > 4
test_df[mask].sum()[mask_words]

able           6
absolutely     5
actor          7
actually      24
adult          6
              ..
word           9
work          12
world         28
worst         27
year          24
Length: 224, dtype: int64

In [125]:
def preprocess(text):
    text = re.sub(r'[^a-zA-Z]',' ', text.lower())
    tokens = word_tokenize(text)
    lemmer = WordNetLemmatizer()
    #words_all = stopwords.words("english")
    list_clean = [lemmer.lemmatize(word) for word in tokens]
    #list_cut   = [word for word in list_clean if not word in words.words()]
    return " ".join(list_clean)
# )
cvec = CountVectorizer(analyzer     = "word",
                       tokenizer    = tokenizer.tokenize,
                       preprocessor = preprocess,
                       stop_words   = None,
                       min_df       = 2)

In [126]:
test_series = df_fds['Title'] + ' ' + df_fds['Selftext']

In [130]:
test_cvec = cvec.fit_transform(test_series)

In [132]:
df_words = pd.DataFrame(test_cvec.todense(), columns=cvec.get_feature_names())

In [119]:
df_fds['Selftext'].map(lambda x: len(x)).sort_values()

415        0
554        0
291        0
290        0
289        0
       ...  
773     9786
498    12361
616    14116
76     20295
639    26570
Name: Selftext, Length: 832, dtype: int64

In [122]:
test_doc = df_fds.loc[639, 'Selftext']

In [123]:
preprocess(test_doc)

'fds quora asked answered meshing ve needed ll ve heard examp le performs called workaholic changed condo yourselfer energized having tv tv establishing called controlling happened longest divorced exhibited proud talked counseling didn initiating abused assaulted talked viewing honoring suffered transmitted treated addictive suffered etc complimented planned etc changing treated didn didn having alienated identified sabotaged imprinted interferes differing disenfranchised etc having etc nots served served nonviolent committed arrested happened failed tv newsmagazines internet discussing featuring tv ll attending caused socializing planning smarter didn having maintaining caring others carpool internet modem others stronger defers equipped ethnicity dated ethnicity ethnicity aren biased ethnicity ethnicity ethnicity ethnicity showered happens happens greatest greatest creating caring replenishes'

In [135]:
test_list = [word for word in df_words.columns if not word in words.words()]

In [147]:
test_list_two = [word for word in test_list if word[-3:] != 'ing' and word[-2:] != 'ed']

In [155]:
#[word[:-3] for word in test_list if word[-3:] != 'ing']
[word[:-2] for word in test_list if word[-2:] == 'ed']

['abuse',
 'achieve',
 'acte',
 'addresse',
 'aligne',
 'allowe',
 'annoye',
 'answere',
 'apologize',
 'appeare',
 'appreciate',
 'approache',
 'approve',
 'argue',
 'arreste',
 'aske',
 'assaulte',
 'asse',
 'attacke',
 'attempte',
 'attracte',
 'avoide',
 'believe',
 'betraye',
 'bore',
 'bothere',
 'bullie',
 'calle',
 'cancele',
 'cancelle',
 'care',
 'cause',
 'challenge',
 'change',
 'chase',
 'chatte',
 'cheate',
 'choke',
 'claime',
 'cleane',
 'commente',
 'committe',
 'communicate',
 'compare',
 'complimente',
 'conflicte',
 'confronte',
 'consume',
 'contacte',
 'contribute',
 'cooke',
 'create',
 'criticize',
 'damage',
 'dare',
 'date',
 'delete',
 'delivere',
 'delude',
 'demonstrate',
 'describe',
 'destroye',
 'devastate',
 'develope',
 'die',
 'discusse',
 'disenfranchise',
 'disrespecte',
 'ditche',
 'divorce',
 'downvote',
 'droppe',
 'dumpe',
 'e',
 'emaile',
 'empowere',
 'encourage',
 'energize',
 'enjoye',
 'entere',
 'entitle',
 'equippe',
 'existe',
 'expecte'

In [151]:
test_list[3][:-3]

'accept'

### Further Jargon Ideas:

 - Attempt Stemming?
 - Remove any/all words in list of subreddits
 - Re-insert list of words and use as a filter for the column count of words

## Recycling Bin

In [None]:
test_value = 
print(test_value)
def fun_(val_in):
    return val_out
fun_(test_value)

In [6]:
from IPython.display import display, Image

def fun_display_pic(urll):
    return display(Image( url= urll))

#display(Image(url=bro_fun.find_element_by_css_selector(ATS_dict['logo']
#                ).get_attribute('src')))

In [49]:
# 1.4 Takes in the post url and spits out the image url
def fun_url_to_img(url):
    html = BeautifulSoup(requests.get(url, headers = {'User-agent':'Electronic Goddess'}).content)
    element = html.select('div[data-test-id="post-content"] > div > a > img')[0]
    return element.attrs['src']

In [None]:
# Get Sanity Check:
quick_check = requests.get(first_url, headers = {'User-agent':'Electronic Goddess'})
if int(str(quick_check)[11:14]) == 200:
    print("Get request successful.")
    time.sleep(3)
    print("Initiating Scrape...")
else:
    print("Get request not 200, instead recieved:" + str(quick_check))
    return

In [18]:
!pip install TextBlob --user # --user was needed here for the environment

[33mDEPRECATION: Python 2.7 will reach the end of its life on January 1st, 2020. Please upgrade your Python as Python 2.7 won't be maintained after that date. A future version of pip will drop support for Python 2.7. More details about Python 2 support in pip, can be found at https://pip.pypa.io/en/latest/development/release-process/#python-2-support[0m
Collecting TextBlob
  Using cached https://files.pythonhosted.org/packages/60/f0/1d9bfcc8ee6b83472ec571406bd0dd51c0e6330ff1a51b2d29861d389e85/textblob-0.15.3-py2.py3-none-any.whl
Processing /Users/pearl/Library/Caches/pip/wheels/ae/8c/3f/b1fe0ba04555b08b57ab52ab7f86023639a526d8bc8d384306/nltk-3.5-cp27-none-any.whl
Collecting click
  Using cached https://files.pythonhosted.org/packages/d2/3d/fa76db83bf75c4f8d338c2fd15c8d33fdd7ad23a9b5e57eb6c5de26b430e/click-7.1.2-py2.py3-none-any.whl
Collecting joblib
  Using cached https://files.pythonhosted.org/packages/28/5c/cf6a2b65a321c4a209efcdf64c2689efae2cb62661f8f6f4bb28547cf1bf/joblib-0.14.1-

## X Experiment: Getting Comments

## X Experiment: Getting Comments

In [25]:
df_libertarian = scrape_reddit("Libertarian")

Get request successful.
Initiating Scrape...
Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Limit likely hit.  Returning available posts.


In [84]:
mask = df_libertarian['Selftext'] != ''
df_libertarian[mask].tail(10)

Unnamed: 0,Name,Title,Selftext,Subreddit
898,t3_g5xrrf,Is this what our forefathers died for...?,The state should never misinterpret itself as ...,Libertarian
900,t3_g5ove2,"New to libertarianism, how would that solve in...",From what I understand (which is not much sinc...,Libertarian
906,t3_g5wgre,George Papadopoulos collusion denials to ‘Spyg...,[https://www.washingtonexaminer.com/news/geor...,Libertarian
907,t3_g5m0x3,Laws at the whim of bureaucrats,"""When men are caught in the trap of non-object...",Libertarian
910,t3_g5g0kk,Do Libertarians believe in anti-trust laws?,"Hey, just trying to educate myself on Libertar...",Libertarian
911,t3_g59q28,I want a tax cut for all the government servic...,Is anyone else a little miffed that we're stil...,Libertarian
916,t3_g5j1lt,Reliable news sources,Can anyone suggest credible political news sou...,Libertarian
919,t3_g62k05,So are we expected to shut down our lives just...,Are we going to bend over every time the gover...,Libertarian
920,t3_g61hc7,"Just a reminder, we have had a vaccine for COV...","Moderna had a vaccine on January 13th, and [pr...",Libertarian
936,t3_g5si37,Libertarian and Fully Free Market Minecraft Se...,"**TCB** \n***Economy, Claims, and Freedom!**...",Libertarian


In [58]:
url = ('http://www.reddit.com/r/Libertarian/comments/g59q28.json')

#url = ('http://www.reddit.com/r/Libertarian/comments/g62k05.json')
res = requests.get(url, headers = {'User-agent':'Electronic Goddess'})
test = res.json()

In [85]:
#test[1]['data']['children'][4]#['data']

In [41]:
# 1.2 This is a function that scrapes a subreddit and turns it into a pandas dataframe. 
def fun_scrape_reddit(the_subreddit, pages = 40):
    all_posts = []
    first_url = 'http://www.reddit.com/r/' + the_subreddit + '.json'
    url = first_url
    list_of_df = []
    after = 'First'
    
    # Scraping:
    for round in range(pages):
        res = requests.get(url, headers = {'User-agent':'Electronic Goddess'})
        data = res.json()
        list_of_posts = data['data']['children']
        all_posts = all_posts + list_of_posts
        after = data['data']['after']
        if type(after) == type(None):
            print('Limit likely hit.  Returning available posts.')
            break
        else:
            url = first_url +'?after=' + after
            print(url + '  Round: '+ str(round + 1) + after)
            time.sleep(1)

    # Formats the parts we care about into a list of dictionaries that'll become the dataframe
    for i in range(len(all_posts)):
        index_dictionary = {
                'Name'      : all_posts[i]['data']['name'],
                'Title'     : all_posts[i]['data']['title'],
                'Selftext'  : all_posts[i]['data']['selftext'],
                'Subreddit' : all_posts[i]['data']['subreddit']
            }
        list_of_df.append(index_dictionary)
    return pd.DataFrame(list_of_df, columns = ['Name','Title','Selftext','Subreddit'])

In [8]:
df_AskReddit = scrape_reddit('AskReddit')

Get request successful.
Initiating Scrape...
Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Limit likely hit.  Returning available posts.


In [9]:
df_tifu = scrape_reddit('tifu')

Get request successful.
Initiating Scrape...
Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Limit likely hit.  Returning available posts.


In [10]:
df_IncelsWithoutHate = scrape_reddit('IncelsWithoutHate')

Get request successful.
Initiating Scrape...
Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Limit likely hit.  Returning available posts.


In [11]:
df_Coronavirus = scrape_reddit('Coronavirus')

Get request successful.
Initiating Scrape...
Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Limit likely hit.  Returning available posts.


In [12]:
df_transgender = scrape_reddit('transgender')

Get request successful.
Initiating Scrape...
Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Limit likely hit.  Returning available posts.


In [22]:
#df_ = scrape_reddit('transvoice')

Get request successful.
Initiating Scrape...
Round: 1
Round: 2
Round: 3
Round: 4
Round: 5
Round: 6
Round: 7
Round: 8
Round: 9
Round: 10
Round: 11
Round: 12
Round: 13
Round: 14
Round: 15
Round: 16
Round: 17
Round: 18
Round: 19
Round: 20
Round: 21
Round: 22
Round: 23
Round: 24
Round: 25
Round: 26
Round: 27
Round: 28
Round: 29
Round: 30
Round: 31
Round: 32
Round: 33
Round: 34
Round: 35
Round: 36
Round: 37
Round: 38
Round: 39
Limit likely hit.  Returning available posts.


Unnamed: 0,Name,Title,Selftext,Subreddit
0,t3_bokzzu,A welcome to new users and some FAQ's answered.,Hi there! Welcome to r/Transvoice. Our subscri...,transvoice
1,t3_d3clhe,L's Voice Training Guide (Level 1) for MTF tra...,L's Voice Training Guide (Level 1) for MTF tra...,transvoice
2,t3_gb0v9i,i’ve been on the fence about starting t for th...,,transvoice
3,t3_gapin2,I probably sound weird but meh. Since I don't ...,,transvoice
4,t3_gb5rth,I can't use my voice and don't know why,I personally feel this is more on the strange ...,transvoice
...,...,...,...,...
992,t3_er67rm,Breathing issues,"I'm mtf very early stages of voice training, a...",transvoice
993,t3_er2zle,Looks like I have work to do.,,transvoice
994,t3_er26l9,Hey guys. Do I pass here and can I get some fe...,,transvoice
995,t3_er81gc,End of first dayof practice and wanted to shar...,,transvoice


In [19]:
# For Combining
# Identifying the y Values
df_SubredditDrama['target_sub'] = 1
df_AskReddit['target_sub']      = 0

# Concatination of the two subreddits
df_compare = pd.concat([df_SubredditDrama.drop('subreddit', axis=1),
                        df_AskReddit.drop('subreddit', axis=1)])

# Filling Nulls
df_compare.fillna('', inplace=True)

# Combining the title and selftext columns
df_compare['all_text'] = df_compare['title'] + ' ' + df_compare['selftext']

# Resetting the Index
df_compare.reset_index(inplace=True)

In [61]:
df_fds.isna().sum()

Name         0
Title        0
Selftext     0
Subreddit    0
dtype: int64

In [19]:
# Combining the title and selftext columns
df_compare['all_text'] = df_compare['title'] + ' ' + df_compare['selftext']



In [21]:
# Listing the correlations to the two data frames.
# 1 = represents coming from lesbians subreddit.
# 0 = represents coming from incels subreddit.
df_corrs = df_words.corr().sort_values(['target_sub'])['target_sub']
print("Most correlated to Target subreddit???")
df_corrs.tail(20)[18::-1]

Most correlated to Target subreddit???


user         0.352601
http         0.313004
com          0.312413
reddit       0.304718
comment      0.302215
www          0.275697
drama        0.271083
thread       0.241970
post         0.240567
argument     0.191756
debate       0.191575
op           0.185547
context      0.178263
subreddit    0.177042
source       0.176399
mod          0.174861
share        0.173518
medium       0.173511
amp          0.172538
Name: target_sub, dtype: float64

In [None]:
test_value = 
print(test_value)
def fun_(val_in):
    return val_out
funct_map_mins(test_value)

In [34]:
df_incels.drop(['is_lesbians', 'target_sub'], 
               axis = 1, inplace = True)

In [30]:

test_value = df_incels
print(test_value.head())
def fun_identify_jargon(df_in):
    df_in['target_sub']        = 1
    df_AskReddit['target_sub'] = 0

# Prepping Concatted Dataframe
    # Concatination of the two subreddits
#    df_compare = pd.concat([df_SubredditDrama.drop('subreddit', axis=1),
#        df_AskReddit.drop('subreddit', axis=1)])
#    df_compare.fillna('', inplace=True)
#    df_compare['all_text'] = df_compare['title'
#        ] + ' ' + df_compare['selftext']
#    df_compare.reset_index(inplace=True)
#
#    # Creating Cvec DataFrame of both forums
#    df_words = pd.DataFrame(cvec.fit_transform(df_compare['all_text']
#        ).todense(), columns=cvec.get_feature_names())
#    df_words['target_sub'] = df_compare['target_sub']

# Listing the correlations to the two data frames.
#    df_corrs = df_words.corr().sort_values(['target_sub'])['target_sub']
#    print("Most correlated to Target subreddit?")
#    return df_corrs.tail(20)#[18::-1]
fun_identify_jargon(test_value)

                                               title  \
0     Goodbye, guys. I leave you all with this song.   
1  Found a false flag IncelTear/AHS user. Watch o...   
2  First day of kindergarten tomorrow? Any advice...   
3                    Joblless Female Geting Rejected   
4           “The past is the past guys”. Get over it   

                                            selftext  subreddit  is_lesbians  \
0  I was going to do this on a throwaway and just...  Braincels            0   
1                                                NaN  Braincels            0   
2  Sup guys I'm a 4 year old incel (got rejected ...  Braincels            0   
3                                                NaN  Braincels            0   
4                                                NaN  Braincels            0   

   target_sub  
0           1  
1           1  
2           1  
3           1  
4           1  
Most correlated to Target subreddit?


amp           0.172538
medium        0.173511
share         0.173518
mod           0.174861
source        0.176399
subreddit     0.177042
context       0.178263
op            0.185547
debate        0.191575
argument      0.191756
post          0.240567
thread        0.241970
drama         0.271083
www           0.275697
comment       0.302215
reddit        0.304718
com           0.312413
http          0.313004
user          0.352601
target_sub    1.000000
Name: target_sub, dtype: float64

In [None]:
# Listing the correlations to the two data frames.
# 1 = represents coming from lesbians subreddit.
# 0 = represents coming from incels subreddit.
df_corrs = df_words.corr().sort_values(['is_lesbians'])['is_lesbians']
print("Most correlated to Lesbians subreddit")
df_corrs.tail(20)[18::-1]

In [None]:
print("Most correlated to Incels subreddit")
df_corrs.head(20)

In [None]:
# Identifying Jargon

# Note: Cut out words used in AskReddit Sub? (If "girl" has been used once or more in Ask, 
    # then cut it out of the Question DF)
# Identifying the y Values
['is_target'] = 1
Ask['is_target'] = 0

# Concatination of the two subreddits
df_jargon = pd.concat([targets.drop('subreddit', axis=1),
                        Ask.drop('subreddit', axis=1)],
                        sort=True)

# Filling Nulls
df_jargon.fillna('', inplace=True)

# Combining the title and selftext columns
df_jargon['all_text'] = df_jargon['title'] + ' ' + df_jargon['selftext']

# Resetting the Index
df_jargon.reset_index(inplace=True)



# Creating Cvec DataFrame of both forums
df_words = pd.DataFrame(cvec.fit_transform(df_jargon['all_text']).todense(), 
                        columns=cvec.get_feature_names())

# Inserting the target column
df_words['is_target'] = df_jargon['is_target']
# Listing the correlations to the two data frames.
# 1 = represents coming from lesbians subreddit.
# 0 = represents coming from incels subreddit.
df_corrs = df_words.corr().sort_values(['is_target'])['is_target']
print("Most correlated to target subreddit")
df_corrs.tail(20)[18::-1]

In [None]:

test_value = df_incels
print(test_value.head())
def fun_identify_jargon(df_in):
    df_in['target_sub']        = 1
    df_AskReddit['target_sub'] = 0

# Prepping Concatted Dataframe
    # Concatination of the two subreddits
#    df_compare = pd.concat([df_SubredditDrama.drop('subreddit', axis=1),
#        df_AskReddit.drop('subreddit', axis=1)])
#    df_compare.fillna('', inplace=True)
#    df_compare['all_text'] = df_compare['title'
#        ] + ' ' + df_compare['selftext']
#    df_compare.reset_index(inplace=True)
#
#    # Creating Cvec DataFrame of both forums
#    df_words = pd.DataFrame(cvec.fit_transform(df_compare['all_text']
#        ).todense(), columns=cvec.get_feature_names())
#    df_words['target_sub'] = df_compare['target_sub']

# Listing the correlations to the two data frames.
#    df_corrs = df_words.corr().sort_values(['target_sub'])['target_sub']
#    print("Most correlated to Target subreddit?")
#    return df_corrs.tail(20)#[18::-1]
fun_identify_jargon(test_value)

In [None]:
# Identifying Jargon

# Note: Cut out words used in AskReddit Sub? (If "girl" has been used once or more in Ask, 
    # then cut it out of the Question DF)

# Concatination of the two subreddits

# Combining the title and selftext columns
df_jargon['all_text'] = df_jargon['title'] + ' ' + df_jargon['selftext']

# Resetting the Index




# Creating Cvec DataFrame of both forums
df_words = pd.DataFrame(cvec.fit_transform(df_jargon['all_text']).todense(), 
                        columns=cvec.get_feature_names())

# Inserting the target column
df_words['is_target'] = df_jargon['is_target']
# Listing the correlations to the two data frames.
# 1 = represents coming from lesbians subreddit.
# 0 = represents coming from incels subreddit.
df_corrs = df_words.corr().sort_values(['is_target'])['is_target']
print("Most correlated to target subreddit")
df_corrs.tail(20)[18::-1]

In [None]:

test_value = df_incels
print(test_value.head())
def fun_identify_jargon(df_in):
    df_in['target_sub']        = 1
    df_AskReddit['target_sub'] = 0

# Prepping Concatted Dataframe
    # Concatination of the two subreddits
#    df_compare = pd.concat([df_SubredditDrama.drop('subreddit', axis=1),
#        df_AskReddit.drop('subreddit', axis=1)])
#    df_compare.fillna('', inplace=True)
#    df_compare['all_text'] = df_compare['title'
#        ] + ' ' + df_compare['selftext']
#    df_compare.reset_index(inplace=True)
#
#    # Creating Cvec DataFrame of both forums
#    df_words = pd.DataFrame(cvec.fit_transform(df_compare['all_text']
#        ).todense(), columns=cvec.get_feature_names())
#    df_words['target_sub'] = df_compare['target_sub']

# Listing the correlations to the two data frames.
#    df_corrs = df_words.corr().sort_values(['target_sub'])['target_sub']
#    print("Most correlated to Target subreddit?")
#    return df_corrs.tail(20)#[18::-1]
fun_identify_jargon(test_value)

In [None]:
# Words more correlated to one vs the other

# Identifying the y Values
df_lesbians['is_lesbians'] = 1
df_incels['is_lesbians'] = 0

# Concatination of the two subreddits
les_or_inc = pd.concat([df_lesbians.drop('subreddit', axis=1),
                        df_incels.drop('subreddit', axis=1)])

# Filling Nulls
les_or_inc.fillna('', inplace=True)

# Combining the title and selftext columns
les_or_inc['all_text'] = les_or_inc['title'] + ' ' + les_or_inc['selftext']

# Resetting the Index
les_or_inc.reset_index(inplace=True)



# Creating Cvec DataFrame of both forums
df_words = pd.DataFrame(cvec.fit_transform(les_or_inc['all_text']).todense(), 
                        columns=cvec.get_feature_names())

# Inserting the target column
df_words['is_lesbians'] = les_or_inc['is_lesbians']



# Listing the correlations to the two data frames.
# 1 = represents coming from lesbians subreddit.
# 0 = represents coming from incels subreddit.
df_corrs = df_words.corr().sort_values(['is_lesbians'])['is_lesbians']
print("Most correlated to Lesbians subreddit")
df_corrs.tail(20)[18::-1]

In [None]:
# Identifying the y Values
df_SubredditDrama['target_sub'] = 1
df_AskReddit['target_sub']      = 0

# Concatination of the two subreddits
df_compare = pd.concat([df_SubredditDrama.drop('subreddit', axis=1),
                        df_AskReddit.drop('subreddit', axis=1)])

# Filling Nulls
df_compare.fillna('', inplace=True)

# Combining the title and selftext columns
df_compare['all_text'] = df_compare['title'] + ' ' + df_compare['selftext']

# Resetting the Index
df_compare.reset_index(inplace=True)

In [None]:
# Creating Cvec DataFrame of both forums
df_words = pd.DataFrame(cvec.fit_transform(df_compare['all_text']).todense(), 
                        columns=cvec.get_feature_names())