# Dataset for embeddings
Let's scrape from 1000 to 1500 comments for each initial dataset class.

In [34]:
import kaggle
import os
import pandas as pd
import datetime as dt
import requests
import pandas as pd
import nltk
import lxml
import cchardet
import time
import numpy as np
import io
import regex as re
import importlib
import gensim
import modules.preprocess as preprocess
importlib.reload(preprocess)

from scipy import spatial
from bs4 import BeautifulSoup
from tqdm import tqdm
from nltk import WordNetLemmatizer
from multiprocessing import Pool



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TOPAPEC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# kaggle datasets download -d theshadow29/subreddit-classification
try:
    os.mkdir("dataset")
except FileExistsError:
    print("Directory already exists")
kaggle.api.authenticate()
kaggle.api.dataset_download_files("theshadow29/subreddit-classification", path="dataset", unzip=True)

In [None]:
dataset = pd.read_csv('dataset/fine_grained_full.csv')
dataset.shape

In [None]:
dataset.head()

In [None]:
start_date = int(dt.datetime(2017, 1, 1, 0, 0).timestamp())
end_date = int(dt.datetime(2019, 1, 1, 0, 0).timestamp())
limit_per_subreddit = 1500
pages_per_sub = 40
subreddits = dataset["label"].unique()
try:
    os.mkdir("reddit_comments")
except FileExistsError:
    print("Directory already exists")
subreddits.shape

In [None]:
def pull_page(subreddit):
    titles = []
    url = f"https://old.reddit.com/r/{subreddit}/"
    requests_session = requests.Session()
    headers = {'User-Agent': 'Mozilla/5.0'}
    for i in range(pages_per_sub):
        if (i % 5 == 4):
            time.sleep(2)
        page = requests_session.get(url, headers=headers)
        soup = BeautifulSoup(page.text, 'lxml')
        attrs = {'class': 'thing'}
        for post in soup.find_all('div', attrs=attrs):
            titles.append(post.find('p', class_="title").text)
        next_button = soup.find("span", class_="next-button")
        try:
            url = next_button.find("a").attrs['href']
        except:
            print(f"{subreddit} page {i}")
            break
        
    return (subreddit, titles)

In [None]:

start = dt.datetime.now()
for chunk in tqdm(range(0, 1401, 40)):
    with Pool(12) as pool:
        for subreddit, result in pool.map(pull_page, subreddits[chunk:min(chunk + 40, 1430)]):
            df = pd.DataFrame(result)
            df.to_csv(f"reddit_comments{os.path.sep}{subreddit}.csv", index=False)
        time.sleep(2)
passed = dt.datetime.now() - start
print(passed)

In [None]:
pull_page(subreddits[0])

I failed to parse subreddits as there were too little titles available and the data was too noisy as well. I managed to parse 90 mib of data but of course that wasn't nearly enough to train decent embedding. At the same time my attempts to use pushshift api failed as there was a problem on their side (api returned only super small fraction of each query. 

But luckily I found pretrained glove word embeddings trained on more than 250 gib of reddit data. So I'm willing to test if they are good enough. 
https://www.kaggle.com/leighplt/glove-reddit-comments

In [None]:

# Number of words - 1623397 
# word_part = ["" for i in range(1623397)]
# number_part = np.empty((1623397, 300), dtype=np.float32)
embeddings = {}
with io.open("GloVe.Reddit.120B.300D.txt", "r", encoding='utf-8') as file:
    file.readline()
    for i, line in tqdm(enumerate(file)):
        values = line.split(' ');
#         word_part[i] = values[0]
#         number_part[i] = np.asarray(values[1:], "float32")
        embeddings[values[0]] = np.asarray(values[1:], "float32")

In [22]:
model = gensim.models.KeyedVectors.load_word2vec_format('./GoogleNews-vectors-negative300.bin', binary=True)

In [None]:
number_part.shape

In [18]:
# https://www.kaggle.com/leighplt/glove-reddit-comments

def test_embeddings_wordsim(embeddings):
    av_abs_dev = 0.0
    file_len = sum(1 for line in open('wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt'))
    with open("wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt") as file:
        for line in file:
            values = line.split()
            print(values[0], values[1], values[2])
            simil = float(values[2]) / 10.0
            cos_sim = 1 - spatial.distance.cosine(embeddings[values[0]], embeddings[values[1]])
            print(f"Accordings to embeddings {cos_sim}")
            av_abs_dev += abs(cos_sim - simil)
    print(f"Final average abs deviation: {av_abs_dev / float(file_len)}")

In [19]:
test_embeddings_wordsim(embeddings)

computer keyboard 7.62
Accordings to embeddings 0.5736358761787415
Jerusalem Israel 8.46
Accordings to embeddings 0.6853241920471191
planet galaxy 8.11
Accordings to embeddings 0.7235155701637268
canyon landscape 7.53
Accordings to embeddings 0.33178621530532837
OPEC country 5.63
Accordings to embeddings 0.26875486969947815
day summer 3.94
Accordings to embeddings 0.6057249903678894
day dawn 7.53
Accordings to embeddings 0.48720410466194153
country citizen 7.31
Accordings to embeddings 0.6312815546989441
planet people 5.75
Accordings to embeddings 0.4247969388961792
environment ecology 8.81
Accordings to embeddings 0.445987343788147
Maradona football 8.62
Accordings to embeddings 0.3413398563861847
OPEC oil 8.59
Accordings to embeddings 0.42219042778015137
money bank 8.50
Accordings to embeddings 0.6444064378738403
computer software 8.50
Accordings to embeddings 0.7147970199584961
law lawyer 8.38
Accordings to embeddings 0.6398980021476746
weather forecast 8.34
Accordings to embeddings

In [23]:
test_embeddings_wordsim(model)

computer keyboard 7.62
Accordings to embeddings 0.3963916301727295
Jerusalem Israel 8.46
Accordings to embeddings 0.6638747453689575
planet galaxy 8.11
Accordings to embeddings 0.6338510513305664
canyon landscape 7.53
Accordings to embeddings 0.24369587004184723
OPEC country 5.63
Accordings to embeddings 0.1194893941283226
day summer 3.94
Accordings to embeddings 0.4481317102909088
day dawn 7.53
Accordings to embeddings 0.331085741519928
country citizen 7.31
Accordings to embeddings 0.20985937118530273
planet people 5.75
Accordings to embeddings 0.08704742044210434
environment ecology 8.81
Accordings to embeddings 0.4299762547016144
Maradona football 8.62
Accordings to embeddings 0.28883832693099976
OPEC oil 8.59
Accordings to embeddings 0.5333777666091919
money bank 8.50
Accordings to embeddings 0.2613206207752228
computer software 8.50
Accordings to embeddings 0.5444108843803406
law lawyer 8.38
Accordings to embeddings 0.34465768933296204
weather forecast 8.34
Accordings to embedding

As you can see wordsim test is not very representative of the quality of embeddings. Now, let's use KNN classification on our dataset to determine the best vectors.

In [6]:

dataset1 = pd.read_csv('dataset/fine_grained_full.csv')
dataset2 = pd.read_csv('dataset/cleaned_all_title_data_controversial.csv')
print(dataset1.shape, dataset2.shape)

(429300, 2) (1000156, 2)


In [56]:
dataset1.head()

Unnamed: 0,label,text
0,studyroomf,Do you subscribe to the theory that all the ev...
1,studyroomf,"A pivotal moment for the dean: ""We love you too"""
2,studyroomf,Episode Discussion - S04E05 - Cooperative Esca...
3,studyroomf,"Dan Harmon says ""There's a character from seas..."
4,studyroomf,'Can we take a sidebar from this sidebar?' Sug...


In [2]:
os.cpu_count()

16

In [3]:

def preprocess_pipeline(dataset):
    cores = 12
    multicore_tok(dataset, cores)
    multicore_lem(dataset, cores)

def multicore_tok(dataset, cores=6):
    with Pool(processes=cores) as pool:
        dataset.loc[:, "text"] = pool.map(nltk.word_tokenize, dataset.loc[:, "text"])

def multicore_lem(dataset, cores=6):
    with Pool(processes=cores) as pool:
        wnl = WordNetLemmatizer()
        for i, line in tqdm(enumerate(dataset.text)):
            dataset.loc[i,"text"] = pool.map(wnl.lemmatize, dataset.loc[i, "text"])

In [98]:
preprocess_pipeline(dataset1)
print(dataset1[:4])
dataset1.to_csv("preprocessed_serialised/dataset_fine_grained_full.csv", index=False)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


        label                                               text
0  studyroomf  [Do, you, subscribe, to, the, theory, that, al...
1  studyroomf  [A, pivotal, moment, for, the, dean, :, ``, We...
2  studyroomf  [Episode, Discussion, -, S04E05, -, Cooperativ...
3  studyroomf  [Dan, Harmon, say, ``, There, 's, a, character...


In [1]:
preprocess_pipeline(dataset2)
print(dataset2[:4])
dataset2.to_csv("preprocessed_serialised/dataset_cleaned_all_title_data_controversial.csv", index=False)

NameError: name 'preprocess_pipeline' is not defined

In [2]:
dateset_controversial = preprocess.parse_lemmatized("preprocessed_serialised/dataset_cleaned_all_title_data_controversial.csv")

  return array(a, dtype, copy=False, order=order)


In [3]:
# import multiprocessing, logging
# logger = multiprocessing.log_to_stderr()
# logger.setLevel(logging.DEBUG)
dateset_controversial.loc[:, "text"] = preprocess.clean_further(dateset_controversial)

100%|████████████████████████████████████████████████████████████████████| 1000156/1000156 [00:07<00:00, 130092.02it/s]


In [13]:
pd.set_option('max_rows', 1000)
pd.set_option('max_colwidth', 300)
pd.describe_option('max_colwidth')

dateset_controversial.sample(frac=0.001)

display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 300]


Unnamed: 0.1,Unnamed: 0,label,text
828713,828713,Guitar_licks,"[good, rockin, tonight, rockabilly, guitar, solo]"
138376,138376,nosleep,"[been, getting, some, weird, text, messages]"
100945,100945,dating_advice,"[how, do, you, find, woman, who, are, worth, your, time, nowadays]"
848443,848443,advancedentrepreneur,"[forbes, 100, best, websites, for, entrepreneurs]"
956377,956377,whatsbotheringyou,"[it, feel, a, if, i, fail, at, everything, i, try]"
110495,110495,Fallout,"[i, think, the, poor, story, and, npc, reactivity, of, fallout, 4, is, the, most, damaging, to, the, experience, that, i, ve, ever, seen, in, gaming]"
502893,502893,immigration,"[what, are, we, going, to, talk, about, when, family, separation, is, no, longer, a, thing]"
142147,142147,hardware,"[fluid, smart, water, meter, gives, consumers, the, power, to, solve, water, crisis]"
769064,769064,amathenedit,"[ama, then, edit, to, make, me, look, like, a, washedup, celebutante, desperate, to, stay, relevant]"
508877,508877,femalefashionadvice,"[what, to, wear, for, college, movein]"


Let's apply tfidf with word embeddings.

In [85]:
from sklearn.feature_extraction.text import TfidfVectorizer
def get_features(dataset, cores=12):
    X_reduced = pd.DataFrame()
    X_reduced.loc[:, "label"] = dataset["label"].unique()
    print(X_reduced.shape[0])
    X_reduced["text"] = [[] for i in range(X_reduced.shape[0])]
    for i, label in enumerate(tqdm(X_reduced.loc[:, "label"])):
        titles = dataset[dataset.label == "studyroomf"]["text"].to_list()
        X_reduced.loc[i, "text"].extend(title for title in titles)
    print(X_reduced["text"])
    with Pool(processes=cores) as pool:
        tmp = list(tqdm(pool.imap(preprocess.unite_string, X_reduced.loc[:,"text"], chunksize=(X_reduced.shape[0] // 100)), total=X_reduced.shape[0]))
        X_reduced.loc[:, "text"] = tmp
    print(X_reduced["text"])
    tfidfvectorizer = TfidfVectorizer(analyzer='word',stop_words='english')
    X_reduced.loc[:, "tfidf"] = tfidfvectorizer.fit_transform(X_reduced)
    return X_reduced

def features_to_embeddings(dateset_controversial, ind, embeddings):
    count = 0
    result = np.zeros(300)
    for word in x:
        if word in embeddings:
            count += 1
            result += embeddings[word] * x
            
# def tfidf_features_to_embeddings_space(X, embeddings):
    

In [86]:
X = get_features(dateset_controversial)

  0%|▏                                                                                | 4/1466 [00:00<00:45, 31.99it/s]

1466


100%|██████████████████████████████████████████████████████████████████████████████| 1466/1466 [00:46<00:00, 31.70it/s]
  0%|                                                                                         | 0/1466 [00:00<?, ?it/s]

0       [[in, regard, to, season, 4, 5, and, 6, and, h...
1       [[in, regard, to, season, 4, 5, and, 6, and, h...
2       [[in, regard, to, season, 4, 5, and, 6, and, h...
3       [[in, regard, to, season, 4, 5, and, 6, and, h...
4       [[in, regard, to, season, 4, 5, and, 6, and, h...
                              ...                        
1461    [[in, regard, to, season, 4, 5, and, 6, and, h...
1462    [[in, regard, to, season, 4, 5, and, 6, and, h...
1463    [[in, regard, to, season, 4, 5, and, 6, and, h...
1464    [[in, regard, to, season, 4, 5, and, 6, and, h...
1465    [[in, regard, to, season, 4, 5, and, 6, and, h...
Name: text, Length: 1466, dtype: object


  0%|                                                                                         | 0/1466 [00:02<?, ?it/s]


TypeError: sequence item 0: expected str instance, list found

In [69]:
print(type(X))

<class 'scipy.sparse.csr.csr_matrix'>


In [84]:
dateset_controversial[dateset_controversial.label == "studyroomf"]["text"].to_list()

[['in',
  'regard',
  'to',
  'season',
  '4',
  '5',
  'and',
  '6',
  'and',
  'hopefully',
  'no',
  'season',
  '7',
  'nurse',
  'jackie',
  'said',
  'it',
  'best'],
 ['instead',
  'of',
  'season',
  '5',
  'what',
  'if',
  'they',
  'made',
  'a',
  'season',
  '42'],
 ['karate', 'kid', 'in', 's6', 'an', 'obsessive', 'fantheory'],
 ['brand', 'new', 'study', 'group', 'for', 'the', 'next', 'series'],
 ['one',
  'of',
  'the',
  'biggest',
  'reason',
  'i',
  'am',
  'sad',
  'that',
  'community',
  'probably',
  'wo',
  'nt',
  'be',
  'coming',
  'back'],
 ['is', 'season', '4', 'unfairly', 'criticized', 'by', 'harmon'],
 ['the', 'chief', 'custodian', 'v', 'the', 'trues', 'repairman'],
 ['i',
  'know',
  'it',
  's',
  'probably',
  'been',
  'overdone',
  'and',
  'also',
  'quite',
  'late',
  'but',
  'i',
  'just',
  'found',
  'this',
  'sub',
  'and',
  'i',
  'have',
  'a',
  'discussion',
  'i',
  'want',
  'to',
  'have',
  'about',
  'the',
  'season',
  '4',
  'fin