# Dataset for embeddings
Let's scrape from 1000 to 1500 comments for each initial dataset class.

In [1]:
import kaggle
import os
import pandas as pd
import datetime as dt
import requests
import pandas as pd
import nltk
import lxml
import cchardet
import time
import numpy as np
import io
import regex as re
import importlib
import gensim
import modules.preprocess as preprocess
importlib.reload(preprocess)
import functools
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from gensim.models.fasttext import load_facebook_vectors
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import preprocessing
from scipy import spatial
from bs4 import BeautifulSoup
from tqdm import tqdm
from nltk import WordNetLemmatizer
from multiprocessing import Pool
random_seed = 42


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TOPAPEC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\TOPAPEC\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Download and unzip the Kaggle dataset into ./dataset.
# CLI equivalent: kaggle datasets download -d theshadow29/subreddit-classification
# makedirs(exist_ok=True) keeps the cell re-runnable without a try/except, and still
# fails loudly if "dataset" exists but is not a directory.
os.makedirs("dataset", exist_ok=True)
kaggle.api.authenticate()
kaggle.api.dataset_download_files("theshadow29/subreddit-classification", path="dataset", unzip=True)

In [None]:
# Load the fine-grained titles dataset downloaded above and show its (rows, columns) shape.
dataset = pd.read_csv('dataset/fine_grained_full.csv')
dataset.shape

In [None]:
# Preview the first rows of the loaded dataset.
dataset.head()

In [None]:
# Scraping configuration.
# NOTE(review): the datetimes below are naive, so .timestamp() interprets them in the
# machine's local timezone; start_date / end_date / limit_per_subreddit are not read by
# any cell visible in this notebook.
start_date = int(dt.datetime(2017, 1, 1, 0, 0).timestamp())
end_date = int(dt.datetime(2019, 1, 1, 0, 0).timestamp())
limit_per_subreddit = 1500
pages_per_sub = 40  # number of listing pages pull_page walks per subreddit
subreddits = dataset["label"].unique()
# Idempotent: safe to re-run the cell when the output directory already exists.
os.makedirs("reddit_comments", exist_ok=True)
subreddits.shape

In [None]:
def pull_page(subreddit, max_pages=None):
    """Scrape post titles from a subreddit's old.reddit.com listing pages.

    Follows the "next" link from page to page, collecting the text of every
    ``<p class="title">`` found inside ``<div class="thing">`` elements.

    Args:
        subreddit: Subreddit name (without the ``r/`` prefix).
        max_pages: Maximum number of listing pages to fetch. Defaults to the
            notebook-level ``pages_per_sub`` setting.

    Returns:
        A ``(subreddit, titles)`` tuple, where ``titles`` is a list of strings.
        Scraping is best-effort: on a network error or a missing "next" link
        the titles collected so far are returned.
    """
    if max_pages is None:
        max_pages = pages_per_sub
    titles = []
    url = f"https://old.reddit.com/r/{subreddit}/"
    headers = {'User-Agent': 'Mozilla/5.0'}
    # The context manager closes the session's connections when scraping ends.
    with requests.Session() as requests_session:
        for i in range(max_pages):
            # Pause every fifth page to go easy on the server.
            if i % 5 == 4:
                time.sleep(2)
            try:
                # Without a timeout a stalled connection would hang the worker forever.
                page = requests_session.get(url, headers=headers, timeout=30)
            except requests.RequestException as exc:
                print(f"{subreddit} page {i}: request failed ({exc})")
                break
            soup = BeautifulSoup(page.text, 'lxml')
            for post in soup.find_all('div', attrs={'class': 'thing'}):
                title = post.find('p', class_="title")
                # Skip entries without a title element instead of crashing the whole run.
                if title is not None:
                    titles.append(title.text)
            next_button = soup.find("span", class_="next-button")
            next_link = next_button.find("a") if next_button is not None else None
            if next_link is None or 'href' not in next_link.attrs:
                # No further page: report where the listing ended and stop.
                print(f"{subreddit} page {i}")
                break
            url = next_link.attrs['href']

    return (subreddit, titles)

In [None]:

# Scrape every subreddit in batches and write one CSV of titles per subreddit.
# The batch bounds come from len(subreddits) rather than hard-coded totals, and a single
# worker pool is reused for all batches instead of being recreated each time.
batch_size = 40
start = dt.datetime.now()
with Pool(12) as pool:
    for chunk in tqdm(range(0, len(subreddits), batch_size)):
        # Slicing clamps at the end of the array, so the last batch may be shorter.
        for subreddit, result in pool.map(pull_page, subreddits[chunk:chunk + batch_size]):
            df = pd.DataFrame(result)
            df.to_csv(f"reddit_comments{os.path.sep}{subreddit}.csv", index=False)
        # Short pause between batches.
        time.sleep(2)
passed = dt.datetime.now() - start
print(passed)

In [None]:
# Smoke test: scrape the first subreddit in the current process and display the result.
pull_page(subreddits[0])

I failed to parse subreddits: too few titles were available and the data was too noisy. I managed to parse about 90 MiB of data, but that wasn't nearly enough to train decent embeddings. My attempts to use the Pushshift API also failed because of a problem on their side (the API returned only a tiny fraction of the results for each query).

Luckily, I found pretrained GloVe word embeddings trained on more than 250 GiB of Reddit data, so I want to test whether they are good enough.
https://www.kaggle.com/leighplt/glove-reddit-comments

In [10]:
def get_glove_reddit_embeddings():
    """Load the GloVe Reddit vectors from ``GloVe.Reddit.120B.300D.txt`` into a dict.

    The first line of the file is skipped; every remaining line is parsed by
    ``preprocess.fetch_embeddings_value`` in a pool of worker processes.

    Returns:
        dict mapping each word to the vector parsed for it.
    """
    n_words = 1623397  # expected number of words; only used for the progress bars
    with io.open("GloVe.Reddit.120B.300D.txt", "r", encoding='utf-8') as file:
        file.readline()
        raw_lines = [line for line in tqdm(file, total=n_words)]
    with Pool(processes=14) as pool:
        parsed = list(tqdm(pool.imap(preprocess.fetch_embeddings_value, raw_lines, chunksize=200000), total=n_words))
    return {word: vector for word, vector in tqdm(parsed)}

def get_word2vec_embeddings():
    """Load the binary word2vec-format vectors stored in the working directory."""
    vectors_path = './GoogleNews-vectors-negative300.bin'
    keyed_vectors = gensim.models.KeyedVectors.load_word2vec_format(vectors_path, binary=True)
    return keyed_vectors

def get_fasttext_embeddings():
    """Load the vectors from the Facebook fastText binary ``cc.en.300.bin``."""
    model_path = 'cc.en.300.bin'
    fasttext_vectors = load_facebook_vectors(model_path)
    return fasttext_vectors

In [None]:
# NOTE(review): `number_part` is not assigned in any cell visible in this notebook, so this
# cell raises NameError on a fresh kernel — TODO define it or remove the cell.
number_part.shape

In [18]:
# https://www.kaggle.com/leighplt/glove-reddit-comments

def test_embeddings_wordsim(embeddings):
    av_abs_dev = 0.0
    file_len = sum(1 for line in open('wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt'))
    with open("wordsim353_sim_rel/wordsim_relatedness_goldstandard.txt") as file:
        for line in file:
            values = line.split()
            print(values[0], values[1], values[2])
            simil = float(values[2]) / 10.0
            cos_sim = 1 - spatial.distance.cosine(embeddings[values[0]], embeddings[values[1]])
            print(f"Accordings to embeddings {cos_sim}")
            av_abs_dev += abs(cos_sim - simil)
    print(f"Final average abs deviation: {av_abs_dev / float(file_len)}")

In [None]:
# Score the loaded embeddings against the WordSim relatedness gold standard.
# NOTE(review): the first visible assignment to `embeddings` is in a later cell
# (get_glove_reddit_embeddings()); that cell must be run first on a fresh kernel.
test_embeddings_wordsim(embeddings)

In [None]:
# NOTE(review): the only visible assignments to `model` are the LogisticRegression
# classifiers in later cells; presumably a word-vector model was bound to this name
# when the cell was run — TODO confirm and load it explicitly.
test_embeddings_wordsim(model)

As you can see, the WordSim-353 test is not very representative of embedding quality. Now let's use classic ML classifiers on our dataset to determine which vectors work best.

In [6]:

# Load both title datasets from the downloaded Kaggle archive and print their shapes.
dataset1 = pd.read_csv('dataset/fine_grained_full.csv')
dataset2 = pd.read_csv('dataset/cleaned_all_title_data_controversial.csv')
print(dataset1.shape, dataset2.shape)

(429300, 2) (1000156, 2)


In [56]:
# Preview the label / text columns of the fine-grained dataset.
dataset1.head()

Unnamed: 0,label,text
0,studyroomf,Do you subscribe to the theory that all the ev...
1,studyroomf,"A pivotal moment for the dean: ""We love you too"""
2,studyroomf,Episode Discussion - S04E05 - Cooperative Esca...
3,studyroomf,"Dan Harmon says ""There's a character from seas..."
4,studyroomf,'Can we take a sidebar from this sidebar?' Sug...


In [2]:
# Number of logical CPUs reported by the OS; the worker counts used below stay under it.
os.cpu_count()

16

In [3]:

def preprocess_pipeline(dataset):
    """Tokenize, then lemmatize, the "text" column of `dataset` in place using 12 workers."""
    n_workers = 12
    for stage in (multicore_tok, multicore_lem):
        stage(dataset, n_workers)

def multicore_tok(dataset, cores=6):
    """Replace the "text" column with nltk.word_tokenize output, computed in `cores` processes."""
    with Pool(processes=cores) as workers:
        tokenized = workers.map(nltk.word_tokenize, dataset.loc[:, "text"])
    dataset.loc[:, "text"] = tokenized

def multicore_lem(dataset, cores=6):
    """Lemmatize every token in the "text" column in place.

    All tokens are flattened into one list and lemmatized in a single chunked
    pool call, then regrouped into per-row lists. This avoids one inter-process
    round trip per row and does not rely on the frame having a 0..n-1 index.

    Args:
        dataset: DataFrame whose "text" column holds one token sequence per row.
        cores: Number of worker processes.
    """
    wnl = WordNetLemmatizer()
    token_rows = dataset["text"].tolist()
    flat_tokens = [token for tokens in token_rows for token in tokens]
    # Large chunks keep per-item IPC overhead negligible; never pass a chunksize of 0.
    chunksize = max(1, len(flat_tokens) // (cores * 4))
    with Pool(processes=cores) as pool:
        flat_lemmas = list(tqdm(pool.imap(wnl.lemmatize, flat_tokens, chunksize=chunksize),
                                total=len(flat_tokens)))
    # Cut the flat result back into rows using the original row lengths.
    lemmatized_rows = []
    offset = 0
    for tokens in token_rows:
        end = offset + len(tokens)
        lemmatized_rows.append(flat_lemmas[offset:end])
        offset = end
    # Wrapping in a Series with the frame's own index stores each row's list as one cell.
    dataset["text"] = pd.Series(lemmatized_rows, index=dataset.index, dtype=object)

In [98]:
# Tokenize + lemmatize the fine-grained dataset in place, preview it, and cache it on disk.
preprocess_pipeline(dataset1)
print(dataset1[:4])
# to_csv does not create missing directories, so make sure the cache folder exists.
os.makedirs("preprocessed_serialised", exist_ok=True)
dataset1.to_csv("preprocessed_serialised/dataset_fine_grained_full.csv", index=False)

HBox(children=(HTML(value=''), FloatProgress(value=1.0, bar_style='info', layout=Layout(width='20px'), max=1.0…


        label                                               text
0  studyroomf  [Do, you, subscribe, to, the, theory, that, al...
1  studyroomf  [A, pivotal, moment, for, the, dean, :, ``, We...
2  studyroomf  [Episode, Discussion, -, S04E05, -, Cooperativ...
3  studyroomf  [Dan, Harmon, say, ``, There, 's, a, character...


In [None]:
# Tokenize + lemmatize the controversial-titles dataset in place, preview it, and cache it on disk.
preprocess_pipeline(dataset2)
print(dataset2[:4])
# to_csv does not create missing directories, so make sure the cache folder exists.
os.makedirs("preprocessed_serialised", exist_ok=True)
dataset2.to_csv("preprocessed_serialised/dataset_cleaned_all_title_data_controversial.csv", index=False)

In [3]:
# Reload the cached, lemmatized controversial-titles CSV through the project helper.
# NOTE(review): `dateset_controversial` looks like a typo of `dataset_controversial`; the
# name is used consistently in the cells below, so rename it everywhere at once or not at all.
dateset_controversial = preprocess.parse_lemmatized("preprocessed_serialised/dataset_cleaned_all_title_data_controversial.csv")

  return array(a, dtype, copy=False, order=order)


In [4]:
# Optional multiprocessing debug logging, left disabled:
# import multiprocessing, logging
# logger = multiprocessing.log_to_stderr()
# logger.setLevel(logging.DEBUG)
# Overwrite the text column with the output of the project helper preprocess.clean_further
# (its exact cleaning rules are not visible in this notebook).
dateset_controversial.loc[:, "text"] = preprocess.clean_further(dateset_controversial)

100%|████████████████████████████████████████████████████████████████████| 1000156/1000156 [00:08<00:00, 121618.73it/s]


In [5]:
# Widen the DataFrame repr so long titles can be inspected.
# Full option keys are used because abbreviated patterns such as 'max_rows' are matched
# as regexes and raise OptionError once they match more than one registered option
# (e.g. display.max_rows and styler.render.max_rows on newer pandas).
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_colwidth', 300)
pd.describe_option('display.max_colwidth')

# dateset_controversial.sample(frac=0.001)

display.max_colwidth : int or None
    The maximum width in characters of a column in the repr of
    a pandas data structure. When the column overflows, a "..."
    placeholder is embedded in the output. A 'None' value means unlimited.
    [default: 50] [currently: 300]


Let's apply class-based TF-IDF (one TF-IDF document per label, built from all of that label's titles) together with word embeddings.

In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer
def get_features(dataset, cores=12):
    """Build a class-level TF-IDF matrix: one document per label.

    All token lists belonging to a label are concatenated, turned into a single
    string by ``preprocess.unite_string`` in worker processes, and vectorized
    with ``TfidfVectorizer`` (English stop words removed).

    Args:
        dataset: DataFrame with a "label" column and a "text" column holding
            one token sequence per row.
        cores: Number of worker processes used to build the documents.

    Returns:
        ``(tfidf_matrix, feature_names)``: the sparse TF-IDF matrix with one
        row per label, in ``dataset["label"].unique()`` order, and a dict
        mapping column index to term.
    """
    labels = dataset["label"].unique()
    # One grouping pass instead of a full boolean-mask scan of the frame per label.
    titles_by_label = {label: titles for label, titles in dataset.groupby("label", sort=False)["text"]}
    tokens_per_label = [
        [word for title in titles_by_label.get(label, ()) for word in title]
        for label in tqdm(labels)
    ]
    # imap rejects a chunksize below 1, which `n // 100` produces for fewer than 100 labels.
    chunksize = max(1, len(tokens_per_label) // 100)
    with Pool(processes=cores) as pool:
        documents = list(tqdm(pool.imap(preprocess.unite_string, tokens_per_label, chunksize=chunksize),
                              total=len(tokens_per_label)))
    tfidfvectorizer = TfidfVectorizer(analyzer='word', stop_words='english')
    tfidf_matrix = tfidfvectorizer.fit_transform(documents)
    # vocabulary_ maps term -> column index; invert it so columns can be named.
    feature_names = {index: term for term, index in tfidfvectorizer.vocabulary_.items()}
    return tfidf_matrix, feature_names

def features_to_embeddings(dateset_controversial, ind, embeddings):
    """Unfinished draft of a TF-IDF-weighted embedding sum.

    NOTE(review): this function cannot run as written. `x` is not a parameter
    or local and is not defined in any visible cell, `dateset_controversial`
    and `ind` are never read, `count` is never used, and nothing is returned.
    No visible cell calls it — TODO finish or remove.
    """
    count = 0
    # Accumulator with a hard-coded dimensionality of 300.
    result = np.zeros(300)
    for word in x:
        if word in embeddings:
            count += 1
            result += embeddings[word] * x
            
# def tfidf_features_to_embeddings_space(X, embeddings):
    

In [7]:
# matrix: sparse TF-IDF matrix with one row per label; names: column index -> term.
matrix, names = get_features(dateset_controversial)

100%|██████████████████████████████████████████████████████████████████████████████| 1466/1466 [00:47<00:00, 30.70it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 1466/1466 [00:03<00:00, 434.58it/s]


In [8]:
# Turn the sparse TF-IDF matrix into a nested dict: label -> {term: tf-idf weight}.
# Row i of the matrix corresponds to labels[i]; column j to names[j].
cx = matrix.tocoo()
labels = dateset_controversial.loc[:, "label"].unique()
tfidf_dict = {}
for i,j,v in tqdm(zip(cx.row, cx.col, cx.data)):
    # setdefault creates the per-label dict the first time a label is seen.
    tfidf_dict.setdefault(labels[i], {})[names[j]] = v

2267670it [00:02, 918802.67it/s]


In [17]:
# Load the GloVe Reddit vectors into a word -> vector dict (three progress bars: read, parse, build).
embeddings = get_glove_reddit_embeddings()

100%|████████████████████████████████████████████████████████████████████| 1623397/1623397 [00:05<00:00, 296258.46it/s]
100%|█████████████████████████████████████████████████████████████████████| 1623397/1623397 [00:23<00:00, 68554.75it/s]
100%|███████████████████████████████████████████████████████████████████| 1623397/1623397 [00:00<00:00, 2124390.68it/s]


In [9]:

# Map every dataset row to a feature vector via the project helper preprocess.row_to_embedding,
# using 14 worker processes.
# NOTE(review): the recorded output is a NameError because this cell was executed (In [9])
# before the cell that assigns `embeddings` (In [17]); run the embeddings cell first.
with Pool(processes=14) as pool:
    X = list(tqdm(pool.imap(functools.partial(preprocess.row_to_embedding, embeddings=embeddings, embeddings_dim=300, tfidf_dict=tfidf_dict), dateset_controversial.iterrows(), chunksize=200000), total=dateset_controversial.shape[0]))

NameError: name 'embeddings' is not defined

In [None]:
# Encode the string labels as integers for the GloVe run.
y = dateset_controversial.loc[:, "label"]
le = preprocessing.LabelEncoder()
y = le.fit_transform(y)
# The training cell below reads X_train / X_test / y_train / y_test, but no visible cell
# created them for this run. NOTE(review): this split is assumed to mirror the one used
# for the word2vec and fastText runs (test_size=0.8, same seed) — confirm.
X_train, X_test, y_train, y_test = train_test_split(np.asarray(X), y, test_size=0.8, random_state=random_seed)

In [17]:
# Fit a logistic-regression classifier on the training split, then predict the held-out split.
print("Starting training")
model = LogisticRegression(
    max_iter=50,
    n_jobs=-1,
    random_state=random_seed,
    verbose=True,
).fit(X_train, y_train)
y_pred = model.predict(X_test)

Starting training


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 12.6min finished


In [18]:
# Printed in order: accuracy, then macro-averaged F1, precision and recall.
print(accuracy_score(y_test, y_pred))
for macro_metric in (f1_score, precision_score, recall_score):
    print(macro_metric(y_test, y_pred, average="macro"))

0.32019996875488205
0.3372055347877084


  _warn_prf(average, modifier, msg_start, len(result))


0.5270011604440862
0.29506955168567023


In [27]:
def transform_to_tfidf_prod_embeddings(dataset, embeddings, embeddings_dim, tfidf_dict):
    """Convert every row of `dataset` to a feature vector using 12 worker processes.

    Each ``(index, row)`` pair from ``dataset.iterrows()`` is passed to
    ``preprocess.row_to_embedding`` together with the given embeddings,
    dimensionality and TF-IDF dictionary.

    Returns:
        The per-row results stacked with ``np.asarray``.
    """
    embed_row = functools.partial(
        preprocess.row_to_embedding,
        embeddings=embeddings,
        embeddings_dim=embeddings_dim,
        tfidf_dict=tfidf_dict,
    )
    with Pool(processes=12) as pool:
        row_vectors = list(tqdm(pool.imap(embed_row, dataset.iterrows(), chunksize=20000), total=dataset.shape[0]))
    return np.asarray(row_vectors)


def transform_to_tfidf_prod_embeddings_linear(dataset, embeddings, embeddings_dim, tfidf_dict):
    """Single-process variant of ``transform_to_tfidf_prod_embeddings``.

    Each ``(index, row)`` pair from ``dataset.iterrows()`` is passed to
    ``preprocess.row_to_embedding`` and the result is written into row ``i``
    of the output.

    Returns:
        A float64 array of shape ``(len(dataset), embeddings_dim)``.
        Assumes row_to_embedding returns a numeric vector of length
        `embeddings_dim` — TODO confirm against modules/preprocess.
    """
    # Build the partial once; it does not change between rows.
    embed_row = functools.partial(preprocess.row_to_embedding, embeddings=embeddings,
                                  embeddings_dim=embeddings_dim, tfidf_dict=tfidf_dict)
    # A float dtype gives a compact numeric matrix; dtype=np.ndarray would silently
    # create an object array holding one boxed Python float per cell.
    X = np.empty((dataset.shape[0], embeddings_dim), dtype=np.float64)
    for i, line in tqdm(enumerate(dataset.iterrows()), total=dataset.shape[0]):
        X[i] = embed_row(line)
    return X

In [10]:
# word2vec run: load the vectors, build per-row features, encode labels, split.
embeddings = get_word2vec_embeddings()
X = transform_to_tfidf_prod_embeddings(dateset_controversial, embeddings, 300, tfidf_dict)
le = preprocessing.LabelEncoder()
y = le.fit_transform(dateset_controversial.loc[:, "label"])
# NOTE(review): test_size=0.8 trains on 20% of the rows and evaluates on 80% — confirm
# this is intentional rather than a swapped train/test fraction.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=random_seed
)

In [11]:
# Fit a logistic-regression classifier on the word2vec features, then predict the held-out split.
print("Starting training")
model = LogisticRegression(
    max_iter=50,
    n_jobs=-1,
    random_state=random_seed,
    verbose=True,
).fit(X_train, y_train)
y_pred = model.predict(X_test)

100%|██████████████████████████████████████████████████████████████████████| 1000156/1000156 [04:16<00:00, 3898.99it/s]


Starting training


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 13.0min finished


In [12]:
# Printed in order: accuracy, then macro-averaged F1, precision and recall (word2vec run).
print(accuracy_score(y_test, y_pred))
for macro_metric in (f1_score, precision_score, recall_score):
    print(macro_metric(y_test, y_pred, average="macro"))

0.207412591782534
0.20976933841507145


  _warn_prf(average, modifier, msg_start, len(result))


0.38519818557904995
0.18745536394885665


In [28]:
# fastText run: load the vectors, build per-row features in-process, encode labels, split.
embeddings = get_fasttext_embeddings()
X = transform_to_tfidf_prod_embeddings_linear(dateset_controversial, embeddings, 300, tfidf_dict)
le = preprocessing.LabelEncoder()
y = le.fit_transform(dateset_controversial.loc[:, "label"])
# NOTE(review): test_size=0.8 trains on 20% of the rows and evaluates on 80% — confirm
# this is intentional rather than a swapped train/test fraction.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.8, random_state=random_seed
)

100%|██████████████████████████████████████████████████████████████████████| 1000156/1000156 [01:40<00:00, 9959.81it/s]


In [29]:
# Fit a logistic-regression classifier on the fastText features, then predict the held-out split.
print("Starting training")
model = LogisticRegression(
    max_iter=50,
    n_jobs=-1,
    random_state=random_seed,
    verbose=True,
).fit(X_train, y_train)
y_pred = model.predict(X_test)

Starting training


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 16 concurrent workers.
[Parallel(n_jobs=-1)]: Done   1 out of   1 | elapsed: 12.9min finished


In [30]:
# Printed in order: accuracy, then macro-averaged F1, precision and recall (fastText run).
print(accuracy_score(y_test, y_pred))
for macro_metric in (f1_score, precision_score, recall_score):
    print(macro_metric(y_test, y_pred, average="macro"))

0.15277487892516795
0.1439512317219167


  _warn_prf(average, modifier, msg_start, len(result))


0.27191432800152043
0.1338477866119142


GloVe Reddit vectors turned out to be the best option (accuracy ≈ 0.32, versus ≈ 0.21 for word2vec and ≈ 0.15 for fastText).