## Part A: Subreddit Prediction ##

In [61]:
subreddit_train = "coursework_subreddit_train.json"
subreddit_validation = "coursework_subreddit_validation.json"
subreddit_test = "coursework_subreddit_test.json"

!gsutil cp gs://textasdata/coursework/coursework_subreddit_train2020.json $subreddit_train 
!gsutil cp gs://textasdata/coursework/coursework_subreddit_validation2020.json $subreddit_validation 
!gsutil cp gs://textasdata/coursework/coursework_subreddit_test.json  $subreddit_test

Copying gs://textasdata/coursework/coursework_subreddit_train2020.json...
/ [1 files][  8.0 MiB/  8.0 MiB]                                                
Operation completed over 1 objects/8.0 MiB.                                      
Copying gs://textasdata/coursework/coursework_subreddit_validation2020.json...
/ [1 files][  2.1 MiB/  2.1 MiB]                                                
Operation completed over 1 objects/2.1 MiB.                                      
Copying gs://textasdata/coursework/coursework_subreddit_test.json...
/ [1 files][  2.7 MiB/  2.7 MiB]                                                
Operation completed over 1 objects/2.7 MiB.                                      


In [62]:
import pandas as pd

train_threads = pd.read_json(path_or_buf=subreddit_train, lines=True)
print(list(train_threads.columns.values))
print(train_threads.head())
print(train_threads.size)

['is_self_post', 'posts', 'subreddit', 'title', 'url']
   is_self_post  ...                                                url
0           1.0  ...  https://www.reddit.com/r/relationships/comment...
1           1.0  ...  https://www.reddit.com/r/AskReddit/comments/22...
2           1.0  ...  https://www.reddit.com/r/trees/comments/46d0iu...
3           1.0  ...  https://www.reddit.com/r/AskReddit/comments/19...
4           1.0  ...  https://www.reddit.com/r/explainlikeimfive/com...

[5 rows x 5 columns]
5820


In [63]:
validation_threads = pd.read_json(path_or_buf=subreddit_validation, lines=True)
print(list(validation_threads.columns.values))
print(validation_threads.head())
print(validation_threads.size)

['is_self_post', 'posts', 'subreddit', 'title', 'url']
   is_self_post  ...                                                url
0           1.0  ...  https://www.reddit.com/r/hearthstone/comments/...
1           1.0  ...  https://www.reddit.com/r/explainlikeimfive/com...
2           1.0  ...  https://www.reddit.com/r/AskReddit/comments/tl...
3           1.0  ...  https://www.reddit.com/r/AskReddit/comments/22...
4           1.0  ...  https://www.reddit.com/r/reddit.com/comments/f...

[5 rows x 5 columns]
1460


In [64]:
test_threads = pd.read_json(path_or_buf=subreddit_test, lines=True)
print(test_threads.head())
print(test_threads.size)

   is_self_post  ...                                                url
0           1.0  ...  https://www.reddit.com/r/starcraft/comments/mq...
1           1.0  ...  https://www.reddit.com/r/whowouldwin/comments/...
2           1.0  ...  https://www.reddit.com/r/AskReddit/comments/27...
3           1.0  ...  https://www.reddit.com/r/AskReddit/comments/x9...
4           1.0  ...  https://www.reddit.com/r/tipofmytongue/comment...

[5 rows x 5 columns]
1825


In [65]:
subreddit_counts = train_threads['subreddit'].value_counts()
print(subreddit_counts.describe())
top_subbreddits = subreddit_counts.nlargest(20)
top_subbreddits_list = top_subbreddits.index.tolist()
print(top_subbreddits)

count     20.000000
mean      58.200000
std       60.248258
min       20.000000
25%       28.750000
50%       38.500000
75%       49.500000
max      276.000000
Name: subreddit, dtype: float64
askreddit               276
leagueoflegends         157
buildapc                103
explainlikeimfive        60
gaming                   51
trees                    49
techsupport              48
pcmasterrace             47
electronic_cigarette     46
relationships            42
tipofmytongue            35
summonerschool           33
jailbreak                31
hearthstone              30
whowouldwin              29
atheism                  28
reddit.com               27
personalfinance          27
movies                   25
starcraft                20
Name: subreddit, dtype: int64


In [0]:
train_labels = train_threads['subreddit']
validation_labels = validation_threads['subreddit']
test_labels = test_threads['subreddit']

**Evaluation Summary**

In [0]:
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


def eval_summary(predictions, labels, avg='macro'):
    precision = precision_score(predictions, labels, average=avg)
    recall = recall_score(predictions, labels, average=avg)
    f1 = fbeta_score(predictions, labels, 1, average=avg)
    accuracy = accuracy_score(predictions, labels)
    print("Classifier  has Acc=%0.3f P=%0.3f R=%0.3f F1=%0.3f" % (accuracy,precision,recall,f1))
    print(classification_report(predictions, labels, digits=3))
    print('\nConfusion matrix:\n',confusion_matrix(labels, predictions))

# Spacy 2.0

In [68]:
import spacy

# Load the medium english model. 
# We will use this model to get embedding features for tokens later.
#!python -m spacy download en_core_web_md

nlp = spacy.load('en_core_web_sm', disable=['ner'])
nlp.remove_pipe('tagger')
nlp.remove_pipe('parser')

# Download a stopword list
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [0]:
#@Tokenize
def spacy_tokenize(string):
  tokens = list()
  doc = nlp(string)
  for token in doc:
    tokens.append(token)
  return tokens

#@Normalize
def normalize(tokens):
  normalized_tokens = list()
  for token in tokens:
    normalized = token.text.lower().strip()
    if ((token.is_alpha or token.is_digit)):
      normalized_tokens.append(normalized)
  return normalized_tokens
  return normalized_tokens

#@Tokenize and normalize
def tokenize_normalize(string):
  return normalize(spacy_tokenize(string))

#  Q1 Whole Thread Selection

In [0]:
def field_thread(thread):
    field_thread = [thread['title']]
    for post in thread['posts']:
        if "body" in post:
            a = post['body'].strip()
            if a != '':
                field_thread.append(a)
        if "author" in post:
            b = post['author'].strip()
            if b != '':
                field_thread.append(b)       
    return " ".join(field_thread)
  
train_threads['full_thread'] = train_threads.apply(field_thread, axis=1)
test_threads['full_thread'] = test_threads.apply(field_thread, axis=1)
validation_threads['full_thread'] = validation_threads.apply(field_thread, axis=1)
full_thread = pd.concat([train_threads['full_thread'], test_threads['full_thread'],validation_threads['full_thread']])

# Count Vectorizer & TFIDF Vectorizer

In [0]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
count_vectorizer = CountVectorizer(tokenizer=tokenize_normalize)

tfidf_vectorizer = TfidfVectorizer(tokenizer=tokenize_normalize)
train_tfidf_matrix = tfidf_vectorizer.fit_transform(train_threads.full_thread.values)
train_cv_matrix = count_vectorizer.fit_transform(train_threads.full_thread.values)

test_tfidf_matrix = tfidf_vectorizer.transform(test_threads.full_thread.values)
test_cv_matrix = count_vectorizer.transform(test_threads.full_thread.values)

# Dummy Classifier with strategy="most_frequent"

In [72]:
from sklearn.dummy import DummyClassifier
most_frequent_cv = DummyClassifier(strategy="most_frequent")
most_frequent_cv.fit(train_cv_matrix, train_labels)
Most_Frequent= ['Dummy Most_Frequent', 'Count Vectorizer']
prediction = most_frequent_cv.predict(test_cv_matrix)
summary= eval_summary(prediction, test_labels, avg='macro')


Classifier  has Acc=0.230 P=0.050 R=0.012 F1=0.019
                      precision    recall  f1-score   support

           askreddit      1.000     0.230     0.374       365
             atheism      0.000     0.000     0.000         0
            buildapc      0.000     0.000     0.000         0
electronic_cigarette      0.000     0.000     0.000         0
   explainlikeimfive      0.000     0.000     0.000         0
              gaming      0.000     0.000     0.000         0
         hearthstone      0.000     0.000     0.000         0
           jailbreak      0.000     0.000     0.000         0
     leagueoflegends      0.000     0.000     0.000         0
              movies      0.000     0.000     0.000         0
        pcmasterrace      0.000     0.000     0.000         0
     personalfinance      0.000     0.000     0.000         0
          reddit.com      0.000     0.000     0.000         0
       relationships      0.000     0.000     0.000         0
           starcra

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [73]:
most_frequent_tf = DummyClassifier(strategy="most_frequent")
most_frequent_tf.fit(train_tfidf_matrix, train_labels)
Most_Frequent1= ['Dummy Most_Frequent', 'Tf-idf Vectorizer']
prediction = most_frequent_tf.predict(test_tfidf_matrix)
summary1= eval_summary(prediction, test_labels, avg='macro')

Classifier  has Acc=0.230 P=0.050 R=0.012 F1=0.019
                      precision    recall  f1-score   support

           askreddit      1.000     0.230     0.374       365
             atheism      0.000     0.000     0.000         0
            buildapc      0.000     0.000     0.000         0
electronic_cigarette      0.000     0.000     0.000         0
   explainlikeimfive      0.000     0.000     0.000         0
              gaming      0.000     0.000     0.000         0
         hearthstone      0.000     0.000     0.000         0
           jailbreak      0.000     0.000     0.000         0
     leagueoflegends      0.000     0.000     0.000         0
              movies      0.000     0.000     0.000         0
        pcmasterrace      0.000     0.000     0.000         0
     personalfinance      0.000     0.000     0.000         0
          reddit.com      0.000     0.000     0.000         0
       relationships      0.000     0.000     0.000         0
           starcra

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Dummy Classifier with strategy="stratified"

In [74]:
stratified_cv = DummyClassifier(strategy="stratified")
stratified_cv.fit(train_cv_matrix, train_labels)
DS = ['Dummy Stratified', 'Count Vectorizer']
prediction = stratified_cv.predict(test_cv_matrix)
summary2 = eval_summary(prediction, test_labels, avg='macro')

stratified_tf = DummyClassifier(strategy="stratified")
stratified_tf.fit(train_tfidf_matrix, train_labels)
DS1= ['Dummy Stratified', 'Tf-idf Vectorizer']
prediction = stratified_tf.predict(test_tfidf_matrix)
summary3 = eval_summary(prediction, test_labels, avg='macro')


Classifier  has Acc=0.082 P=0.033 R=0.033 F1=0.033
                      precision    recall  f1-score   support

           askreddit      0.226     0.207     0.216        92
             atheism      0.000     0.000     0.000        11
            buildapc      0.054     0.080     0.065        25
electronic_cigarette      0.000     0.000     0.000        12
   explainlikeimfive      0.000     0.000     0.000        20
              gaming      0.000     0.000     0.000        23
         hearthstone      0.000     0.000     0.000         8
           jailbreak      0.000     0.000     0.000        13
     leagueoflegends      0.146     0.135     0.140        52
              movies      0.000     0.000     0.000        10
        pcmasterrace      0.000     0.000     0.000         9
     personalfinance      0.000     0.000     0.000        10
          reddit.com      0.167     0.167     0.167         6
       relationships      0.000     0.000     0.000         7
           starcra

# LogisticRegression with One-hot vectorization

In [75]:
from sklearn.linear_model import LogisticRegression
lr_cv = LogisticRegression()
lr_cv.fit(train_cv_matrix, train_labels)
LR= ['LogisticRegression', 'Count Vectorizer']
prediction = lr_cv.predict(test_cv_matrix)
print(LR)
summary4 = eval_summary(prediction, test_labels, avg='macro')

['LogisticRegression', 'Count Vectorizer']
Classifier  has Acc=0.597 P=0.479 R=0.585 F1=0.504
                      precision    recall  f1-score   support

           askreddit      0.833     0.560     0.670       125
             atheism      0.417     0.556     0.476         9
            buildapc      0.730     0.730     0.730        37
electronic_cigarette      0.667     0.667     0.667         9
   explainlikeimfive      0.500     0.636     0.560        11
              gaming      0.471     0.381     0.421        21
         hearthstone      0.267     0.800     0.400         5
           jailbreak      0.545     1.000     0.706         6
     leagueoflegends      0.833     0.615     0.708        65
              movies      0.200     0.500     0.286         2
        pcmasterrace      0.174     0.444     0.250         9
     personalfinance      0.800     1.000     0.889         8
          reddit.com      0.000     0.000     0.000         2
       relationships      0.667     0

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# LogisticRegression with TF-IDF vectorization

In [76]:
lr_tfidf = LogisticRegression()
lr_tfidf.fit(train_tfidf_matrix, train_labels)
LR1 = ['LogisticRegression', 'Tf-idf Vectorizer']
prediction = lr_tfidf.predict(test_tfidf_matrix)
summary5= eval_summary(prediction, test_labels, avg='macro')

Classifier  has Acc=0.477 P=0.256 R=0.396 F1=0.268
                      precision    recall  f1-score   support

           askreddit      0.952     0.340     0.502       235
             atheism      0.000     0.000     0.000         0
            buildapc      0.730     0.794     0.761        34
electronic_cigarette      0.556     1.000     0.714         5
   explainlikeimfive      0.071     0.500     0.125         2
              gaming      0.000     0.000     0.000         2
         hearthstone      0.067     1.000     0.125         1
           jailbreak      0.273     1.000     0.429         3
     leagueoflegends      0.875     0.656     0.750        64
              movies      0.000     0.000     0.000         0
        pcmasterrace      0.000     0.000     0.000         0
     personalfinance      0.400     1.000     0.571         4
          reddit.com      0.000     0.000     0.000         0
       relationships      0.667     1.000     0.800         4
           starcra

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# SVC Classifier with One-hot vectorization

In [77]:
from sklearn.svm import SVC, LinearSVC, NuSVC
svc_cv = SVC()
svc_cv.fit(train_cv_matrix, train_labels)
SVC1= ['SVM Classifier', 'Count Vectorizer']
prediction = svc_cv.predict(test_cv_matrix)
summary6 = eval_summary(prediction, test_labels, avg='macro')

Classifier  has Acc=0.329 P=0.159 R=0.200 F1=0.140
                      precision    recall  f1-score   support

           askreddit      0.964     0.280     0.434       289
             atheism      0.000     0.000     0.000         0
            buildapc      0.486     0.857     0.621        21
electronic_cigarette      0.000     0.000     0.000         1
   explainlikeimfive      0.357     0.385     0.370        13
              gaming      0.000     0.000     0.000         0
         hearthstone      0.000     0.000     0.000         0
           jailbreak      0.000     0.000     0.000         0
     leagueoflegends      0.125     0.400     0.190        15
              movies      0.000     0.000     0.000         0
        pcmasterrace      0.000     0.000     0.000         0
     personalfinance      0.100     1.000     0.182         1
          reddit.com      0.000     0.000     0.000         0
       relationships      0.667     0.308     0.421        13
           starcra

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# An ‘interesting’ classifier model : Decision Tree

In [78]:
from sklearn.tree import DecisionTreeClassifier
dt_tfidf = DecisionTreeClassifier()
dt_tfidf.fit(train_tfidf_matrix, train_labels)
Tree= ['decision', 'Tf-idf Vectorizer']
prediction = dt_tfidf.predict(test_tfidf_matrix)
summary7= eval_summary(prediction, test_labels, avg='macro')

Classifier  has Acc=0.430 P=0.347 R=0.364 F1=0.347
                      precision    recall  f1-score   support

           askreddit      0.607     0.531     0.567        96
             atheism      0.083     0.077     0.080        13
            buildapc      0.568     0.700     0.627        30
electronic_cigarette      0.333     0.429     0.375         7
   explainlikeimfive      0.214     0.214     0.214        14
              gaming      0.353     0.286     0.316        21
         hearthstone      0.267     0.286     0.276        14
           jailbreak      0.455     0.556     0.500         9
     leagueoflegends      0.479     0.535     0.505        43
              movies      0.000     0.000     0.000         5
        pcmasterrace      0.043     0.056     0.049        18
     personalfinance      0.400     0.800     0.533         5
          reddit.com      0.000     0.000     0.000        13
       relationships      0.667     0.500     0.571         8
           starcra

# Q2 Parameter tuning

In [79]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
prediction_pipeline = Pipeline([
              ('tf', TfidfVectorizer(tokenizer=tokenize_normalize)),
              ('logreg', LogisticRegression())
              ])
prediction_pipeline.fit(train_threads.full_thread.values, train_labels)
    

Pipeline(memory=None,
         steps=[('tf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u...',
                                 tokenizer=<function tokenize_normalize at 0x7f14d34d5268>,
                                 use_idf=True, vocabulary=None)),
                ('logreg',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_i

In [80]:
from sklearn.model_selection import GridSearchCV
params = {
    'tf__sublinear_tf': [True],
    'tf__ngram_range': [(1,1)],
    'tf__max_features': [5000],
    'logreg__C': [100]
} 
grid_search = GridSearchCV(prediction_pipeline, param_grid=params, n_jobs=1, verbose=1, scoring='f1_macro', cv=2)
grid_search.fit(train_threads.full_thread.values, train_labels)
best_estimator = grid_search.best_estimator_
best_estimator_test_predict = best_estimator.predict(test_threads.full_thread.values)
best_estimator_validation_predict = best_estimator.predict(validation_threads.full_thread.values)
summary8=eval_summary(best_estimator_test_predict, test_labels)

Fitting 2 folds for each of 1 candidates, totalling 2 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   11.6s finished
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preproc

Classifier  has Acc=0.701 P=0.587 R=0.776 F1=0.635
                      precision    recall  f1-score   support

           askreddit      0.917     0.579     0.710       133
             atheism      0.417     0.833     0.556         6
            buildapc      0.838     0.756     0.795        41
electronic_cigarette      0.778     1.000     0.875         7
   explainlikeimfive      0.571     0.727     0.640        11
              gaming      0.647     0.458     0.537        24
         hearthstone      0.600     1.000     0.750         9
           jailbreak      0.727     1.000     0.842         8
     leagueoflegends      0.875     0.750     0.808        56
              movies      0.200     0.500     0.286         2
        pcmasterrace      0.261     0.857     0.400         7
     personalfinance      0.800     1.000     0.889         8
          reddit.com      0.000     0.000     0.000         1
       relationships      1.000     0.750     0.857         8
           starcra

# For error findings compare with validation.predict

In [81]:
best_estimator_validation_predict = best_estimator.predict(validation_threads.full_thread.values)
summary9=eval_summary(best_estimator_validation_predict,validation_labels)


Classifier  has Acc=0.688 P=0.605 R=0.786 F1=0.638
                      precision    recall  f1-score   support

           askreddit      0.966     0.514     0.671       109
             atheism      0.444     1.000     0.615         4
            buildapc      0.821     0.767     0.793        30
electronic_cigarette      0.846     1.000     0.917        11
   explainlikeimfive      0.409     0.600     0.486        15
              gaming      0.636     0.700     0.667        10
         hearthstone      0.375     1.000     0.545         3
           jailbreak      0.714     0.833     0.769         6
     leagueoflegends      0.897     0.761     0.824        46
              movies      0.125     1.000     0.222         1
        pcmasterrace      0.333     0.714     0.455         7
     personalfinance      1.000     0.714     0.833         7
          reddit.com      0.000     0.000     0.000         0
       relationships      0.833     1.000     0.909         5
           starcra

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


# Q3 Adding Features

In [0]:
train_thread = list()
temp=list()
import json 
with open(subreddit_train) as jsonfile:
  for i, line in enumerate(jsonfile):
    thread = json.loads(line)
    id = list()
    author = list()
    body = list()
    in_reply_to=list()
    for post in thread['posts']:
      id.append(post.get('id', 0))
      author.append(post.get('author', ""))
      body.append(post.get('body', ""))
      in_reply_to.append(post.get('in_reply_to',""))
    temp.append((thread['title'],' '.join(id), ' '.join(author), ' '.join(body),' '.join(in_reply_to)))
    train_thread.extend(temp)
    temp.clear()
labels = ['title',  'id', 'author', 'body','in_reply_to']
trainDf= pd.DataFrame(train_thread, columns=labels)

In [0]:
test_thread = list()
temp=list()
with open(subreddit_test) as jsonfile:
  for i, line in enumerate(jsonfile):
    thread = json.loads(line)
    id = list()
    author = list()
    body = list()
    in_reply_to=list()
    for post in thread['posts']:
      id.append(post.get('id', 0))
      author.append(post.get('author', ""))
      body.append(post.get('body', ""))
      in_reply_to.append(post.get('in_reply_to',""))
    temp.append((thread['title'],' '.join(id), ' '.join(author), ' '.join(body),' '.join(in_reply_to)))
    test_thread.extend(temp)
    temp.clear()
labels = ['title', 'id', 'author', 'body','in_reply_to']
testDf= pd.DataFrame(test_thread, columns=labels)

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

In [0]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
prediction_pipeline = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('Tf_idf', TfidfVectorizer(tokenizer=tokenize_normalize,sublinear_tf=True,ngram_range=(1,1),max_features=5000)), 
              ])),
            ('id', Pipeline([
              ('selector', ItemSelector(key='id')),
              ('Tf_idf', TfidfVectorizer(tokenizer=tokenize_normalize,sublinear_tf=True,ngram_range=(1,1),max_features=5000)), 
              ])),
             ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('vectorizer', TfidfVectorizer(tokenizer=tokenize_normalize, sublinear_tf= True, max_features=5000, ngram_range=(1,1)))
              ])),
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('vectorizer', TfidfVectorizer(tokenizer=tokenize_normalize, sublinear_tf= True, max_features=5000, ngram_range=(1,1)))
              ])),
            ('in_reply_to', Pipeline([
              ('selector', ItemSelector(key='in_reply_to')),
              ('Tf_idf', TfidfVectorizer(tokenizer=tokenize_normalize,sublinear_tf=True,ngram_range=(1,1),max_features=5000)), 
              ])), 
        ])
        )
    ])

In [0]:
train_features= prediction_pipeline.fit_transform(trainDf)
test_features = prediction_pipeline.transform(testDf)

In [87]:
lr3 = LogisticRegression(C=100)
combined_model = lr3.fit(train_features,train_labels)
summary10=eval_summary(lr3.predict(test_features), test_labels) 
 

Classifier  has Acc=0.660 P=0.520 R=0.663 F1=0.557
                      precision    recall  f1-score   support

           askreddit      0.964     0.551     0.701       147
             atheism      0.500     1.000     0.667         6
            buildapc      0.811     0.811     0.811        37
electronic_cigarette      0.889     0.889     0.889         9
   explainlikeimfive      0.429     0.857     0.571         7
              gaming      0.294     0.455     0.357        11
         hearthstone      0.533     1.000     0.696         8
           jailbreak      0.636     1.000     0.778         7
     leagueoflegends      0.896     0.632     0.741        68
              movies      0.200     0.500     0.286         2
        pcmasterrace      0.174     0.444     0.250         9
     personalfinance      0.900     1.000     0.947         9
          reddit.com      0.000     0.000     0.000         2
       relationships      1.000     1.000     1.000         6
           starcra

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


## Part B: Discourse prediction ##

In [88]:
discourse_train = "coursework_discourse_train.json"
discourse_test = "coursework_discourse_test.json"
  
!gsutil cp gs://textasdata/coursework/coursework_discourse_train.json $discourse_train  
!gsutil cp gs://textasdata/coursework/coursework_discourse_test.json  $discourse_test

Copying gs://textasdata/coursework/coursework_discourse_train.json...
| [1 files][ 60.2 MiB/ 60.2 MiB]                                                
Operation completed over 1 objects/60.2 MiB.                                     
Copying gs://textasdata/coursework/coursework_discourse_test.json...
\ [1 files][ 15.1 MiB/ 15.1 MiB]                                                
Operation completed over 1 objects/15.1 MiB.                                     


In [0]:
# The reddit thread structure is nested with posts in a new content.
# This block reads the file as json and creates a new data frame.
import pandas as pd
import json

def load_posts(file):
  # A temporary variable to store the list of post content.
  posts_tmp = list()

  with open(file) as jsonfile:
    for i, line in enumerate(jsonfile):
     # if (i > 2): break
      thread = json.loads(line)
      for post in thread['posts']:
        # NOTE: This should be changed to use additional features from the post or thread.
        # DO NOT change the labels for the test set.
        posts_tmp.append((thread['subreddit'], thread['title'], thread['url'],
                        post['id'], post.get('author', ""), post.get('body', ""), post.get("majority_link", ""), 
                        post.get('post_depth', 0), post.get('majority_type', ""), # discourse type label 
                        post.get('in_reply_to', "") ))

# Create the posts data frame.  
  labels = ['subreddit', 'title', 'url', 'id', 'author', 'body', 'majority_link', 
          'post_depth', 'discourse_type', 'in_reply_to']
  return pd.DataFrame(posts_tmp, columns=labels)

In [90]:
original_train_posts = load_posts(discourse_train)
# Filter out empty labels
original_train_posts = original_train_posts[original_train_posts['discourse_type'] != ""]
print(original_train_posts.head())
print("Num posts: ", original_train_posts.size)
print(len(original_train_posts))

    subreddit                           title  ... discourse_type in_reply_to
0  worldofpvp  Help me decide my new PvP main  ...       question            
1  worldofpvp  Help me decide my new PvP main  ...         answer   t3_2v0anq
2  worldofpvp  Help me decide my new PvP main  ...         answer   t3_2v0anq
3  worldofpvp  Help me decide my new PvP main  ...         answer   t3_2v0anq
4  worldofpvp  Help me decide my new PvP main  ...         answer   t3_2v0anq

[5 rows x 10 columns]
Num posts:  792670
79267


#### Development / Validation data

For part B it is up to you to split the "original" training data into a new train/validation (development) dataset appropriately. See Lab 4 for an example. 

In [91]:

train_split = int(len(original_train_posts))
tmp_train = original_train_posts.iloc[:train_split,:]
#Split the train data into a train/validation split that's 80% train, 20% developemnt 
validation_split = int(train_split * 0.8)
train_posts = tmp_train.iloc[:validation_split,:]
validation_posts= tmp_train.iloc[validation_split:,:]
print(len(validation_posts))
print((train_posts))


15854
        subreddit                           title  ... discourse_type in_reply_to
0      worldofpvp  Help me decide my new PvP main  ...       question            
1      worldofpvp  Help me decide my new PvP main  ...         answer   t3_2v0anq
2      worldofpvp  Help me decide my new PvP main  ...         answer   t3_2v0anq
3      worldofpvp  Help me decide my new PvP main  ...         answer   t3_2v0anq
4      worldofpvp  Help me decide my new PvP main  ...         answer   t3_2v0anq
...           ...                             ...  ...            ...         ...
70778   starcraft             Mechanical Keyboard  ...         answer    t3_k0qoo
70779   starcraft             Mechanical Keyboard  ...       question  t1_c2gnfdx
70780   starcraft             Mechanical Keyboard  ...         answer  t1_c2goe63
70781   starcraft             Mechanical Keyboard  ...         answer    t3_k0qoo
70782   starcraft             Mechanical Keyboard  ...         answer    t3_k0qoo

[63413 ro

### Test data

In [92]:
test_posts = load_posts(discourse_test)
# Filter out empty labels
test_posts = test_posts[test_posts['discourse_type'] != ""]
print(test_posts)


         subreddit  ... in_reply_to
0      photography  ...            
1      photography  ...   t3_1ds5ds
2      photography  ...  t1_c9tbz9b
3      photography  ...  t1_c9tcqh8
4      photography  ...  t1_c9tbz9b
...            ...  ...         ...
22092  picrequests  ...   t3_163amt
22093  picrequests  ...  t1_c7sk2dc
22094  picrequests  ...   t3_163amt
22095  picrequests  ...  t1_c7supz7
22096  picrequests  ...   t3_163amt

[19812 rows x 10 columns]


### Labels

The label for the post we will be predicting is in the discourse_type column.

In [0]:
train_labels = train_posts['discourse_type']
validation_labels = validation_posts['discourse_type']
test_labels = test_posts['discourse_type']

Examine the distribution over labels on the training data.

In [94]:
discourse_counts = original_train_posts['discourse_type'].value_counts()
print(discourse_counts.describe())

top_discourse = discourse_counts.nlargest(20)
print(top_discourse)

count       10.000000
mean      7926.700000
std       9664.321866
min       1266.000000
25%       1671.500000
50%       3235.500000
75%      11919.750000
max      31419.000000
Name: discourse_type, dtype: float64
answer              31419
elaboration         14775
question            13610
appreciation         6849
agreement            3868
disagreement         2603
humor                1787
other                1633
announcement         1457
negativereaction     1266
Name: discourse_type, dtype: int64


# Q4 Text classification model for comment discourse prediction

In [0]:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    """For data grouped by feature, select subset of data at a provided key.    """

    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

In [0]:
from sklearn.pipeline import FeatureUnion
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
body_tf = TfidfVectorizer(tokenizer=tokenize_normalize, sublinear_tf= True)
body_tf.fit_transform(train_posts.body)
author_tf = TfidfVectorizer(tokenizer=tokenize_normalize, sublinear_tf= True) 
author_tf.fit_transform(train_posts.author)
title_tf = TfidfVectorizer(tokenizer=tokenize_normalize, sublinear_tf= True)
title_tf.fit_transform(train_posts.title)
prediction_pipeline = Pipeline([
        ('union', FeatureUnion(
          transformer_list=[
            ('body', Pipeline([
              ('selector', ItemSelector(key='body')),
              ('Tf_idf',body_tf), 
              ])),
            ('author', Pipeline([
              ('selector', ItemSelector(key='author')),
              ('Tf_idf', author_tf), 
              ])),
            ('title', Pipeline([
              ('selector', ItemSelector(key='title')),
              ('Tf_idf', title_tf), 
              ]))
            ])
        )
])

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
train1 = prediction_pipeline.fit_transform(train_posts)
test1 = prediction_pipeline.transform(test_posts)
validation1 = prediction_pipeline.transform(validation_posts)

# Logistic Regression with L2

In [98]:
lr4 = LogisticRegression(penalty='l2', C=10)
lr4.fit(train1, train_labels)
summary11=eval_summary(lr4.predict(test1), test_labels) 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Classifier  has Acc=0.475 P=0.262 R=0.334 F1=0.279
                  precision    recall  f1-score   support

       agreement      0.239     0.391     0.297       580
    announcement      0.104     0.198     0.136       192
          answer      0.693     0.529     0.600     10414
    appreciation      0.588     0.672     0.627      1505
    disagreement      0.045     0.168     0.071       173
     elaboration      0.236     0.287     0.259      2994
           humor      0.037     0.181     0.062        94
negativereaction      0.101     0.263     0.146       118
           other      0.085     0.184     0.116       174
        question      0.486     0.467     0.476      3568

        accuracy                          0.475     19812
       macro avg      0.262     0.334     0.279     19812
    weighted avg      0.542     0.475     0.501     19812


Confusion matrix:
 [[ 227    4  402   52   13  147    3    9    9   85]
 [   1   38  176   13    2   69    2    1    6   57]
 [ 136  

# Error Analysis with validation posts

In [99]:
summary12=eval_summary(lr4.predict(validation1), validation_labels) 

Classifier  has Acc=0.479 P=0.269 R=0.352 F1=0.287
                  precision    recall  f1-score   support

       agreement      0.269     0.419     0.328       503
    announcement      0.122     0.158     0.138       215
          answer      0.712     0.527     0.605      8454
    appreciation      0.578     0.688     0.628      1138
    disagreement      0.048     0.227     0.079       119
     elaboration      0.255     0.310     0.280      2438
           humor      0.051     0.274     0.086        73
negativereaction      0.092     0.213     0.128       108
           other      0.091     0.231     0.131       130
        question      0.471     0.472     0.472      2676

        accuracy                          0.479     15854
       macro avg      0.269     0.352     0.287     15854
    weighted avg      0.552     0.479     0.506     15854


Confusion matrix:
 [[ 211    5  330   30    6  107    0    8   16   70]
 [   5   34  115    9    2   58    3    2    7   43]
 [ 117  

# Dummy classifier

In [100]:
stratified_tf = DummyClassifier(strategy="stratified")
stratified_tf.fit(train1, train_labels)
DS3 = ['Dummy Stratified', 'Tf-idf Vectorizer']
prediction = stratified_tf.predict(test1)
summary32 = eval_summary(prediction, test_labels, avg='macro')

Classifier  has Acc=0.240 P=0.102 R=0.102 F1=0.102
                  precision    recall  f1-score   support

       agreement      0.049     0.048     0.049       971
    announcement      0.014     0.014     0.014       349
          answer      0.400     0.404     0.402      7857
    appreciation      0.097     0.095     0.096      1756
    disagreement      0.031     0.033     0.032       603
     elaboration      0.187     0.183     0.185      3702
           humor      0.018     0.019     0.018       427
negativereaction      0.020     0.019     0.019       317
           other      0.021     0.020     0.020       407
        question      0.187     0.187     0.187      3423

        accuracy                          0.240     19812
       macro avg      0.102     0.102     0.102     19812
    weighted avg      0.239     0.240     0.239     19812


Confusion matrix:
 [[  47   25  381   94   23  172   19   17   21  152]
 [  34    5  147   31    5   72    4   10    6   51]
 [ 378  

# Q5

In [0]:
def load_posts_rich(file):
  # A temporary variable to store the list of post content.
  posts_tmp = list()
  with open(file) as jsonfile:
    for i, line in enumerate(jsonfile):
      thread = json.loads(line)
      thread_author = None
      for post in thread['posts']:
        # NOTE: This could be changed to use additional features from the post or thread.
        # DO NOT change the labels for the test set.
        discourse_type = post.get('majority_type', '')
        post_depth = 0
        if 'is_first_post' in post and post['is_first_post']:
          thread_author = post.get('author', None)
        else:
          post_depth = post['post_depth']
        features = [
            thread["is_self_post"],          
            len(thread['posts']),            
            post_depth,                      
            post.get('author', None),        
            thread_author,                   
            thread['subreddit'],             
            thread['title'],                 
            post.get('body',''),            
            discourse_type,                  
        ]
        posts_tmp.append(features)
  # Create the posts data frame.  
  labels = ['is_self_post','thread_length','post_depth','post_author','thread_author','subreddit','thread_title','body','discourse_type']
  return pd.DataFrame(posts_tmp, columns=labels)

# Load Train and Test data

In [0]:
train_posts = load_posts_rich(discourse_train)
train_posts = train_posts[train_posts['discourse_type'] != ""]
test_posts = load_posts_rich(discourse_test)
test_posts = test_posts[test_posts['discourse_type'] != ""]
train_labels = train_posts['discourse_type']
test_labels = test_posts['discourse_type']

# nltk library for tagged sentences

In [103]:

nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, MinMaxScaler, MaxAbsScaler
from sklearn.linear_model import LogisticRegression
import nltk
from nltk.corpus import treebank
nltk.download('treebank')
corpus = nltk.corpus.treebank
treebank_tagged_sentences = corpus.tagged_sents()
tagged_sentences = [sentence for sentence in treebank_tagged_sentences]
tagger = nltk.tag.perceptron.PerceptronTagger(load=False)
tagger.train(tagged_sentences)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package treebank to /root/nltk_data...
[nltk_data]   Package treebank is already up-to-date!


# Function Transformer function

In [0]:
import pandas as pd
def get_pos(string):
  tags = [tag for word, tag in tagger.tag([a for a in string.split(' ') if a != ''])]
  transformed = " ".join(tags)
  return transformed
train_posts['pos'] = train_posts['body'].apply(get_pos)
test_posts['pos'] = test_posts['body'].apply(get_pos)

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
#content_punctuation
post_vect = TfidfVectorizer(tokenizer=tokenize_normalize,sublinear_tf= True)
def get_text(df):
  return df['body'].values 
select_text = FunctionTransformer(get_text, validate=False)
content_punctuation_pipeline = Pipeline([
      ('select_text', select_text), 
      ('vectorizer', post_vect)
])
normal_vectorizer =  TfidfVectorizer(tokenizer=tokenize_normalize, sublinear_tf= True)
normal_pipeline = Pipeline([
      ('select_text', select_text), 
      ('vectorizer_normal', normal_vectorizer)
])
# Author
import numpy as np
def same_author(df):
  tmp = []
  for i, j in df.iterrows():
    tmp.append(1 if j['post_author'] == j['thread_author'] else 0)
  return np.array(tmp).reshape(len(tmp), 1)
s_author = FunctionTransformer(same_author, validate=False)
# Thread features
def thread_length(df):
  return df['thread_length'].values.reshape(len(df),1)
t_length = FunctionTransformer(thread_length, validate=False)
# Community:
def get_subreddit(df):    
  return df['subreddit'].values 
subreddit_hash_function = FunctionTransformer(get_subreddit, validate=False)
subreddit_hash = Pipeline([
      ('get_sub', subreddit_hash_function), 
      ('vectorizer', CountVectorizer())
])
# Word2Vec
td = RegexpTokenizer(r'\w+')
def simple_tokenizer(string):
  return [token.lower() for token in td.tokenize(string)]
w2v_vectorizer = TfidfVectorizer(tokenizer=simple_tokenizer, sublinear_tf= True)
def get_pos(df):
  return df['pos'].values
select_pos = FunctionTransformer(get_pos, validate=False)
word_2_vec = Pipeline([
      ('select_pos', select_pos), 
      ('w2v_vectorizer', w2v_vectorizer)
])

In [0]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
prediction_pipeline= Pipeline([
            ('union',FeatureUnion([
              ('Content + Punctuation', content_punctuation_pipeline),
              ('Author', s_author),
              ('Thread features', t_length),
              ('Community', subreddit_hash),
              ('Word2vec', word_2_vec)
            ]))
])    
train2 = prediction_pipeline.fit_transform(train_posts)
test2 = prediction_pipeline.transform(test_posts)

# Logistic Regression

In [107]:
lr5 = LogisticRegression(penalty='l2', C=10,solver='sag')
lr5.fit(train2, train_labels)
summary15=eval_summary(lr5.predict(test2),test_posts['discourse_type']) 




Classifier  has Acc=0.543 P=0.292 R=0.437 F1=0.308
                  precision    recall  f1-score   support

       agreement      0.184     0.501     0.269       349
    announcement      0.395     0.465     0.427       310
          answer      0.821     0.562     0.668     11595
    appreciation      0.584     0.740     0.653      1356
    disagreement      0.027     0.198     0.047        86
     elaboration      0.223     0.334     0.268      2426
           humor      0.028     0.295     0.052        44
negativereaction      0.020     0.400     0.037        15
           other      0.040     0.300     0.070        50
        question      0.599     0.573     0.585      3581

        accuracy                          0.543     19812
       macro avg      0.292     0.437     0.308     19812
    weighted avg      0.666     0.543     0.586     19812


Confusion matrix:
 [[ 175    3  453   52    3  174    0    1    3   87]
 [   2  144   45    9    0   33    0    0    0  132]
 [  67  