# Imprint Data Science Exercise - Reddit Text Classification

### Pip install cell

In [0]:
#wrapper for reddit api
!pip install praw
#keras based library that help uses BERT
!pip install ktrain
#flask for serving the model
!pip install flask
!pip install flask-ngrok

Collecting praw
[?25l  Downloading https://files.pythonhosted.org/packages/25/c0/b9714b4fb164368843b41482a3cac11938021871adf99bf5aaa3980b0182/praw-6.5.1-py3-none-any.whl (134kB)
[K     |██▍                             | 10kB 24.4MB/s eta 0:00:01[K     |████▉                           | 20kB 1.5MB/s eta 0:00:01[K     |███████▎                        | 30kB 2.1MB/s eta 0:00:01[K     |█████████▊                      | 40kB 1.6MB/s eta 0:00:01[K     |████████████▏                   | 51kB 1.8MB/s eta 0:00:01[K     |██████████████▋                 | 61kB 2.2MB/s eta 0:00:01[K     |█████████████████               | 71kB 2.3MB/s eta 0:00:01[K     |███████████████████▌            | 81kB 2.5MB/s eta 0:00:01[K     |█████████████████████▉          | 92kB 2.8MB/s eta 0:00:01[K     |████████████████████████▎       | 102kB 2.7MB/s eta 0:00:01[K     |██████████████████████████▊     | 112kB 2.7MB/s eta 0:00:01[K     |█████████████████████████████▏  | 122kB 2.7MB/s eta 0:00:01

### Imports

In [0]:
import praw
import pandas as pd
import numpy as np
import string
from tqdm import tqdm
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import ktrain
from ktrain import text

from flask import Flask
from flask import request
import re

using Keras version: 2.2.4-tf


### Constants

In [0]:
MY_CLIENT_ID = "Z_U0Fc057pEKJA"
MY_CLIENT_SECRET = "gDcrvRJaOI8mG97lMLkBehQT_qw"
MY_USER_AGENT = "local_python:com.task.mysubredditanalysis (by u/reddit_bert_user)"
ALL_PUNCT = string.punctuation
N_POSTS=100
MAX_BERT_LENGTH=512
VAL_PERC=0.25
SUBREDDIT_LIST = ['datascience', 'askpsychology', 'startups', 'FanTheories', 'hockey', 'dogs', 
                  'SecurityAnalysis', 'askscience', 'AskHistorians', 'askmath']

### Task 1 - Data Collection



#### Define Functions

In [0]:
def validate_post(my_post_title, my_post_text, my_post_list):
    """This function checks if the new post has text and then appends to the list of posts."""
    if my_post_text != '':
            if my_post_title[-1] in ALL_PUNCT:
                my_post_list.append(my_post_title + ' ' + my_post_text)
            else:
                my_post_list.append(my_post_title + '. ' + my_post_text)           
    return my_post_list

In [0]:
def get_n_subreddit_top_posts(subreddit_name, n_posts, reddit_client_id = MY_CLIENT_ID, 
                              reddit_client_secret = MY_CLIENT_SECRET, 
                              reddit_user_agent=MY_USER_AGENT):
    """This function returns the top number of posts from a given subreddit from Reddit.
    subreddit_name: The name of the subreddit we want posts from
    n_posts: The number of top posts from the subreddit
    reddit_client_id: The client id of the Reddit developer
    reddit_user_agent: The user agent of the Reddit developer"""
    
    reddit_instance = praw.Reddit(client_id=MY_CLIENT_ID,
                     client_secret=MY_CLIENT_SECRET,
                     user_agent=MY_USER_AGENT )
    my_subreddit = reddit_instance.subreddit(subreddit_name)
    post_list=[]
    top_post_generator = my_subreddit.top(limit=None)
    for my_post in top_post_generator:
        post_title = my_post.title
        post_text = my_post.selftext
        post_list = validate_post(post_title, post_text, post_list)
        
        if len(post_list) >= n_posts:
            break
    return post_list

In [0]:
def get_df_of_posts(my_subreddit_list, n_posts):
  """This function creates a dataframe with all the posts from the various subreddits."""
  df_dict = {'subreddit_category': [], 'reddit_post': []}
  my_df = pd.DataFrame(data=df_dict)
  
  for my_subreddit_name in tqdm(my_subreddit_list):
      post_list = get_n_subreddit_top_posts(my_subreddit_name, n_posts)
      df_b_dict = {'subreddit_category': [my_subreddit_name]*len(post_list), 'reddit_post': post_list}
      df_b=pd.DataFrame(data=df_b_dict)
      my_df=pd.concat([my_df, df_b], axis=0).reset_index().drop(['index'], axis=1)

  return my_df

#### Download All the Data

In [0]:
new_df = get_df_of_posts(SUBREDDIT_LIST, N_POSTS)

100%|██████████| 10/10 [00:57<00:00,  5.54s/it]


In [0]:
new_df.subreddit_category.value_counts()

askmath             100
dogs                100
hockey              100
FanTheories         100
AskHistorians       100
askscience          100
datascience         100
askpsychology       100
SecurityAnalysis    100
startups            100
Name: subreddit_category, dtype: int64

#### Encode the text classes into numbers

In [0]:
my_encoder = LabelEncoder()
my_encoder.fit(SUBREDDIT_LIST)
new_df.subreddit_category = my_encoder.transform(new_df.subreddit_category)

In [0]:
new_df.subreddit_category.value_counts()

9    100
8    100
7    100
6    100
5    100
4    100
3    100
2    100
1    100
0    100
Name: subreddit_category, dtype: int64

#### Train Test Split

In [0]:
X = new_df.drop(['subreddit_category'], axis=1)
y = new_df.subreddit_category
    
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

original_size = new_df.shape[0]
train_size = X_train.shape[0]
test_size = X_test.shape[0]

print("Size of Train set: {}%".format(train_size/original_size*100))
print("Size of Test set: {}%".format(test_size/original_size*100))

Size of Train set: 80.0%
Size of Test set: 20.0%


### Task 2 - Classification Model

#### Preprocess the data

In [0]:
(X_train,  y_train), (X_test, y_test), preproc = text.texts_from_array(x_train=X_train.reddit_post.to_list(), y_train=y_train.to_list(), 
                                                                       x_test=X_test.reddit_post.to_list(), y_test=y_test.to_list(), 
                                                                       val_pct=VAL_PERC,
                                                                       class_names=list(my_encoder.classes_),
                                                                       preprocess_mode='bert',
                                                                       maxlen=MAX_BERT_LENGTH) #The max length bert supports is 512 characters.

downloading pretrained BERT model (uncased_L-12_H-768_A-12.zip)...
[██████████████████████████████████████████████████]
extracting pretrained BERT model...
done.

cleanup downloaded zip...
done.

preprocessing train...
language: en


preprocessing test...
language: en


##### Create and fit the BERT Model

In [0]:
model = text.text_classifier('bert', train_data=(X_train, y_train), preproc=preproc)
learner = ktrain.get_learner(model, train_data=(X_train, y_train), batch_size=6, val_data=(X_test, y_test))

Is Multi-Label? False
maxlen is 512
done.


In [0]:
learner.fit_onecycle(2e-5, 4)



begin training using onecycle policy with max lr of 2e-05...
Train on 800 samples, validate on 200 samples
Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<tensorflow.python.keras.callbacks.History at 0x7f1b7a362b70>

We can see from the above epochs that the validation accuracy stops increasing after the 4th epoch and the training accuracy reaches 0.99 which is close to the highest possible. So further increasing epochs would results in overfitting. Therefore we stop at 4 epochs.

#### Evaluate BERT Model

In [0]:
learner.validate(val_data=(X_test, y_test), class_names=list(my_encoder.classes_))

                  precision    recall  f1-score   support

   AskHistorians       0.89      1.00      0.94        17
     FanTheories       1.00      1.00      1.00        24
SecurityAnalysis       0.82      0.82      0.82        17
         askmath       0.82      0.90      0.86        20
   askpsychology       0.89      0.81      0.85        21
      askscience       0.85      0.77      0.81        22
     datascience       0.95      0.91      0.93        22
            dogs       1.00      1.00      1.00        21
          hockey       0.94      1.00      0.97        17
        startups       0.79      0.79      0.79        19

        accuracy                           0.90       200
       macro avg       0.90      0.90      0.90       200
    weighted avg       0.90      0.90      0.90       200



array([[17,  0,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0, 24,  0,  0,  0,  0,  0,  0,  0,  0],
       [ 0,  0, 14,  1,  0,  0,  0,  0,  0,  2],
       [ 0,  0,  0, 18,  0,  1,  0,  0,  1,  0],
       [ 0,  0,  0,  1, 17,  2,  0,  0,  0,  1],
       [ 2,  0,  0,  1,  2, 17,  0,  0,  0,  0],
       [ 0,  0,  0,  1,  0,  0, 20,  0,  0,  1],
       [ 0,  0,  0,  0,  0,  0,  0, 21,  0,  0],
       [ 0,  0,  0,  0,  0,  0,  0,  0, 17,  0],
       [ 0,  0,  3,  0,  0,  0,  1,  0,  0, 15]])

### Create and save predictor to be Served

In [0]:
predictor = ktrain.get_predictor(learner.model, preproc)

In [0]:
predictor.save('my_predictor')

### Task 3 - Model Serving (Bonus Task)

In [0]:
from flask_ngrok import run_with_ngrok
from flask import Flask

app = Flask(__name__)
run_with_ngrok(app)   #starts ngrok when the app is run@app.route("/")

@app.route('/classify_text/<name>')
def model_to_serve(name):
  """This function takes a string as input and predicts the subreddit it came from"""
  predictor = ktrain.load_predictor('my_predictor')
  my_prediction = predictor.predict(name)
  return f"I consider it being {my_prediction}"

@app.route("/")
def home():
    return "<h1>Go to /classify_text/ to make it run</h1>"
  
app.run()


 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
   Use a production WSGI server instead.
 * Debug mode: off


 * Running on http://127.0.0.1:5000/ (Press CTRL+C to quit)


 * Running on http://0f69ebe0.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


127.0.0.1 - - [12/Jan/2020 23:36:46] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/Jan/2020 23:36:46] "[33mGET /favicon.ico HTTP/1.1[0m" 404 -


127.0.0.1 - - [12/Jan/2020 23:37:23] "[37mGET /classify_text/owls HTTP/1.1[0m" 200 -
127.0.0.1 - - [12/Jan/2020 23:37:43] "[33mGET /classify_text/ HTTP/1.1[0m" 404 -
127.0.0.1 - - [12/Jan/2020 23:41:00] "[37mGET / HTTP/1.1[0m" 200 -
