---
Import, process, and clean the data
---
---


In [2]:
import pandas as pd
import numpy as np 

# Load in the data 
#df = pd.read_csv('rspct.tsv', sep="\t")
df = pd.read_csv('rspct.tsv', sep="\t")

In [3]:
df.shape

(1013000, 4)

In [4]:
# Get rid of ambiguous data that can decrese model efficiency 
df = df.drop(['id'], axis=1)

# Combine 'title' and 'selftext' columns to enrich data
df["full_text"] = df["title"] + " " + df["selftext"]
df.head()

Unnamed: 0,subreddit,title,selftext,full_text
0,talesfromtechsupport,Remember your command line switches...,"Hi there, <lb>The usual. Long time lerker, fi...",Remember your command line switches... Hi ther...
1,teenmom,"So what was Matt ""addicted"" to?",Did he ever say what his addiction was or is h...,"So what was Matt ""addicted"" to? Did he ever sa..."
2,Harley,No Club Colors,Funny story. I went to college in Las Vegas. T...,No Club Colors Funny story. I went to college ...
3,ringdoorbell,"Not door bell, but floodlight mount height.",I know this is a sub for the 'Ring Doorbell' b...,"Not door bell, but floodlight mount height. I ..."
4,intel,Worried about my 8700k small fft/data stress r...,"Prime95 (regardless of version) and OCCT both,...",Worried about my 8700k small fft/data stress r...


In [5]:
# Extract 5 percent of the dataset, even .1 of the data yields memory errors
df = df.sample(frac=0.05, random_state=7)

In [6]:
from bs4 import BeautifulSoup
# Function to clean html tags
def soup(text):
    soup = BeautifulSoup(text, 'html.parser')
    clean_text = soup.get_text()
    return clean_text

In [7]:
df['clean_text'] = df['full_text'].apply(soup)
df.head()

Unnamed: 0,subreddit,title,selftext,full_text,clean_text
929565,OnePunchMan,[Spoilers] Black Sperm's cell stock,"We all know Black Sperm is op, but let's revie...",[Spoilers] Black Sperm's cell stock We all kno...,[Spoilers] Black Sperm's cell stock We all kno...
100277,devops,PSA: Zookeeper 3.5 beta issues (can break Apac...,If anyone is looking into deploying Zookeeper ...,PSA: Zookeeper 3.5 beta issues (can break Apac...,PSA: Zookeeper 3.5 beta issues (can break Apac...
707395,namenerds,How do you design a sibset?,"Say you pick the first child's name, and it's ...",How do you design a sibset? Say you pick the f...,How do you design a sibset? Say you pick the f...
374556,twinpeaks,[S3E16] Plotlines still to be resolved before ...,With only two hours to go I wanted to get this...,[S3E16] Plotlines still to be resolved before ...,[S3E16] Plotlines still to be resolved before ...
503596,FidgetSpinners,Why do teachers like to take Fidget Spinners?,I only see one situation. I can see when peopl...,Why do teachers like to take Fidget Spinners? ...,Why do teachers like to take Fidget Spinners? ...


In [8]:
# Taking a look at value count of target 
df['subreddit'].value_counts()

AskEconomics          79
Sneakers              70
AcademicPsychology    70
adderall              69
gravityfalls          68
                      ..
HomeDepot             34
latin                 33
civilengineering      33
RocketLeague          33
CryptoKitties         31
Name: subreddit, Length: 1013, dtype: int64

In [9]:
from sklearn.model_selection import train_test_split

# We'll do a 80/20 train/test split and stratify the target 
train, test = train_test_split(df, test_size=0.2, stratify=df["subreddit"])

# Sanity check 
train.shape, test.shape

((40520, 5), (10130, 5))

In [10]:
# Assign data to features/target
X_train = train["clean_text"]
X_test = test["clean_text"]

y_train = train["subreddit"]
y_test = test["subreddit"]

print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(40520,)
(10130,)
(40520,)
(10130,)


In [11]:
y_train.head()

982337             GearsOfWar
693107    legaladviceofftopic
318088                   SCCM
4425              backpacking
538155        ProtectAndServe
Name: subreddit, dtype: object

In [12]:
# Sanity check v2
y_train.value_counts()

AskEconomics          63
Sneakers              56
AcademicPsychology    56
adderall              55
gravityfalls          54
                      ..
HomeDepot             27
RWBY                  27
latin                 26
RocketLeague          26
CryptoKitties         25
Name: subreddit, Length: 1013, dtype: int64

---
Use Label Encoding on the target 
---
---

In [13]:
from sklearn.preprocessing import LabelEncoder

# Instantiate LabelEncoder
encoder = LabelEncoder()
# Fit the encoder
encoder.fit(y_train)
# Transform on train and test 
y_train = encoder.transform(y_train)
y_test  = encoder.transform(y_test)

---
Create a pipeline with countvectorizer, tfidf-transformer, and a SGDClassifier model
---
---

In [14]:
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
text_clf = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', SGDClassifier()),
    ])

In [15]:
text_clf.fit(X_test, y_test)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=Non...
                 SGDClassifier(alpha=0.0001, average=False, class_weight=None,
                               early_stopping=False, epsilon=0.1, eta0=0.0,
                               fit_intercept=True, l1_ratio=0.15,
                               learning_rate='optimal', loss='hinge',
                               max_iter=1000,

In [16]:
test_post = np.array(["I love getting new sneakers; Jordan's, Nike, Addidas, custom footwear. Wanting to see what people think of these fresh kicks. I love the laces, contour of the shoe, more generic sneaker and shoe terms"])
text_clf.predict(test_post)

array([329])

In [17]:
df.iloc[329][0]

'tattoo'

In [18]:
from sklearn.metrics import classification_report

text_clf.fit(X_train,y_train)
predictions = text_clf.predict(X_test)
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.67      0.60      0.63        10
           1       1.00      0.38      0.55         8
           2       0.43      0.60      0.50        10
           3       0.44      0.44      0.44         9
           4       0.86      0.55      0.67        11
           5       0.90      0.75      0.82        12
           6       0.50      0.42      0.45        12
           7       0.58      1.00      0.73        11
           8       0.85      1.00      0.92        11
           9       1.00      0.78      0.88         9
          10       0.44      0.40      0.42        10
          11       0.61      0.92      0.73        12
          12       0.57      0.89      0.70         9
          13       0.53      0.57      0.55        14
          14       0.50      0.44      0.47         9
          15       0.45      0.50      0.48        10
          16       0.64      0.78      0.70         9
          17       0.89    

  _warn_prf(average, modifier, msg_start, len(result))


In [19]:
from sklearn import metrics
y_train_pred = text_clf.predict(X_train)
print (metrics.accuracy_score(y_train, y_train_pred))

0.990523198420533


---
function that returns the top 5 results
---
---

In [20]:
def get_predictions(post, num_answers=5):
  """ takes a potential post and returns the top options """

  preds = pd.Series(text_clf.decision_function(post)[0])

  preds.index = text_clf.classes_

  preds = preds.sort_values(ascending=False)

  return preds[:5]

In [21]:
get_predictions(test_post)

329   -0.567996
404   -0.981242
735   -0.997192
248   -0.998099
690   -1.014068
dtype: float64

In [22]:
df.iloc[324]

subreddit                                       DBZDokkanBattle
title                               dokkan battlefield level 16
selftext      how the fuck do you beat this level? ive sent ...
full_text     dokkan battlefield level 16 how the fuck do yo...
clean_text    dokkan battlefield level 16 how the fuck do yo...
Name: 711479, dtype: object

In [23]:
nba_post = [ """
               LeBron James and Kobe Bryant and both great NBA players and we should stop comparing them.
                """]

In [24]:
get_predictions(nba_post)

982   -0.743801
457   -0.971568
874   -0.990362
642   -0.994767
772   -1.000571
dtype: float64

In [25]:
df.iloc[738]

subreddit                                                Ripple
title                              Quick paper wallet questions
selftext      I plan on buying Ripple soon and storing it in...
full_text     Quick paper wallet questions I plan on buying ...
clean_text    Quick paper wallet questions I plan on buying ...
Name: 254102, dtype: object

---
Testing pipeline with a couple other models really quickly, first RFC
---
---

In [None]:
from sklearn.ensemble import RandomForestClassifier
text_clf1 = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', RandomForestClassifier()),
    ])

text_clf1.fit(X_test, y_test)

In [None]:
test_post = np.array(["I love getting new sneakers; Jordan's, Nike, Addidas, custom footwear. Wanting to see what people think of these fresh kicks"])
text_clf1.predict(test_post)

In [None]:
from sklearn import metrics
y_train_pred1 = text_clf1.predict(X_train)
print (metrics.accuracy_score(y_train, y_train_pred1))

---
 pickling the SGD model 
---
---

In [None]:
import pickle

# Save the Model to file in the current working directory
Pkl_Filename = "Baseline_SGD_Model.pkl"  

with open(Pkl_Filename, 'wb') as file:  
    pickle.dump(text_clf, file)

In [None]:
#def picklizer(to_pickle, filename, path):
    """
    Creates a pickle file.
    
    Parameters
    ----------
    to_pickle : Python object
        The trained / fitted instance of the 
        transformer or model to be pickled.
    filename : string
        The desired name of the output file,
        not including the '.pkl' extension.
    path : string or path-like object
        The path to the desired output directory.
    """
    import os
    import pickle

    # Create the path to save location
    picklepath = os.path.join(path, filename)

    # Use context manager to open file
    with open(picklepath, "wb") as p:
        pickle.dump(to_pickle, p)

In [None]:
#vect = text_clf.named_steps['vect']
#tfidf = text_clf.named_steps['tfidf']
#clf = text_clf.named_steps['clf']

In [None]:
#filepath = "models/"

# Export vectorizer as pickle
#picklizer(vect, "vec_01.pkl", filepath)

# Export transformer as pickle
#picklizer(tfidf, "tfidf_01.pkl", filepath)

# Export sgd model as pickle
#picklizer(clf, "clf_01.pkl", filepath)

---
load the pickled model and try to run predictions
---
---

In [None]:
with open(Pkl_Filename, 'rb') as file:  
    Baseline_SGD_Model = pickle.load(file)

Baseline_SGD_Model

In [None]:
Baseline_SGD_Model.fit(X_test, y_test)

In [None]:
Baseline_SGD_Model.predict(test_post)

In [None]:
from sklearn import metrics
y_train_pred = Baseline_SGD_Model.predict(X_train)
print (metrics.accuracy_score(y_train, y_train_pred))

---
zip the model
---
---

In [None]:
pip install py7zr

In [None]:
import py7zr

archive = py7zr.SevenZipFile('Baseline_SGD_Model.7z', mode='r')
archive.extractall(path="/tmp")
archive.close()