In [1]:
import numpy as np
import pandas as pd

from sklearn.preprocessing import LabelEncoder
from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils.np_utils import to_categorical
from keras.layers import Embedding
from keras.layers import Dense, Input, Flatten
from keras.layers import Conv1D, MaxPooling1D, Embedding, Dropout
from keras.models import Model
from keras.callbacks import ModelCheckpoint

import math

Using TensorFlow backend.


In [2]:
from keras import backend as K
K.set_image_data_format('channels_first')

In [3]:
MAX_SEQ_LENGTH = 150
"""max words for tokenizer."""
MAX_WORDS = 10000
EMBEDDING_DIM = 100 
#256

In [4]:
data = pd.read_csv('reddit_top5.csv',sep=',')

In [5]:
print(len(data["subreddit"]))
data.head()

8350048


Unnamed: 0,subreddit,num
0,PewdiepieSubmissions,Cool.
1,PewdiepieSubmissions,Please don’t steal my meme bros thank you
2,PewdiepieSubmissions,From chile with love
3,PewdiepieSubmissions,����WARNING THE SUB GAP IS DANGEROUSLY LOW����...
4,PewdiepieSubmissions,Thank you for sorting by new:)


data = data.reindex(np.random.permutation(data.index))
data = data[0:10000]

In [6]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
Y = le.fit_transform(data['subreddit'])
Y = to_categorical(np.asarray(Y))

In [7]:
import re

def replace_abbreviations(text):
    texts = []
    for item in text:
        item = str(item)
        item = item.lower().replace("it's", "it is").replace("i'm", "i am").replace("he's", "he is").replace("she's", "she is")\
      .replace("we're", "we are").replace("they're", "they are").replace("you're", "you are").replace("that's", "that is")\
      .replace("this's", "this is").replace("can't", "can not").replace("don't", "do not").replace("doesn't", "does not")\
      .replace("we've", "we have").replace("i've", " i have").replace("isn't", "is not").replace("won't", "will not")\
      .replace("hasn't", "has not").replace("wasn't", "was not").replace("weren't", "were not").replace("let's", "let us")\
      .replace("didn't", "did not").replace("hadn't", "had not").replace("waht's", "what is").replace("couldn't", "could not")\
      .replace("you'll", "you will").replace("you've", "you have")
        item = item.replace("'s", "")
        texts.append(item)
    return texts
 
def clear_review(text):
    for i in range(len(text)):
        item = text[i]
        item = item.replace("<br /><br />", "")
        item = re.sub("[^a-zA-Z]", " ", item.lower())
        text[i]=" ".join(item.split())
    return text

def stemed_words(text):
    stop_words = ['to','of','at','by','is','do','does','a','an','the']
    for i in range(len(text)):
        item = text[i]
        words = [w for w in item.split() if w not in stop_words]
        text[i]=" ".join(words)
    return text

def preprocess(text):
    text = replace_abbreviations(text)
    text = clear_review(text)
    text = stemed_words(text)
    return text
 
X = preprocess(data['num'])


In [8]:
tokenizer = Tokenizer(num_words=MAX_WORDS,split=" ",char_level=False)  
tokenizer.fit_on_texts(X)
seq = tokenizer.texts_to_sequences(X)
X = pad_sequences(seq, maxlen=MAX_SEQ_LENGTH)

In [9]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X,Y,test_size = 0.3,shuffle = True,random_state=30)

In [10]:
word_index = tokenizer.word_index

In [11]:
inputs = Input(shape=(MAX_SEQ_LENGTH,), dtype='int32')
embedding = Embedding(len(word_index) + 1, EMBEDDING_DIM, input_length=MAX_SEQ_LENGTH,trainable=True)(inputs)
l_cov1= Conv1D(128, 5, activation='relu')(embedding)
l_pool1 = MaxPooling1D(2)(l_cov1)
l_cov2 = Conv1D(128, 5, activation='relu')(l_pool1)
l_pool2 = MaxPooling1D(2)(l_cov2)
l_cov3 = Conv1D(128, 5, activation='relu')(l_pool2)
l_pool3 = MaxPooling1D(30)(l_cov3)
l_flat = Flatten()(l_pool3)
l_dense = Dense(128, activation='relu')(l_flat)
preds = Dense(5, activation='softmax')(l_dense)

model = Model(inputs, preds)
model.compile(loss='categorical_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
Model: "model_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         (None, 150)               0         
_________________________________________________________________
embedding_1 (Embedding)      (None, 150, 100)          34952100  
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 146, 128)          64128     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 73, 128)           0         
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 69, 128)           82048     
_________________________________________________________________
max_pooling1d_2 (MaxPooling1 (None, 34, 128)           0         
_____________________________________________________

In [None]:
checkpoint=ModelCheckpoint('model_cnn.hdf5',monitor='val_acc',verbose=1,save_best_only=True)
model.fit(X_train, y_train, validation_data=(X_test, y_test),epochs=15, batch_size=1000,callbacks=[checkpoint])
#batch size bigger, acuuracy better. (try 1000 or 500,2000,4000...)

Instructions for updating:
Use tf.cast instead.
Train on 5845033 samples, validate on 2505015 samples
Epoch 1/15
Epoch 2/15




Epoch 3/15
Epoch 4/15