In [1]:
import numpy as np
import pandas as pd
import re
import string
from wordcloud import WordCloud
import matplotlib.pyplot as plt
%matplotlib inline
from nltk.corpus import stopwords # Import the stop word list
from tqdm import tqdm, tqdm_pandas
from sklearn.feature_extraction.text import CountVectorizer
import random

In [2]:
tqdm_pandas(tqdm())

0it [00:00, ?it/s]


In [3]:
removeTag = lambda x : re.sub('<[^>]*>','',x)
removeNewLine = lambda x: re.sub('\\n',' ', x)
removePunc = lambda x: re.sub("[^a-zA-Z\s]","",string.lower(x.translate(string.maketrans("",""), string.punctuation)))
reduceSpaces = lambda x: re.sub("[\s]{2,}"," ",x)

def stripUnMeaningfulWords(sentance):
    words = sentance.split(" ")
    newWorldList = [w for w in words if not w in stopwords.words("english")]
    return(string.join(newWorldList))

removeUnwantedTag = lambda x : x.replace('please-remove-this-tag','')
removeCommonWords = lambda x: x.replace('ive','').replace('im','')

def wordCloudGenerator(column_heading):

    wordcloud = WordCloud().generate(string.join(cooking_frame[column_heading].tolist()))
    # lower max_font_size
    #wordcloud = WordCloud().generate(text)
    plt.figure()
    plt.imshow(wordcloud)
    plt.axis("off")
    plt.show()
    


In [4]:
def cleanContent(df, contentColumn):
    df['content_clean'] = df[contentColumn].apply(removeTag).apply(removeNewLine).apply(removePunc).apply(reduceSpaces)
    df['content_clean'] = df['content_clean'].progress_apply(stripUnMeaningfulWords)
    df['content_clean'] = df['content_clean'].apply(removeCommonWords)
    return df

def cleanTags(df, tagColumn):
    df['tags_clean'] = df['tags'].apply(removeUnwantedTag).apply(removeCommonWords)   
    return df

In [5]:
cooking_frame = pd.read_csv('~/Data/cooking.csv')
biology_frame = pd.read_csv('~/Data/biology.csv')
crypto_frame = pd.read_csv('~/Data/crypto.csv')
diy_frame = pd.read_csv('~/Data/diy.csv')
robotics_frame = pd.read_csv('~/Data/robotics.csv')
travel_frame = pd.read_csv('~/Data/travel.csv')


In [6]:
cooking_frame = cleanContent(cooking_frame, 'content')
biology_frame = cleanContent(biology_frame, 'content')
crypto_frame = cleanContent(crypto_frame, 'content')
diy_frame = cleanContent(diy_frame, 'content')
robotics_frame = cleanContent(robotics_frame, 'content')
travel_frame = cleanContent(travel_frame, 'content')

15404it [02:54, 88.50it/s]
100%|██████████| 13196/13196 [02:32<00:00, 86.61it/s]
100%|██████████| 10432/10432 [02:50<00:00, 61.11it/s]
100%|██████████| 25918/25918 [06:15<00:00, 68.93it/s]
100%|██████████| 2771/2771 [00:48<00:00, 57.05it/s]
100%|██████████| 19279/19279 [03:48<00:00, 84.22it/s]


In [7]:
cooking_frame = cleanTags(cooking_frame, 'tags')
biology_frame = cleanTags(biology_frame, 'tags')
crypto_frame = cleanTags(crypto_frame, 'tags')
diy_frame = cleanTags(diy_frame, 'tags')
robotics_frame = cleanTags(robotics_frame, 'tags')
travel_frame = cleanTags(travel_frame, 'tags')

In [8]:
frames = [cooking_frame, biology_frame, crypto_frame, diy_frame, robotics_frame, travel_frame]

In [9]:
overallFrame = pd.concat(frames)

In [10]:
overallFrame.head(20)

Unnamed: 0,id,title,content,tags,content_clean,tags_clean
0,1,How can I get chewy chocolate chip cookies?,<p>My chocolate chips cookies are always too c...,baking cookies texture,chocolate chips cookies always crisp get chewy...,baking cookies texture
1,2,How should I cook bacon in an oven?,<p>I've heard of people cooking bacon in an ov...,oven cooking-time bacon,heard people cooking bacon oven laying strips...,oven cooking-te bacon
2,3,What is the difference between white and brown...,"<p>I always use brown extra large eggs, but I ...",eggs,always use brown extra large eggs cant honestl...,eggs
3,4,What is the difference between baking soda and...,<p>And can I use one in place of the other in ...,substitutions please-remove-this-tag baking-so...,use one place certain recipes,substitutions baking-soda baking-powder
4,5,"In a tomato sauce recipe, how can I cut the ac...",<p>It seems that every time I make a tomato sa...,sauce pasta tomatoes italian-cuisine,seems every te make tomato sauce pasta sauce l...,sauce pasta tomatoes italian-cuisine
5,6,What ingredients (available in specific region...,<p>I have a recipe that calls for fresh parsle...,substitutions herbs parsley,recipe calls fresh parsley substituted fresh h...,substitutions herbs parsley
6,9,What is the internal temperature a steak shoul...,<p>I'd like to know when to take my steaks off...,food-safety beef cooking-time,id like know take steaks grill please everybody,food-safety beef cooking-te
7,11,How should I poach an egg?,<p>What's the best method to poach an egg with...,eggs basics poaching,whats best method poach egg without turning eg...,eggs basics poaching
8,12,"How can I make my Ice Cream ""creamier""",<p>My ice cream doesn't feel creamy enough. I...,ice-cream,ice cream doesnt feel creamy enough got recipe...,ice-cream
9,17,How long and at what temperature do the variou...,"<p>I'm interested in baking thighs, legs, brea...",baking chicken cooking-time,interested baking thighs legs breasts wings l...,baking chicken cooking-te


In [11]:
pattern = '''([\w-]+)'''
pattern2 = '''(?u)\\b\\w\\w+\\b'''
#pattern3 = '''(?u)\\b\\w+\\w-w\\b'''



#vectorizer = CountVectorizer(min_df=1)
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
#vectorizer = CountVectorizer(analyzer=partial(nltk.regexp_tokenize, pattern=pattern) )
vectorizer = CountVectorizer(analyzer='word', binary=False, decode_error='strict',
       dtype=np.float32, encoding='utf-8', input='content',
       lowercase=True, max_df=1.0, max_features=None, min_df=1,
       ngram_range=(1, 1), preprocessor=None, stop_words=None,
       strip_accents=None, token_pattern=pattern2,
       tokenizer=None, vocabulary=None)

corpus = overallFrame['content_clean'].tolist()
X = vectorizer.fit_transform(corpus)
#vectorizer = CountVectorizer(analyzer='word', binary=False, decode_error='strict',
#        dtype=np.float32, encoding='utf-8', input='content',
#        lowercase=True, max_df=1.0, max_features=None, min_df=1,
#        ngram_range=(1, 1), preprocessor=None, stop_words=None,
#        strip_accents=None, token_pattern=pattern3,
#        tokenizer=None, vocabulary=None)


In [12]:
from functools import partial
import nltk
#corpus = overallFrame['content_clean'].tolist()
#X = vectorizer.fit_transform(corpus)
pattern = '''([\w-]+)'''
vectorizer = CountVectorizer(analyzer=partial(nltk.regexp_tokenize, pattern=pattern) )

corpus = overallFrame['tags_clean'].tolist()
Y = vectorizer.fit_transform(corpus)



#overallFrame.head(10)

In [14]:
#print vectorizer.vocabulary_

In [15]:
Z = vectorizer.inverse_transform(Y)
Z[6]

array(['cooking-te', 'food-safety', 'beef'], 
      dtype='|S25')

In [16]:
import tensorflow as tf

from keras.models import Sequential
from keras.layers import Input, Dropout, Flatten, Convolution2D, MaxPooling2D, Dense, Activation
from keras.optimizers import RMSprop
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping
from keras.utils import np_utils

Using TensorFlow backend.


In [17]:
print np.shape(X)

print np.shape(Y)

(87000, 139899)
(87000, 4267)


In [17]:
optimizer = RMSprop(lr=1e-4)
objective = 'binary_crossentropy'

model = Sequential()

#model.add(Convolution2D(32, 3, 3, border_mode='same', input_shape=[WIDTH, HEIGHT, 3], activation='relu'))
#model.add(Flatten())
model.add(Dense(8000, input_shape=[139899,], activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(4000, activation='relu'))
model.add(Dropout(0.6))

model.add(Dense(2000, activation='relu'))
model.add(Dropout(0.5))

model.add(Dense(4267))
model.add(Activation('sigmoid'))

model.compile(loss=objective, optimizer=optimizer, metrics=['accuracy'])

In [18]:
nb_epoch = 5
batch_size = 150

## Callback for loss logging per epoch
class LossHistory(Callback):
    def on_train_begin(self, logs={}):
        self.losses = []
        self.val_losses = []
        
    def on_epoch_end(self, batch, logs={}):
        self.losses.append(logs.get('loss'))
        self.val_losses.append(logs.get('val_loss'))

early_stopping = EarlyStopping(monitor='val_loss', patience=3, verbose=1, mode='auto') 

In [19]:
history = LossHistory()


#train_x = np.array(train_x).astype(float)




#train_y = np.array(train_y).astype(float)


#test_x = [test[i] for i in range(len(test))]
#test_x = np.array(test_x).astype(float)

In [20]:
train_X = X.toarray()
train_Y = Y.toarray()

MemoryError: 

In [21]:
#train_x = [X[i,:] for i in tqdm(range(28600))]
#train_y = [Y[i,:] for i in tqdm(range(28600))]

In [22]:
train_X

array([[ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       ..., 
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.],
       [ 0.,  0.,  0., ...,  0.,  0.,  0.]], dtype=float32)

In [23]:
model.fit(train_X, train_Y, batch_size=batch_size, nb_epoch=nb_epoch,
              validation_split=0.20, verbose=1, shuffle=True, callbacks=[history, early_stopping])

Train on 22880 samples, validate on 5720 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f330919ea90>

In [47]:
# now predict training features
Z = model.predict(train_X)

In [48]:
low_values_indices = Z < 0.05  # Where values are low
Z[low_values_indices] = 0  # All low values set to 0

In [49]:
new_Y = vectorizer.inverse_transform(Z)

In [54]:
new_Y[4]

array(['baking', 'food-safety'], 
      dtype='|S25')