In [1]:
import numpy as np
import pandas as pd
import re
import string
from nltk.corpus import stopwords # Import the stop word list
from tqdm import tqdm, tqdm_pandas

In [2]:
cooking_frame = pd.read_csv('..\Data\Cooking.csv')

In [3]:
cooking_frame['content']

0        <p>My chocolate chips cookies are always too c...
1        <p>I've heard of people cooking bacon in an ov...
2        <p>I always use brown extra large eggs, but I ...
3        <p>And can I use one in place of the other in ...
4        <p>It seems that every time I make a tomato sa...
5        <p>I have a recipe that calls for fresh parsle...
6        <p>I'd like to know when to take my steaks off...
7        <p>What's the best method to poach an egg with...
8        <p>My ice cream doesn't feel creamy enough.  I...
9        <p>I'm interested in baking thighs, legs, brea...
10       <p>I've fallen in love with this wonderful <a ...
11       <p>Is there really an advantage to sifting flo...
12       <p>When I roast a goose, I decant the fat, str...
13       <p>Where can safe and reliable instructions (i...
14       <p>I know what spices like garlic and black pe...
15       <p>Is it safe to leave butter at room temperat...
16       <p>In this <a href="http://www.chefmichaelsmit.

In [4]:
removeTag = lambda x : re.sub('<[^>]*>','',x)
removeNewLine = lambda x: re.sub('\\n',' ', x)
removePunc = lambda x: re.sub("[^a-zA-Z\s]","",string.lower(x.translate(string.maketrans("",""), string.punctuation)))
reduceSpaces = lambda x: re.sub("[\s]{2,}"," ",x)

def stripUnMeaningfulWords(sentance):
    words = sentance.split(" ")
    newWorldList = [w for w in words if not w in stopwords.words("english")]
    return(string.join(newWorldList))


In [5]:
cooking_frame.head(2)

Unnamed: 0,id,title,content,tags
0,1,How can I get chewy chocolate chip cookies?,<p>My chocolate chips cookies are always too c...,baking cookies texture
1,2,How should I cook bacon in an oven?,<p>I've heard of people cooking bacon in an ov...,oven cooking-time bacon


In [7]:
#removetag(cooking_frame['content'][11])

cooking_frame['content_clean'] = cooking_frame['content'].apply(removeTag).apply(removeNewLine).apply(removePunc).apply(reduceSpaces)                                
#clean = cooking_frame['content'].ix[:10].apply(removeTag).apply(removeNewLine).apply(removePunc).apply(reduceSpaces).apply(stripUnMeaningfulWords)

In [None]:
tqdm_pandas(tqdm())

In [10]:
cooking_frame['content_clean'] = cooking_frame['content_clean'].progress_apply(stripUnMeaningfulWords)  

15404it [10:05, 25.45it/s]


In [11]:
print cooking_frame['content_clean'][0]

chocolate chips cookies always crisp get chewy cookies like starbucks thank everyone answered far tip biggest impact chill rest dough however also increased brown sugar ratio increased bit butter also adding maple syrup helped 


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

In [15]:
vectorizer = CountVectorizer(min_df=1)
#http://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html#sklearn.feature_extraction.text.CountVectorizer
vectorizer = CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=np.float32, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [16]:
corpus = cooking_frame['content_clean'].tolist()
X = vectorizer.fit_transform(corpus)
X                              

<15404x29114 sparse matrix of type '<type 'numpy.float32'>'
	with 556779 stored elements in Compressed Sparse Row format>

In [17]:
X[0,0]

0.0

In [18]:
np.shape(X)

(15404, 29114)

In [27]:
sum(X.toarray()[1,:])

16.0

In [37]:
cooking_frame.head(50)

Unnamed: 0,id,title,content,tags,content_clean,tags_clean
0,1,How can I get chewy chocolate chip cookies?,<p>My chocolate chips cookies are always too c...,baking cookies texture,chocolate chips cookies always crisp get chewy...,baking cookies texture
1,2,How should I cook bacon in an oven?,<p>I've heard of people cooking bacon in an ov...,oven cooking-time bacon,ive heard people cooking bacon oven laying str...,oven cooking-time bacon
2,3,What is the difference between white and brown...,"<p>I always use brown extra large eggs, but I ...",eggs,always use brown extra large eggs cant honestl...,eggs
3,4,What is the difference between baking soda and...,<p>And can I use one in place of the other in ...,substitutions please-remove-this-tag baking-so...,use one place certain recipes,substitutions baking-soda baking-powder
4,5,"In a tomato sauce recipe, how can I cut the ac...",<p>It seems that every time I make a tomato sa...,sauce pasta tomatoes italian-cuisine,seems every time make tomato sauce pasta sauce...,sauce pasta tomatoes italian-cuisine
5,6,What ingredients (available in specific region...,<p>I have a recipe that calls for fresh parsle...,substitutions herbs parsley,recipe calls fresh parsley substituted fresh h...,substitutions herbs parsley
6,9,What is the internal temperature a steak shoul...,<p>I'd like to know when to take my steaks off...,food-safety beef cooking-time,id like know take steaks grill please everybody,food-safety beef cooking-time
7,11,How should I poach an egg?,<p>What's the best method to poach an egg with...,eggs basics poaching,whats best method poach egg without turning eg...,eggs basics poaching
8,12,"How can I make my Ice Cream ""creamier""",<p>My ice cream doesn't feel creamy enough. I...,ice-cream,ice cream doesnt feel creamy enough got recipe...,ice-cream
9,17,How long and at what temperature do the variou...,"<p>I'm interested in baking thighs, legs, brea...",baking chicken cooking-time,im interested baking thighs legs breasts wings...,baking chicken cooking-time


In [35]:
removeUnwantedTag = lambda x : x.replace('please-remove-this-tag','')

In [36]:
cooking_frame['tags_clean'] = cooking_frame['tags'].apply(removeUnwantedTag)                            

In [38]:
corpus = cooking_frame['tags_clean'].tolist()
Y = vectorizer.fit_transform(corpus)
Y   

<15404x744 sparse matrix of type '<type 'numpy.float32'>'
	with 42116 stored elements in Compressed Sparse Row format>