In [152]:
import numpy as np
import pandas as pd
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans, MeanShift, estimate_bandwidth
from sklearn.cluster import SpectralClustering, AffinityPropagation, MiniBatchKMeans
from sklearn import metrics
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
%matplotlib inline

In [40]:
import os
curr_directory = os.getcwd()
datafile = curr_directory + "/all_recipe_data.json"

In [69]:
import json
raw_recipe_data = pd.read_json(datafile)

In [52]:
raw_recipe_data.isnull().values.any()

False

In [70]:
raw_recipe_data.Instructions.replace('', np.nan, inplace=True)
raw_recipe_data.head()

Unnamed: 0,Author,Instructions
0,[witchywoman],"Mix grapefruit, orange, apples, pineapple, mar..."
1,[witchywoman],Fill a margarita glass with crushed ice. Pour ...
2,[witchywoman],Preheat an oven to 350 degrees F (175 degrees ...
3,[witchywoman],
4,[witchywoman],"Mix the teriyaki sauce, garlic, ginger, and re..."


In [76]:
raw_recipe_data.isnull().sum()

Author          0
Instructions    0
dtype: int64

In [153]:
raw_recipe_data.head()

Unnamed: 0,Author,Instructions
0,[witchywoman],"mix grapefruit, orange, apples, pineapple, mar..."
1,[witchywoman],fill a margarita glass with crushed ice. pour ...
2,[witchywoman],preheat an oven to 350 degrees f (175 degrees ...
4,[witchywoman],"mix the teriyaki sauce, garlic, ginger, and re..."
5,[witchywoman],"fill a pint glass with ice, and pour in the vo..."


In [165]:
raw_recipe_data.Instructions = raw_recipe_data.Instructions.lower().split('.').replace(",", "")
raw_recipe_data.head()

Unnamed: 0,Author,Instructions
0,[witchywoman],"mix grapefruit, orange, apples, pineapple, mar..."
1,[witchywoman],fill a margarita glass with crushed ice. pour ...
2,[witchywoman],preheat an oven to 350 degrees f (175 degrees ...
4,[witchywoman],"mix the teriyaki sauce, garlic, ginger, and re..."
5,[witchywoman],"fill a pint glass with ice, and pour in the vo..."


In [172]:
sentences = [text for text in raw_recipe_data.Instructions]

np.random.shuffle(sentences)
train, test = sentences[:81], sentences[81:]


In [197]:
instructions = raw_recipe_data.Instructions


In [199]:
# Create a set of frequent words
stoplist = set('for a of the and to in'.split(' '))
# Lowercase each document, split it by white space and filter out stopwords
texts = [[word for word in instructions.lower().split() if word not in stoplist]
         for document in instructions[1]]

# Count word frequencies
from collections import defaultdict
frequency = defaultdict(int)
for text in texts:
    for token in text:
        frequency[token] += 1

# Only keep words that appear more than once
processed_corpus = [[token for token in text if frequency[token] > 1] for text in texts]
processed_corpus

AttributeError: 'Series' object has no attribute 'lower'

In [118]:
# import modules & set up logging
import gensim, logging

logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

In [187]:
model = gensim.models.Word2Vec(train, min_count=3, sg=1, window=7, sample=1e-3)

2018-04-20 14:42:49,406 : INFO : collecting all words and their counts
2018-04-20 14:42:49,413 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-04-20 14:42:49,427 : INFO : collected 56 word types from a corpus of 35682 raw words and 81 sentences
2018-04-20 14:42:49,429 : INFO : Loading a fresh vocabulary
2018-04-20 14:42:49,434 : INFO : min_count=3 retains 47 unique words (83% of original 56, drops 9)
2018-04-20 14:42:49,437 : INFO : min_count=3 leaves 35669 word corpus (99% of original 35682, drops 13)
2018-04-20 14:42:49,442 : INFO : deleting the raw counts dictionary of 56 items
2018-04-20 14:42:49,444 : INFO : sample=0.001 downsamples 27 most-common words
2018-04-20 14:42:49,446 : INFO : downsampling leaves estimated 6936 word corpus (19.4% of prior 35669)
2018-04-20 14:42:49,457 : INFO : estimated required memory for 47 words and 100 dimensions: 61100 bytes
2018-04-20 14:42:49,460 : INFO : resetting layer weights
2018-04-20 14:42:49,463 : INFO : trai

In [201]:
model.wv.vocab

{' ': <gensim.models.keyedvectors.Vocab at 0x11355a908>,
 '"': <gensim.models.keyedvectors.Vocab at 0x11355af98>,
 "'": <gensim.models.keyedvectors.Vocab at 0x11355ad30>,
 '(': <gensim.models.keyedvectors.Vocab at 0x11355a438>,
 ')': <gensim.models.keyedvectors.Vocab at 0x11355a0b8>,
 ',': <gensim.models.keyedvectors.Vocab at 0x11355a668>,
 '-': <gensim.models.keyedvectors.Vocab at 0x11355a4e0>,
 '.': <gensim.models.keyedvectors.Vocab at 0x11355a588>,
 '/': <gensim.models.keyedvectors.Vocab at 0x11355a198>,
 '0': <gensim.models.keyedvectors.Vocab at 0x11355a4a8>,
 '1': <gensim.models.keyedvectors.Vocab at 0x11355a160>,
 '2': <gensim.models.keyedvectors.Vocab at 0x11355a278>,
 '3': <gensim.models.keyedvectors.Vocab at 0x11355a470>,
 '4': <gensim.models.keyedvectors.Vocab at 0x11355a2b0>,
 '5': <gensim.models.keyedvectors.Vocab at 0x11355a390>,
 '6': <gensim.models.keyedvectors.Vocab at 0x11355ac88>,
 '7': <gensim.models.keyedvectors.Vocab at 0x11355a2e8>,
 '8': <gensim.models.keyedvecto

In [200]:
# set up clustering estimators
from sklearn import cluster

dbscan = cluster.DBSCAN() 
meanshift = cluster.MeanShift()
spectral = cluster.SpectralClustering()
affinity = cluster.AffinityPropagation()