In [1]:
import sys
import pandas as pd
import numpy as np
import os
import matplotlib.pyplot as plt
from collections import Counter
import advertools as adv
from sklearn.feature_extraction.text import CountVectorizer

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from collections import Counter
from num2words import num2words

import nltk
import os
import string
import numpy as np
import copy
import pandas as pd
import pickle
import re
import math
from scipy.special import logsumexp

In [2]:
# Load the raw interaction table from the local ds_dataset directory.
rd_interaction = pd.read_csv('ds_dataset/raw-data_interaction.csv')
# Load the raw recipe table; later cells read its 'cooking_directions',
# 'ingredients' and 'recipe_name' columns.
rd_recipe = pd.read_csv('ds_dataset/raw-data_recipe.csv')

## Preprocessing the data in 'cooking directions'

In [3]:
def convert_lower_case(data):
    """Return ``data`` lower-cased element-wise via ``np.char.lower``.

    The result is a NumPy string array (0-d when ``data`` is a single string).
    """
    lowered = np.char.lower(data)
    return lowered


In [4]:
def remove_stop_words(data):
    """Drop English stop words and single-character tokens from ``data``.

    ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
    Every surviving token is emitted with one leading space, so the returned
    string starts with a space whenever at least one token is kept and is the
    empty string otherwise.
    """
    # A set gives constant-time membership tests; the stop-word list would be
    # scanned linearly for every token.
    stop_words = set(stopwords.words('english'))
    kept = [
        w for w in word_tokenize(str(data))
        if w not in stop_words and len(w) > 1
    ]
    # One join instead of repeated string concatenation; same output format.
    return "".join(" " + w for w in kept)

In [5]:
def remove_apostrophe(data):
    """Delete every apostrophe (') from ``data`` using ``np.char.replace``."""
    apostrophe, replacement = "'", ""
    return np.char.replace(data, apostrophe, replacement)

In [6]:
def stemming(data):
    """Stem every token of ``data`` with a Porter stemmer.

    ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
    Each stem is emitted with one leading space; an input with no tokens
    yields the empty string.
    """
    porter = PorterStemmer()
    stems = [porter.stem(token) for token in word_tokenize(str(data))]
    return "".join(" " + stem for stem in stems)
    

In [7]:
def convert_numbers(data):
    """Spell out integer tokens as words and detach letters glued to digits.

    ``data`` is converted with ``str()`` and tokenised with ``word_tokenize``.
    For each token:

    * if it contains a digit, every run of ASCII letters is surrounded by
      single spaces (``"350f"`` becomes ``"350 f "``) so that a later pass can
      tokenise the number and the letters separately;
    * if the (possibly padded) token parses as an ``int`` it is replaced by
      ``num2words`` of that integer; otherwise it is kept unchanged.

    Tokens are emitted with one leading space each, and hyphens in the final
    text (e.g. from "twenty-five") are replaced by spaces.

    Returns the NumPy string array produced by ``np.char.replace``.
    """
    tokens = word_tokenize(str(data))

    pieces = []
    for w in tokens:
        # Apply the split once per token. Re-applying it for every digit
        # character only stacks additional spaces around the same letter runs.
        if any(character.isdigit() for character in w):
            w = re.sub("[A-Za-z]+", lambda ele: " " + ele[0] + " ", w)

        try:
            w = num2words(int(w))
        except (ValueError, OverflowError):
            # ValueError: the token is not an integer literal.
            # OverflowError: the integer is too large for num2words.
            # In both cases the token is kept as it is.
            pass
        pieces.append(" " + w)

    return np.char.replace("".join(pieces), "-", " ")

In [8]:
def remove_punctuation(data):
    """Replace punctuation with spaces and expand a few unit abbreviations.

    Steps, in order:

    1. " m" followed by a literal backslash-n becomes " minutes "; "h"
       followed by a backslash, and a standalone " h ", become " hours ".
    2. Any remaining literal backslash-n pair becomes a space.
    3. Each character of ``symbols`` becomes a space, collapsing one level of
       double spaces after every symbol.
    4. Commas are deleted; standalone " f " / " c " become " fahrenheit " /
       " celcius "; a " u'" prefix becomes a space.

    Returns the NumPy string array produced by ``np.char.replace``.
    """
    # The backslash is written as an explicit escape. The previous spelling
    # relied on an invalid escape sequence, which newer Python versions warn
    # about. The set of characters is unchanged.
    symbols = "!\"#$%&()*+-./:;<=>?@[\\]^_`{|}~"

    data = np.char.replace(data, " m\\n", ' minutes ' )
    data = np.char.replace(data, 'h\\', ' hours ')
    data = np.char.replace(data, ' h ', ' hours ')

    data = np.char.replace(data, "\\n", ' ')

    for symbol in symbols:
        data = np.char.replace(data, symbol, ' ')
        data = np.char.replace(data, "  ", " ")

    data = np.char.replace(data, ',', '')
    data = np.char.replace(data, ' f ', ' fahrenheit ')
    # NOTE(review): 'celcius' looks like a misspelling of 'celsius'. It is kept
    # because this string becomes a vocabulary token; confirm before changing.
    data = np.char.replace(data, ' c ', ' celcius ')
    data = np.char.replace(data, " u'", ' ' )

    return data

In [9]:
def remove_otherwords(data):
    """Replace every occurrence of 'prep' and 'directions' in ``data`` with a space.

    Matching is on raw substrings, so the words are also replaced when they
    appear inside longer words.
    """
    for unwanted in ('prep', 'directions'):
        data = np.char.replace(data, unwanted, ' ')
    return data

In [10]:
def preprocess(data):
    """Run the full text-cleaning pipeline on ``data`` and return the result.

    The steps are applied strictly in the order listed below. Number
    conversion is repeated because each pass only handles what the previous
    passes have already split apart.
    """
    pipeline = (
        convert_lower_case,
        convert_numbers,
        remove_punctuation,  # commas are removed separately inside this step
        convert_numbers,
        remove_apostrophe,
        remove_stop_words,
        convert_numbers,
        remove_punctuation,
        convert_numbers,
        # Needed again because num2words emits stop words
        # (101 -> "one hundred and one").
        remove_stop_words,
    )
    for step in pipeline:
        data = step(data)
    return data

In [11]:
# One token list per recipe: preprocess the cooking directions, then tokenise.
processed_text = [
    word_tokenize(preprocess(directions))
    for directions in rd_recipe['cooking_directions']
]

# Calculating DF of all words

DF (document frequency) is the count of occurrences of term t across the document set of N documents. 
In other words, DF is the number of documents in which the word is present.

df(t) = number of the N documents in which t occurs

To keep this in a range as well, we normalize by dividing by the total number of documents. Our main goal is to know the INFORMATIVENESS of a term. The higher a term's DF, the less informative the term is.

In [12]:

# Document frequency: DF[term] = number of documents whose tokens contain term.
DF = {}

N = len(processed_text)
for tokens in processed_text:
    # dict.fromkeys de-duplicates the tokens while keeping first-occurrence
    # order, so each document counts at most once per term and the key order
    # of DF is the order in which terms are first seen in the corpus.
    for w in dict.fromkeys(tokens):
        DF[w] = DF.get(w, 0) + 1

You can see below that "directions" pops up in every recipe, so it is the least informative term.

In [27]:
# Show the first four (term, document frequency) pairs in DF's insertion order.
list(DF.items())[:4]

[('directions', 49698), ('prep', 44217), ('five', 36921), ('ncook', 36457)]

In [14]:
# Number of distinct terms, i.e. the number of keys in DF.
total_vocab_size = len(DF)
total_vocab_size

9079

In [15]:
# Vocabulary as a list, in DF's key order; a term's position is its column index.
total_vocab = list(DF)
total_vocab[:10]

['directions',
 'prep',
 'five',
 'ncook',
 'two',
 'hours',
 'forty',
 'minutes',
 'ready',
 'eleven']

In [16]:
def doc_freq(word):
    """Return the document frequency stored in ``DF`` for ``word``.

    Words that are not in ``DF`` have a document frequency of 0.
    """
    # dict.get expresses "missing means 0" directly, without a bare except
    # that would hide unrelated errors.
    return DF.get(word, 0)

TF is individual to each document and word. 

IDF is the inverse of the document frequency which is proportional to the informativeness of term t. When we calculate IDF, it will be very low for the most common words such as stop words. N/df therefore would be low. This gives what we want, a relative weightage.

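If idf(t) = N/df, the value explodes when N is very large and df is very small, and it is undefined when df = 0. Taking the logarithm dampens the ratio and adding 1 to the denominator avoids division by zero, which gives the smoothed formula shown below.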

tf(t,d) = count of t in d / number of words in d

df(t) = number of the N documents in which t occurs

idf(t) = log((N+1)/(df+1))

Finally, by taking a multiplicative value of TF and IDF, we get TF-IDF score. There are many different variations of TF-IDF but for now let us concentrate on this basic version:

tf-idf(t,d) = tf(t,d) * log((N+1)/(df+1))

We use tf-idf values to represent the weight of each term within each document. A series of tf-idf values of our total_vocab will form the vector of our document.



In [17]:
# tf_idf maps (document index, term) -> tf-idf weight of that term in that document.
tf_idf = {}

doc = 0
for i in range(N):
    doc_tokens = processed_text[i]
    term_counts = Counter(doc_tokens)
    doc_length = len(doc_tokens)

    for term in np.unique(doc_tokens):
        # tf: share of the document's tokens that are this term.
        term_frequency = term_counts[term] / doc_length
        # idf: smoothed log ratio of corpus size to document frequency.
        inverse_doc_frequency = np.log((N + 1) / (doc_freq(term) + 1))
        tf_idf[doc, term] = term_frequency * inverse_doc_frequency

    doc += 1

## Cosine similarity function

There are many ways to calculate similarities between queries/documents. In this method, we need to convert our text data into numerical values. Here, using gen_vector(), we have converted a series of words/strings into a vector. The vector is composed of tf-idf values of a token. Then, using cosine_sim, we calculate the similarity between our query vector and each vector of our 'cooking_directions' doc.

In [18]:
def cosine_sim(a, b):
    """Return the cosine similarity of vectors ``a`` and ``b``.

    When either vector has zero norm the cosine is undefined; 0.0 is returned
    in that case instead of NaN. NaN values are placed last by ``argsort``, so
    they would otherwise end up ranked as the best matches.
    """
    denominator = np.linalg.norm(a) * np.linalg.norm(b)
    if denominator == 0:
        return 0.0
    return np.dot(a, b) / denominator

In [19]:
# Dense document-term matrix: D[doc, column] holds the tf-idf weight of the
# vocabulary term at that column in that document.
# Term -> column lookup built once, so each entry costs a dict lookup instead
# of a linear scan of total_vocab. setdefault keeps the first position of a
# term, matching list.index.
vocab_index = {}
for position, term in enumerate(total_vocab):
    vocab_index.setdefault(term, position)

D = np.zeros((N, total_vocab_size))
for (doc_id, term), weight in tf_idf.items():
    column = vocab_index.get(term)
    if column is None:
        # Terms outside the vocabulary are skipped.
        continue
    D[doc_id, column] = weight

In [20]:
def gen_vector(tokens):
    """Build the tf-idf vector of a tokenised query over ``total_vocab``.

    Parameters
    ----------
    tokens : sequence of str
        Query tokens.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``len(total_vocab)``. The entry for each query
        token found in the vocabulary is ``tf * idf``, where ``tf`` is the
        token's share of the query tokens and ``idf`` is
        ``log((N + 1) / (df + 1))``. All other entries are 0. Tokens that are
        not in the vocabulary are ignored.
    """
    Q = np.zeros((len(total_vocab)))

    counter = Counter(tokens)
    words_count = len(tokens)

    for token in np.unique(tokens):
        try:
            ind = total_vocab.index(token)
        except ValueError:
            # list.index raises ValueError for out-of-vocabulary tokens.
            continue

        tf = counter[token]/words_count
        df = doc_freq(token)
        idf = math.log((N+1)/(df+1))
        Q[ind] = tf*idf

    return Q

In [21]:
def cosine_similarity(k, query):
    """Print and return the indices of the ``k`` documents most similar to ``query``.

    The query is preprocessed and tokenised, turned into a tf-idf vector with
    ``gen_vector`` and compared against every row of ``D`` with ``cosine_sim``.

    Parameters
    ----------
    k : int
        Number of results wanted. Values of 0 or less yield an empty result.
    query : str
        Free-text query.

    Returns
    -------
    numpy.ndarray
        Row indices into ``D``, ordered from most to least similar.
    """
    print("Cosine Similarity")
    preprocessed_query = preprocess(query)
    tokens = word_tokenize(str(preprocessed_query))

    print("\nQuery:", query)
    print("")
    print(tokens)

    query_vector = gen_vector(tokens)

    print(query_vector)

    d_cosines = np.array(
        [cosine_sim(query_vector, d) for d in D], dtype=float
    )
    # An undefined similarity (NaN) counts as no similarity. argsort places
    # NaN last, which would otherwise rank it first.
    d_cosines = np.where(np.isnan(d_cosines), 0.0, d_cosines)

    # Reverse first, then slice: a [-k:] slice returns every document when
    # k == 0, because -0 is 0.
    out = d_cosines.argsort()[::-1][:max(k, 0)]

    print("")

    print(out)
    return out

In [22]:
# Retrieve the indices of the 10 recipes whose directions best match the query.
Q = cosine_similarity(10, "pork skillet soup potatoes bake fry pepper")

Cosine Similarity

Query: pork skillet soup potatoes bake fry pepper

['pork', 'skillet', 'soup', 'potatoes', 'bake', 'fry', 'pepper']
[0. 0. 0. ... 0. 0. 0.]

[  403 13952  7229 13059  9539  7500 13392  4099   573  9255]


In [23]:
# Show the preprocessed directions, ingredients and name of recipe 403,
# with a line holding a single space between consecutive fields.
for position, column in enumerate(('cooking_directions', 'ingredients', 'recipe_name')):
    if position:
        print(" ")
    print(preprocess(rd_recipe[column][403]))

 directions prep five ncook twenty five minutes ready thirty minutes season pork salt black pepper heat one tablespoon olive oil twelve inch skillet medium high heat add pork batches cook six minutes browned sides remove pork skillet add onion skillet cook five minutes tender crisp stirring occasionally stir honey mustard soup heat boil return pork skillet reduce heat low cook five minutes pork cooked serve pork soup mixture potatoes
 
 boneless pork tenderloins medium onion honey dijon style mustard carton campbells® sweet onion soup hot mashed potatoes
 
 sweet onion pork medallions
