# 0. Dependencies

### mount google drive

In [2]:
# Mount Google Drive inside the Colab runtime at /content/gdrive so the
# project folder on Drive can be reached through the normal file system.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [3]:
# Switch the working directory to the project folder on the mounted Drive;
# the relative paths used later (./data/..., ./model/...) resolve against it.
%cd /content/gdrive/MyDrive/!_2022_fall/PDSP/kaggle/

/content/gdrive/MyDrive/!_2022_fall/PDSP/kaggle


### import packages

In [4]:
import pandas as pd

from keras.preprocessing.text import Tokenizer
import re
import nltk
from string import punctuation 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import WordPunctTokenizer

# fetch the NLTK resources this notebook relies on (same order as before)
for resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4'):
  nltk.download(resource)

# English stop words, kept as a set for fast membership tests
en_stop = set(stopwords.words('english'))

from gensim.models import FastText


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


# 1. Dataset preparation

### load dataset

In [5]:
# load the raw recipe table
df_train = pd.read_csv("./data/RAW_recipes.csv")

# pull out the text columns used below
# (note: `reviews` is read from the 'description' column)
steps, reviews, ingredients = (
    df_train[col] for col in ('steps', 'description', 'ingredients')
)

### normalization & tokenizing 

In [6]:
def preprocessing(document, tok=None, stop_words=None):
  """Normalize one raw text field and return it as a list of clean tokens.

  Pipeline: replace non-word characters with spaces, drop digits, drop
  isolated single letters, collapse whitespace, lowercase, tokenize, then
  discard tokens of length <= 2 and stop words.

  Args:
    document: raw text; non-string values are converted with ``str``.
      ``None`` and float NaN (pandas' marker for a missing cell) yield ``[]``
      instead of the spurious tokens "none" / "nan".
    tok: object with a ``tokenize(str) -> list[str]`` method. Defaults to the
      module-level ``tokenizer``.
    stop_words: container of lowercase words to discard. Defaults to the
      module-level ``en_stop``.

  Returns:
    list[str]: lowercase tokens, in their original order.
  """
  # missing values must not leak into the corpus as the words "nan"/"none"
  if document is None or (isinstance(document, float) and document != document):
    return []

  # the globals are only looked up when no override is given, so the function
  # can still be defined before `tokenizer` is created
  tok = tokenizer if tok is None else tok
  stop_words = en_stop if stop_words is None else stop_words

  # remove special characters
  text = re.sub(r'\W', ' ', str(document))

  # remove numbers
  text = re.sub(r'[0-9]+', '', text)

  # remove single characters (in the middle, then at the very start;
  # `^` is an anchor here, not a literal caret)
  text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
  text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

  # substituting multiple spaces with single space
  text = re.sub(r'\s+', ' ', text)

  # converting to lowercase (must happen before the stop-word check,
  # because the stop-word list is lowercase)
  text = text.lower()

  # tokenizing
  tokens = tok.tokenize(text)

  # remove short tokens and stop words
  return [w for w in tokens if len(w) > 2 and w not in stop_words]

In [7]:
# tokenizer instance that preprocessing() picks up as a global at call time
tokenizer = nltk.WordPunctTokenizer()

# run the cleaning pipeline over every entry of both text columns
preprocessed_steps = list(map(preprocessing, steps))
preprocessed_reviews = list(map(preprocessing, reviews))


In [8]:

# merge: a single training corpus, steps first, then the description texts
dataset = preprocessed_steps + preprocessed_reviews

# 2. Train FastText model

* quick tutorial : https://github.com/PacktPublishing/fastText-Quick-Start-Guide/blob/master/chapter5/fasttext%20with%20gensim.ipynb

* learn more about the gensim fastText model parameter : https://radimrehurek.com/gensim/models/fasttext.html

  * window (int, optional) – The maximum distance between the current and predicted word within a sentence.

`fooddotcom_v1`  
 * dataset : steps + review
 * vector size : 100
 * model : skip-gram 
  
 `fooddotcom_v2`  
 * dataset : steps + review
 * vector size : 100
 * model : skip-gram 
 * character n-gram range : min_n=2, max_n=5

In [17]:
# Train the `fooddotcom_v2` skip-gram model (sg=1) using character n-grams of
# length 2 to 5. The result is bound to `model` because the cells below query
# and save `model`; without this assignment they fail on a fresh kernel.
# NOTE: `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size`.
model = FastText(dataset, size=100, window=5, min_count=5, min_n=2, max_n=5, workers=4, sg=1)

In [22]:
# quick test: show the entries most similar to the query string.
# NOTE(review): the query contains a space, so it is not a single token as
# produced by preprocessing(); presumably FastText builds its vector from
# character n-grams — confirm this is the intended sanity check.
model.wv.most_similar("olive oil")

[('olive', 0.8815826177597046),
 ('oliver', 0.8713400363922119),
 ('olivetomato', 0.8702647686004639),
 ('kalamata', 0.8153486251831055),
 ('olives', 0.8048213720321655),
 ('toscano', 0.7860913276672363),
 ('olivio', 0.7833059430122375),
 ('olio', 0.7641619443893433),
 ('pepperocini', 0.7613413333892822),
 ('calamata', 0.7546628713607788)]

In [23]:
# Persist the trained model. Create the target directory first so the save
# cannot fail (and lose the training run) just because ./model is missing.
import os
os.makedirs('./model', exist_ok=True)
model.save('./model/fooddotcom_v2')