# 0. Dependencies

### mount google drive

In [None]:
# Mount Google Drive into the Colab filesystem so files under
# /content/gdrive can be read and written by the following cells.
from google.colab import drive
drive.mount('/content/gdrive')

Mounted at /content/gdrive


In [None]:
# Switch the working directory to the project folder on the mounted drive;
# the relative paths used later ("./data/...", "./model/...") resolve from here.
%cd /content/gdrive/MyDrive/!_2022_fall/PDSP/kaggle/

/content/gdrive/MyDrive/!_2022_fall/PDSP/kaggle


### import packages

In [None]:
import pandas as pd

from keras.preprocessing.text import Tokenizer
import re
import nltk
from string import punctuation 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import WordPunctTokenizer

# Fetch every NLTK resource this notebook requests, in the original order.
for nltk_resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4'):
  nltk.download(nltk_resource)

# English stop words, kept as a set for fast membership checks.
en_stop = set(nltk.corpus.stopwords.words('english'))

from gensim.models import FastText


# 1. Dataset preparation

### load dataset

In [None]:
# Load the raw recipe table from disk.
df_train = pd.read_csv("./data/RAW_recipes.csv")

# Keep the two free-text columns used to build the training corpus.
# Note that `reviews` holds the 'description' column; the 'ingredients'
# column is currently not loaded.
steps, reviews = df_train['steps'], df_train['description']

### text normalization & tokenizing 

In [None]:
def preprocessing(document):
  """Normalize one raw text entry and return its cleaned tokens.

  Pipeline: strip non-word characters and digits, drop isolated single
  letters, collapse whitespace, lowercase, tokenize, remove stop words and
  short tokens, lemmatize, then filter stop words and short tokens again
  (lemmatization can shorten a word or turn it into a stop word).

  Relies on two module-level names that must exist before the first call:
  `tokenizer` (an object exposing `.tokenize(str)`) and `en_stop` (the set
  of stop words).

  Args:
    document: Raw text. Any value is accepted because it is coerced with
      `str()`; e.g. a missing value read as NaN becomes the string "nan",
      which the final length filter discards.

  Returns:
    list[str]: Lowercase, lemmatized tokens that are longer than three
    characters and are not stop words.
  """
  # remove special characters (anything that is not a word character)
  text = re.sub(r'\W', ' ', str(document))

  # remove numbers
  text = re.sub(r'[0-9]+', '', text)

  # remove single characters: surrounded by whitespace, or at the very start.
  # The start-of-text rule needs the `^` anchor; an escaped `\^` would look
  # for a literal caret, which the first substitution has already removed.
  text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)
  text = re.sub(r'^[a-zA-Z]\s+', ' ', text)

  # substitute multiple spaces with a single space
  text = re.sub(r'\s+', ' ', text)

  # convert to lowercase *before* tokenizing so that the stop-word checks
  # below (en_stop holds lowercase words) also catch capitalized words and
  # so that case variants collapse into one vocabulary entry
  text = text.lower()

  # tokenize
  tokens = tokenizer.tokenize(text)

  # remove very short tokens and stop words
  tokens = [w for w in tokens if len(w) > 2 and w not in en_stop]

  # lemmatize, then re-filter the lemmas
  lemmatizer = WordNetLemmatizer()
  lemmas = [lemmatizer.lemmatize(w) for w in tokens]

  return [w for w in lemmas if len(w) > 3 and w not in en_stop]

In [None]:
# Tokenizer instance that preprocessing() looks up as a module-level name.
tokenizer = nltk.WordPunctTokenizer()

# Clean every entry of both text columns into a list of token lists.
preprocessed_steps = list(map(preprocessing, steps))
preprocessed_reviews = list(map(preprocessing, reviews))


In [None]:
# Merge both collections of token lists into a single training corpus:
# all step entries first, followed by all review entries.
dataset = [*preprocessed_steps, *preprocessed_reviews]

# 2. Train FastText model

* quick tutorial : https://github.com/PacktPublishing/fastText-Quick-Start-Guide/blob/master/chapter5/fasttext%20with%20gensim.ipynb

* learn more about the gensim fastText model parameter : https://radimrehurek.com/gensim/models/fasttext.html

  * window (int, optional) – The maximum distance between the current and predicted word within a sentence.

`fooddotcom_v1`  
 * dataset : steps
 * vector size : 100
 * model : skip-gram 
   
   
 `fooddotcom_v2`  
 * dataset : steps + review
 * vector size : 100
 * model : c-bow
 * n_gram min max

In [None]:
#model2 = FastText(dataset, size=100, window=5, min_count=5, min_n=2, max_n=5, workers=4, sg=1)

In [None]:
# Train the v3 FastText embeddings on the merged corpus.
# NOTE(review): `size` is the keyword used by gensim 3.x; gensim >= 4.0
# names this parameter `vector_size` — adjust if the runtime is upgraded.
model3 = FastText(
    dataset,
    size=200,      # dimensionality of the word vectors
    window=5,      # max distance between current and predicted word
    min_count=5,   # ignore tokens that occur fewer than 5 times
    min_n=2,       # shortest character n-gram
    max_n=5,       # longest character n-gram
    workers=4,     # number of training threads
    sg=0,          # 0 = CBOW, 1 = skip-gram
)

In [None]:
# quick test: list the nearest neighbours of a query in the trained vector space.
# The whole phrase is passed as ONE key (it is not split into words here).
# NOTE(review): presumably FastText builds the query vector from the phrase's
# character n-grams since the phrase is not a vocabulary token — confirm.
model3.wv.most_similar("piece scrod fillets fish choice")

[('fillet', 0.5782290697097778),
 ('pikelet', 0.5710306167602539),
 ('swordfish', 0.5619159936904907),
 ('catfish', 0.5538877248764038),
 ('piglet', 0.5432965159416199),
 ('filet', 0.5417371988296509),
 ('filleted', 0.5275126695632935),
 ('rockfish', 0.5269321203231812),
 ('piece', 0.525778591632843),
 ('monkfish', 0.5253928899765015)]

In [None]:
# save model: persist the trained FastText model under ./model/ (relative to
# the current working directory).
# NOTE(review): nothing in this notebook creates ./model/ — presumably it
# already exists on the drive; confirm before running on a fresh checkout.
model3.save('./model/fooddotcom_v3')