In [1]:
from pyspark.sql import SparkSession
spark = SparkSession\
    .builder\
    .appName("WhatsCook")\
    .getOrCreate()

In [2]:
from sklearn.model_selection import train_test_split
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType
from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.ml.feature import StopWordsRemover
from pyspark.ml.feature import HashingTF
from pyspark.ml.feature import IDF
from pyspark.ml.feature import Word2Vec
from pyspark.ml.classification import *
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml import Pipeline
import json

In [3]:
testdata = spark.read.format("json").load("./test.json")
traindata = spark.read.format("json").load("./train.json")

In [4]:
traindata.show(5)

+-------+---+--------------------+-----+
|cuisine| id|         ingredients|label|
+-------+---+--------------------+-----+
|chinese|  0|mince,cloves,grat...|    0|
|chinese|  1|fresh-wide-noodle...|    0|
|chinese|  2|fresh-wide-noodle...|    0|
|chinese|  3|mince-pork,hot-be...|    0|
|chinese|  4|long-grain-white-...|    0|
+-------+---+--------------------+-----+
only showing top 5 rows



In [5]:
def accuracy(result):
    temp = result.collect()
    total = 0.0
    hit = 0.0
    labelmax = 0
    for row in temp:
        total += 1
        if row['label'] == int(row['prediction']):
            hit += 1
        labelmax = max(labelmax, row['prediction'])
    print (hit / total)
    print (labelmax)

Text Extraction

In [6]:
regexTokenizer = RegexTokenizer(inputCol="ingredients", outputCol="words", pattern="[^A-Za-z]+", toLowercase=True)

TF-IDF

In [None]:
hashingTF = HashingTF(inputCol="words", outputCol="raw_features", numFeatures=20)
idf = IDF(inputCol="raw_features", outputCol="features")
pipeline_tfidf = Pipeline(stages=[regexTokenizer, hashingTF, idf])
pipeline_tfidf_model_train = pipeline_tfidf.fit(traindata)
train_tfidf_ready = pipeline_tfidf_model_train.transform(traindata)
test_tfidf = pipeline_tfidf_model_train.transform(testdata)

Word2Vector

In [7]:
word2Vec = Word2Vec(vectorSize=100, minCount=0, inputCol="words", outputCol="features")
pipeline_w2v = Pipeline(stages=[regexTokenizer, word2Vec])
pipeline_w2v_model = pipeline_w2v.fit(traindata)
train_w2v = pipeline_w2v_model.transform(traindata)
test_w2v = pipeline_w2v_model.transform(testdata)
# train_w2v['label','words', 'features'].orderBy("cuisine").show(5, False)

OvR SVM Model

In [8]:
svc = LinearSVC(maxIter = 1000, tol=0.001, aggregationDepth=3, regParam=0.01)
ovr_svc = OneVsRest(classifier=svc)

In [10]:
ovr_svc_model = ovr_svc.fit(train_w2v)
prediction_w2v_ovr = ovr_svc_model.transform(test_w2v)

In [11]:
accuracy(prediction_w2v_ovr)

0.7213727343011769
19.0


LogisticRegression Model

In [12]:
lr = LogisticRegression(maxIter=1000, regParam=0.001)

In [13]:
lrModel_w2v = lr.fit(train_w2v)
prediction_w2v_lr = lrModel_w2v.transform(test_w2v)

In [14]:
accuracy(prediction_w2v_lr)

0.7607593849325659
19.0


DecisionTree Model

In [15]:
DT = DecisionTreeClassifier(maxDepth = 15)
DTmodel = DT.fit(train_w2v)

In [16]:
DTresult = DTmodel.transform(test_w2v)

In [18]:
DTresult.show()

+----------+-----+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|   cuisine|   id|         ingredients|label|               words|            features|       rawPrediction|         probability|prediction|
+----------+-----+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|vietnamese|88554|baguette,olive-oi...|   12|[baguette, olive,...|[-0.0398798672433...|[1.0,0.0,0.0,0.0,...|[0.03333333333333...|       8.0|
|   chinese| 1077|chinese-eggplants...|    0|[chinese, eggplan...|[-0.1610382734177...|[1.0,3.0,0.0,0.0,...|[0.01190476190476...|       4.0|
|    french| 9729|cake-mix,cocoa,cr...|    1|[cake, mix, cocoa...|[-0.0133330086246...|[0.0,5.0,0.0,0.0,...|[0.0,1.0,0.0,0.0,...|       1.0|
|  japanese|71972|soy-sauce,dried-t...|    5|[soy, sauce, drie...|[-0.1885573548421...|[0.0,0.0,0.0,0.0,...|[0.0,0.0,0.0,0.0,...|       5.0|
|   spanish|6

In [17]:
accuracy(DTresult)

0.8246714199811013
19.0


RandomForest Model

In [19]:
rf = RandomForestClassifier(maxDepth = 15)
rfmodel = rf.fit(train_w2v)

In [20]:
rfresult = rfmodel.transform(test_w2v)
rfresult.show()

+----------+-----+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|   cuisine|   id|         ingredients|label|               words|            features|       rawPrediction|         probability|prediction|
+----------+-----+--------------------+-----+--------------------+--------------------+--------------------+--------------------+----------+
|vietnamese|88554|baguette,olive-oi...|   12|[baguette, olive,...|[-0.0398798672433...|[3.57845433890070...|[0.17892271694503...|      12.0|
|   chinese| 1077|chinese-eggplants...|    0|[chinese, eggplan...|[-0.1610382734177...|[3.77167727274225...|[0.18858386363711...|       4.0|
|    french| 9729|cake-mix,cocoa,cr...|    1|[cake, mix, cocoa...|[-0.0133330086246...|[0.01538461538461...|[7.69230769230769...|       1.0|
|  japanese|71972|soy-sauce,dried-t...|    5|[soy, sauce, drie...|[-0.1885573548421...|[0.91163340336134...|[0.04558167016806...|       5.0|
|   spanish|6

In [21]:
accuracy(rfresult)

0.9272828794777082
19.0
