<a href="https://colab.research.google.com/github/Verschworer/HSE_ML_final/blob/main/spark_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c whats-cooking  

Saving kaggle.json to kaggle.json
Downloading sample_submission.csv.zip to /content
  0% 0.00/25.8k [00:00<?, ?B/s]
100% 25.8k/25.8k [00:00<00:00, 39.2MB/s]
Downloading train.json.zip to /content
  0% 0.00/1.76M [00:00<?, ?B/s]
100% 1.76M/1.76M [00:00<00:00, 58.1MB/s]
Downloading test.json.zip to /content
  0% 0.00/426k [00:00<?, ?B/s]
100% 426k/426k [00:00<00:00, 133MB/s]


### Data

In [None]:
# ! pip install -q pyspark

In [2]:
!wget http://setup.johnsnowlabs.com/colab.sh -O - | bash

--2021-12-25 09:19:13--  http://setup.johnsnowlabs.com/colab.sh
Resolving setup.johnsnowlabs.com (setup.johnsnowlabs.com)... 51.158.130.125
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://setup.johnsnowlabs.com/colab.sh [following]
--2021-12-25 09:19:14--  https://setup.johnsnowlabs.com/colab.sh
Connecting to setup.johnsnowlabs.com (setup.johnsnowlabs.com)|51.158.130.125|:443... connected.
HTTP request sent, awaiting response... 302 Moved Temporarily
Location: https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh [following]
--2021-12-25 09:19:15--  https://raw.githubusercontent.com/JohnSnowLabs/spark-nlp/master/scripts/colab_setup.sh
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.111.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:44

In [3]:
# import libraries

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import * #JVM типы данных


from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.feature import *
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import *

import sparknlp
from sparknlp import DocumentAssembler, Finisher
from sparknlp.annotator import *
from sparknlp.base import *

import numpy as np
import pandas as pd

from itertools import chain

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

#SPARK UP

In [4]:
spark = SparkSession.builder \
    .appName("Spark NLP") \
    .master("local[5]") \
    .config("spark.driver.memory","5G")\
    .config("spark.driver.maxResultSize", "0") \
    .config("spark.kryoserializer.buffer.max", "2000M") \
    .config("spark.jars.packages", "com.johnsnowlabs.nlp:spark-nlp_2.12:3.3.4") \
    .config("spark.sql.execution.arrow.pyspark.enabled", "false") \
    .getOrCreate()

In [5]:
spark # spark is alive

In [6]:
! unzip '/content/train.json.zip'
! unzip '/content/test.json.zip'
! unzip '/content/sample_submission.csv.zip'

Archive:  /content/train.json.zip
  inflating: train.json              
   creating: __MACOSX/
  inflating: __MACOSX/._train.json   
Archive:  /content/test.json.zip
  inflating: test.json               
  inflating: __MACOSX/._test.json    
Archive:  /content/sample_submission.csv.zip
  inflating: sample_submission.csv   


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

In [7]:
df = spark.read.json('train.json', multiLine=True)
sub_df = spark.read.json('test.json', multiLine=True)

In [8]:
print((df.count(), len(df.columns)))

(39774, 3)


In [9]:
df.show(truncate=False)

+-----------+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|cuisine    |id   |ingredients                                                                                                                                                                                                                                       |
+-----------+-----+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|greek      |10259|[romaine lettuce, black olives, grape tomatoes, garlic, pepper, purple onion, seasoning, garbanzo beans, feta cheese crumbles]                                                                  

In [10]:
# check the NULL
df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns]).show()

+-------+---+-----------+
|cuisine| id|ingredients|
+-------+---+-----------+
|      0|  0|          0|
+-------+---+-----------+



In [11]:
# check the NaNs
df_temp = df.withColumn("ingredients", F.concat_ws(', ' ,F.col("ingredients")))
df_temp.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df_temp.columns]).show()


+-------+---+-----------+
|cuisine| id|ingredients|
+-------+---+-----------+
|      0|  0|          0|
+-------+---+-----------+



In [12]:
len(df.select('cuisine').distinct().rdd.map(lambda r: r[0]).collect())

20

In [13]:
df.groupBy('cuisine').count().orderBy('count', ascending=False).show()

+------------+-----+
|     cuisine|count|
+------------+-----+
|     italian| 7838|
|     mexican| 6438|
| southern_us| 4320|
|      indian| 3003|
|     chinese| 2673|
|      french| 2646|
|cajun_creole| 1546|
|        thai| 1539|
|    japanese| 1423|
|       greek| 1175|
|     spanish|  989|
|      korean|  830|
|  vietnamese|  825|
|    moroccan|  821|
|     british|  804|
|    filipino|  755|
|       irish|  667|
|    jamaican|  526|
|     russian|  489|
|   brazilian|  467|
+------------+-----+



In [14]:
df = df.withColumn("size",F.size(F.col("ingredients")))

In [15]:
df.groupBy('cuisine') \
  .agg(F.count('cuisine').alias('total_cuisine_count'),
       F.max('size').alias('size_max'),
       F.min('size').alias('size_min'),
       )\
  .sort(F.col('total_cuisine_count').desc()) \
  .show()

+------------+-------------------+--------+--------+
|     cuisine|total_cuisine_count|size_max|size_min|
+------------+-------------------+--------+--------+
|     italian|               7838|      65|       1|
|     mexican|               6438|      52|       1|
| southern_us|               4320|      40|       1|
|      indian|               3003|      49|       1|
|     chinese|               2673|      38|       2|
|      french|               2646|      31|       1|
|cajun_creole|               1546|      31|       2|
|        thai|               1539|      40|       1|
|    japanese|               1423|      34|       1|
|       greek|               1175|      27|       1|
|     spanish|                989|      35|       1|
|      korean|                830|      29|       2|
|  vietnamese|                825|      31|       1|
|    moroccan|                821|      31|       2|
|     british|                804|      30|       2|
|    filipino|                755|      38|   

In [16]:
df.filter(df['size'] < 3).groupBy('size') \
  .agg(F.count('cuisine').alias('total_cuisine_count')
  )\
  .sort(F.col('size')) \
  .show()

+----+-------------------+
|size|total_cuisine_count|
+----+-------------------+
|   1|                 22|
|   2|                193|
+----+-------------------+



#PREPROCESSING

In [18]:
# import re

# def remove_punctuations(x):
#     res=[]
#     for x_ in x:
#         res.append(re.sub(r'[^\w\s]','',x_))
#     return res

# rm_punct = F.udf(remove_punctuations, ArrayType(StringType()))

# def remove_digits(x):
#     res=[]
#     for x_ in x:
#         res.append(re.sub(r"(\d)", "",x_))
#     return res

# rm_digits = F.udf(remove_digits, ArrayType(StringType()))

# def remove_paranthesis(x):
#     res=[]
#     for x_ in x:
#         res.append(re.sub(r'\([^)]*\)', '',x_))
#     return res

# rm_paranthesis = F.udf(remove_paranthesis, ArrayType(StringType()))

# def remove_brand(x):
#     res=[]
#     for x_ in x:
#         res.append(re.sub(u'\w*\u2122', '',x_))
#     return res

# rm_brand = F.udf(remove_brand, ArrayType(StringType()))

# def lower_case(x):
#     res = []
#     for x_ in x:
#         res.append(x_.lower())
#     return res

# convert_to_lower = F.udf(lower_case, ArrayType(StringType()))

In [19]:
# df = df.withColumn("ingredients", rm_punct(F.col("ingredients")))

# df = df.withColumn("ingredients", rm_digits(F.col("ingredients")))

# df = df.withColumn("ingredients", rm_paranthesis(F.col("ingredients")))

# df = df.withColumn("ingredients", rm_brand(F.col("ingredients")))

# df = df.withColumn("ingredients", convert_to_lower(F.col("ingredients")))

In [17]:
corpus_train = df.select("ingredients")
corpus_test = sub_df.select("ingredients")
corpus = corpus_train.union(corpus_test) \
        .withColumn("ingredients", F.concat_ws(', ' ,F.col("ingredients")))

In [18]:
#annotators
assembler = DocumentAssembler()\
    .setInputCol('ingredients')\
    .setOutputCol('docs')

#re-token
regexTokenizer = RegexTokenizer()\
    .setInputCols(['docs'])\
    .setOutputCol('token')

#lemmatizaton
lemmatizer = LemmatizerModel.pretrained()\
    .setInputCols(['token'])\
    .setOutputCol('lemma')

#clean garbage
normalizer = Normalizer()\
    .setCleanupPatterns([
        '[^a-zA-Z.-]+',
        '^[^a-zA-Z]+',
        '[^a-zA-Z]+$',
    ])\
    .setInputCols(['lemma'])\
    .setOutputCol('normalized')\
    .setLowercase(True)

#clean temp phases
finisher = Finisher()\
    .setInputCols(['normalized'])\
    .setOutputCols(['normalized'])\
    .setOutputAsArray(True)

#stop words
stopwords = [] # ["kraft", "uncle"]
stopwordsRemover = StopWordsRemover(inputCol="normalized", 
                                    outputCol="filtered")\
                                    .setStopWords(stopwords)
#vectorizing
# count
countVectors = CountVectorizer(inputCol="filtered", 
                               outputCol="count_features",
                               binary=False)
#idf from count
idf = IDF(inputCol="count_features", outputCol="tfidf_features") 

lemma_antbnc download started this may take some time.
Approximate size to download 907.6 KB
[OK!]


In [19]:
# unarray ingridients
df_tr = df.withColumn("ingredients", F.concat_ws(', ' ,F.col("ingredients")))
df_ts = sub_df.withColumn("ingredients", F.concat_ws(', ' ,F.col("ingredients")))

In [20]:
pipeline = Pipeline(stages=[
                            assembler,
                            regexTokenizer,
                            lemmatizer, 
                            normalizer,
                            finisher,
                            stopwordsRemover,
                            countVectors, 
                            idf
                            ]
                    )

pipelinefit = pipeline.fit(corpus)
next_df = pipelinefit.transform(df_tr)
next_sub = pipelinefit.transform(df_ts)

In [21]:
#labeling
label_stringIdx = StringIndexer(inputCol = "cuisine", 
                                outputCol = "label")

label_Idx = label_stringIdx.fit(next_df)
next_df = label_Idx.transform(next_df)

In [22]:
# balance 

label_collect = next_df.select("label").groupBy("label").count().collect()
unique_label = [x["label"] for x in label_collect]
total_label = sum([x["count"] for x in label_collect])
unique_label_count = len(label_collect)
bin_count = [x["count"] for x in label_collect]

class_weights_label = {i: ii for i, ii in zip(unique_label, total_label / (unique_label_count * np.array(bin_count)))}

mapping_expr = F.create_map([F.lit(x) for x in chain(*class_weights_label.items())])

next_df = next_df.withColumn("weight", mapping_expr[F.col("label")])

print("\n".join("{!r}: {!r},".format(k, v) for k, v in sorted(class_weights_label.items())))

0.0: 0.25372544016330695,
1.0: 0.30890027958993477,
2.0: 0.46034722222222224,
3.0: 0.6622377622377622,
4.0: 0.7439955106621773,
5.0: 0.7515873015873016,
6.0: 1.2863518758085382,
7.0: 1.2922027290448344,
8.0: 1.3975404075895994,
9.0: 1.6925106382978723,
10.0: 2.0108190091001013,
11.0: 2.3960240963855424,
12.0: 2.4105454545454545,
13.0: 2.4222898903775882,
14.0: 2.473507462686567,
15.0: 2.634039735099338,
16.0: 2.981559220389805,
17.0: 3.7807984790874523,
18.0: 4.066871165644172,
19.0: 4.258458244111349,


#ML FLOW

In [23]:
next_df.show(truncate=False)

+-----------+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

In [28]:
#some special for baseline
next_df = next_df.withColumn("norm", F.concat_ws(', ' ,F.col("normalized")))
next_df = next_df.withColumn("labels", F.array(next_df["cuisine"]))

emb_assembler = DocumentAssembler()\
    .setInputCol('norm')\
    .setOutputCol('docs')

embeddings = UniversalSentenceEncoder.pretrained() \
  .setInputCols(["docs"])\
  .setOutputCol("embeddings")

emb_pipeline = Pipeline(stages=[
                            emb_assembler,
                            embeddings
                            ]
                        )

emb_pipelinefit = emb_pipeline.fit(next_df)
next_df = emb_pipelinefit.transform(next_df)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[OK!]


In [27]:
docClassifier = ClassifierDLApproach() \
                .setInputCols("embeddings") \
                .setOutputCol("category") \
                .setLabelColumn("cuisine") \
                .setBatchSize(256) \
                .setMaxEpochs(30) \
                .setLr(1e-3) \
                .setValidationSplit(0.1) \
                .setRandomSeed(8) \
                .setEnableOutputLogs(True)

In [28]:
docClassifiermodel = docClassifier.fit(next_df)

In [29]:
! ls -l ~/annotator_logs/

total 4
-rw-r--r-- 1 root root 2838 Dec 24 17:41 ClassifierDLApproach_f9b8aea789f6.log


In [30]:
import os
log_file_name = os.listdir("/root/annotator_logs")[0]

with open("/root/annotator_logs/"+log_file_name, "r") as log_file :
    print(log_file.read())

Training started - epochs: 30 - learning_rate: 0.001 - batch_size: 256 - training_examples: 35797 - classes: 20
Epoch 0/30 - 9.78s - loss: 393.5794 - acc: 0.28901052 - val_acc: 33.643448 - batches: 140
Epoch 1/30 - 9.30s - loss: 387.81805 - acc: 0.33622274 - val_acc: 33.894897 - batches: 140
Epoch 2/30 - 8.94s - loss: 388.7957 - acc: 0.35591152 - val_acc: 36.585365 - batches: 140
Epoch 3/30 - 9.03s - loss: 387.5734 - acc: 0.3767917 - val_acc: 37.993465 - batches: 140
Epoch 4/30 - 9.55s - loss: 386.85504 - acc: 0.38520002 - val_acc: 38.446064 - batches: 140
Epoch 5/30 - 9.19s - loss: 386.41275 - acc: 0.38973588 - val_acc: 38.69751 - batches: 140
Epoch 6/30 - 9.08s - loss: 386.13052 - acc: 0.39271474 - val_acc: 38.974102 - batches: 140
Epoch 7/30 - 8.96s - loss: 385.9045 - acc: 0.3943447 - val_acc: 38.948956 - batches: 140
Epoch 8/30 - 9.18s - loss: 385.69308 - acc: 0.39547446 - val_acc: 39.049538 - batches: 140
Epoch 9/30 - 8.61s - loss: 385.4787 - acc: 0.39680663 - val_acc: 39.07468 - 

In [24]:
(trainingData, testData) = next_df.randomSplit([0.7, 0.3], seed = 8)

print("We have %d training examples and %d test examples." % (trainingData.count(), testData.count()))

We have 27961 training examples and 11813 test examples.


In [125]:
# sub_next_df = sub_next_df.withColumn("norm", F.concat_ws(', ' ,F.col("normalized")))

# emb_sub = Pipeline(stages=[
#                             emb_assembler,
#                             embeddings
#                             ]
#                     )

# emb_subfit = emb_sub.fit(sub_next_df)
# sub_next_df = emb_subfit.transform(sub_next_df)

# doc_pred = docClassifiermodel.transform(sub_next_df)

In [25]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="accuracy")

In [26]:
idx_to_string = IndexToString(
    inputCol="prediction", outputCol="cuisine", labels=label_Idx.labels)

In [130]:
count_nb = NaiveBayes(featuresCol="count_features", smoothing=0.2, modelType="multinomial")
count_nbmodel = count_nb.fit(trainingData)
nbCountPredictions = count_nbmodel.transform(testData)
evaluator.evaluate(nbCountPredictions)

0.7160755100313214

In [134]:
count_nbmodel_sub = count_nb.fit(next_df)
count_nbmodel_pred = count_nbmodel_sub.transform(next_sub)
count_nbmodel_sub = idx_to_string.transform(count_nbmodel_pred)
sub_count_nb = count_nbmodel_sub.select("id", "cuisine")
sub_count_nb.write.csv('/content/sub_count_nb.csv', header=True)

In [136]:
! kaggle competitions submit -c whats-cooking -f /content/sub_count_nb.csv/part-00000-b6a658c4-13ff-434d-b8c4-1fbf8232d7fd-c000.csv -m "final count_nb sub"

100% 139k/139k [00:01<00:00, 85.1kB/s]
Successfully submitted to What's Cooking?

##kaggle.com - 0.71962

In [138]:
tfidf_nb = NaiveBayes(featuresCol="tfidf_features", smoothing=0.2, modelType="multinomial")
tfidf_nbmodel = tfidf_nb.fit(trainingData)
nbTfIdfPredictions = tfidf_nbmodel.transform(testData)
evaluator.evaluate(nbTfIdfPredictions)

0.7004994497587403

In [142]:
tfidf_nbmodel_sub = tfidf_nb.fit(next_df)
tfidf_nbmodel_pred = tfidf_nbmodel_sub.transform(next_sub)
tfidf_nbmodel_sub = idx_to_string.transform(tfidf_nbmodel_pred)
sub_tfidf_nb = tfidf_nbmodel_sub.select("id", "cuisine")
sub_tfidf_nb.write.csv('/content/sub_tfidf_nb.csv', header=True)

In [143]:
! kaggle competitions submit -c whats-cooking -f /content/sub_tfidf_nb.csv/part-00000-76eab513-87c4-4193-9a7c-1182f94dfe98-c000.csv -m "final tfidf_nb sub"

100% 138k/138k [00:01<00:00, 73.7kB/s]
Successfully submitted to What's Cooking?

##kaggle.com - 0.70625

In [152]:
count_lr = LogisticRegression(featuresCol="count_features",
                        maxIter=13, 
                        regParam=0.1,
                        family = "multinomial", 
                        elasticNetParam=0, 
                        tol=0.001, 
                        fitIntercept=True, 
                        weightCol="weight")
count_lrModel = count_lr.fit(trainingData)
count_lr_predictions = count_lrModel.transform(testData)
evaluator.evaluate(count_lr_predictions)

0.7338525353424193

In [148]:
count_lr = LogisticRegression(featuresCol="count_features",
                        maxIter=13, 
                        regParam=0.1,
                        family = "multinomial", 
                        elasticNetParam=0, 
                        tol=0.001, 
                        fitIntercept=True)#, 
                        # weightCol="weight")
count_lrModel = count_lr.fit(trainingData)
count_lr_predictions = count_lrModel.transform(testData)
evaluator.evaluate(count_lr_predictions)

0.7452806230424109

In [146]:
count_lrmodel_sub = count_lr.fit(next_df)
count_lrmodel_pred = count_lrmodel_sub.transform(next_sub)
count_lrmodel_sub = idx_to_string.transform(count_lrmodel_pred)
sub_count_lr = count_lrmodel_sub.select("id", "cuisine")
sub_count_lr.write.csv('/content/sub_count_lr.csv', header=True)

In [147]:
! kaggle competitions submit -c whats-cooking -f /content/sub_count_lr.csv/part-00000-b5cf9701-4748-4d4a-8ad5-4b97da8b3566-c000.csv -m "final count_lr sub"

100% 139k/139k [00:01<00:00, 80.2kB/s]
Successfully submitted to What's Cooking?

##kaggle.com - 0.74416

In [154]:
tfidf_lr = LogisticRegression(featuresCol="tfidf_features",
                        maxIter=13, 
                        regParam=0.1,
                        family = "multinomial", 
                        elasticNetParam=0, 
                        tol=0.001, 
                        fitIntercept=True)
tfidf_lrModel = tfidf_lr.fit(trainingData)
tfidf_lr_predictions = tfidf_lrModel.transform(testData)
evaluator.evaluate(tfidf_lr_predictions)

0.7452806230424109

In [155]:
tfidf_lrmodel_sub = tfidf_lr.fit(next_df)
tfidf_lrmodel_pred = tfidf_lrmodel_sub.transform(next_sub)
tfidf_lrmodel_sub = idx_to_string.transform(tfidf_lrmodel_pred)
sub_tfidf_lr = count_lrmodel_sub.select("id", "cuisine")
sub_tfidf_lr.write.csv('/content/sub_tfidf_lr.csv', header=True)

In [156]:
! kaggle competitions submit -c whats-cooking -f /content/sub_tfidf_lr.csv/part-00000-3f5c1141-7a86-4fc0-8efe-d18a2e37044e-c000.csv -m "final tfidf_lr sub"

100% 139k/139k [00:02<00:00, 68.6kB/s]
Successfully submitted to What's Cooking?

##kaggle.com - 0.74416, such as count

In [18]:
lsvc = LinearSVC(maxIter=20000, regParam=3)
# gbt = GBTClassifier(maxIter=30, maxDepth=20)
# fm = FMClassifier(factorSize=2)

In [21]:
count_ovr = OneVsRest(classifier=lsvc, featuresCol="count_features", weightCol="weight")

In [22]:
count_ovrModel = count_ovr.fit(trainingData)
count_ovr_predictions = count_ovrModel.transform(testData)
evaluator.evaluate(count_ovr_predictions)

0.7086260899009565

In [23]:
count_ovrmodel_sub = count_ovr.fit(next_df)
count_ovrmodel_pred = count_ovrmodel_sub.transform(next_sub)
count_ovrmodel_sub = idx_to_string.transform(count_ovrmodel_pred)
sub_count_ovr = count_ovrmodel_sub.select("id", "cuisine")
sub_count_ovr.write.csv('/content/sub_count_ovr.csv', header=True)

In [24]:
! kaggle competitions submit -c whats-cooking -f /content/sub_count_ovr.csv/part-00000-fbc282b0-3228-4454-a697-39d73f20c0dc-c000.csv -m "final tfidf_lr sub"

100% 137k/137k [00:02<00:00, 69.6kB/s]
Successfully submitted to What's Cooking?

##kaggle.com - 0.71198

In [25]:
tfidf_ovr = OneVsRest(classifier=lsvc, featuresCol="tfidf_features", weightCol="weight")

In [26]:
tfidf_ovrModel = tfidf_ovr.fit(trainingData)
tfidf_ovr_predictions = tfidf_ovrModel.transform(testData)
evaluator.evaluate(tfidf_ovr_predictions)

0.7086260899009565

In [34]:
tfidf_ovrmodel_sub = tfidf_ovr.fit(next_df)
tfidf_ovrmodel_pred = tfidf_ovrmodel_sub.transform(next_sub)
tfidf_ovrmodel_sub = idx_to_string.transform(tfidf_ovrmodel_pred)
sub_tfidf_ovr = tfidf_ovrmodel_sub.select("id", "cuisine")
sub_tfidf_ovr.write.csv('/content/sub_tfidf_ovr.csv', header=True)

In [35]:
! kaggle competitions submit -c whats-cooking -f /content/sub_tfidf_ovr.csv/part-00000-f73d6de5-626d-4ac8-b2ac-437d02205ba9-c000.csv -m "final tfidf_ovr(lsvc)+w sub"

100% 137k/137k [00:01<00:00, 84.1kB/s]
Successfully submitted to What's Cooking?

##kaggle.com - 0.71198

In [41]:
fm = FMClassifier(factorSize=2)
ovr = OneVsRest(classifier=fm, featuresCol="tfidf_features")
ovrModel = ovr.fit(trainingData)
ovr_predictions = ovrModel.transform(testData)
evaluator.evaluate(ovr_predictions)

  "as it is not supported by {} now.".format(classifier))


0.6793363243883856

#DL FLOW

In [27]:
%tensorflow_version 2.x
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, MaxPooling1D, Conv1D, GlobalMaxPooling1D, Dropout, LSTM, GRU, Flatten
from tensorflow.keras import utils
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras import utils
from sklearn.preprocessing import LabelEncoder
%matplotlib inline 

In [28]:
next_df.show()

+-----------+-----+--------------------+----+--------------------+--------------------+--------------------+--------------------+-----+-------------------+
|    cuisine|   id|         ingredients|size|          normalized|            filtered|      count_features|      tfidf_features|label|             weight|
+-----------+-----+--------------------+----+--------------------+--------------------+--------------------+--------------------+-----+-------------------+
|      greek|10259|romaine lettuce, ...|   9|[romaine, lettuce...|[romaine, lettuce...|(3190,[0,3,8,11,1...|(3190,[0,3,8,11,1...|  9.0| 1.6925106382978723|
|southern_us|25693|plain flour, grou...|  11|[plain, flour, gr...|[plain, flour, gr...|(3190,[0,1,2,4,11...|(3190,[0,1,2,4,11...|  2.0|0.46034722222222224|
|   filipino|20130|eggs, pepper, sal...|  12|[eggs, pepper, sa...|[eggs, pepper, sa...|(3190,[0,1,2,3,6,...|(3190,[0,1,2,3,6,...| 15.0|  2.634039735099338|
|     indian|22213|water, vegetable ...|   4|[water, vegetable..

In [29]:
train = next_df.select("cuisine", "id", "filtered").toPandas()

In [30]:
train["ing"] = train["filtered"].apply(lambda x: '|'.join(map(str, x)))

In [31]:
train

Unnamed: 0,cuisine,id,filtered,ing
0,greek,10259,"[romaine, lettuce, black, olives, grape, tomat...",romaine|lettuce|black|olives|grape|tomatoes|ga...
1,southern_us,25693,"[plain, flour, ground, pepper, salt, tomatoes,...",plain|flour|ground|pepper|salt|tomatoes|ground...
2,filipino,20130,"[eggs, pepper, salt, mayonaise, cook, oil, gre...",eggs|pepper|salt|mayonaise|cook|oil|green|chil...
3,indian,22213,"[water, vegetable, oil, wheat, salt]",water|vegetable|oil|wheat|salt
4,indian,13162,"[black, pepper, shallots, cornflour, cayenne, ...",black|pepper|shallots|cornflour|cayenne|pepper...
...,...,...,...,...
39769,irish,29109,"[light, brown, sugar, granulated, sugar, butte...",light|brown|sugar|granulated|sugar|butter|warm...
39770,italian,11462,"[kraft, zesty, italian, dressing, purple, onio...",kraft|zesty|italian|dressing|purple|onion|broc...
39771,irish,2238,"[eggs, citrus, fruit, raisins, sourdough, star...",eggs|citrus|fruit|raisins|sourdough|starter|fl...
39772,chinese,41882,"[boneless, chicken, skinless, thigh, mince, ga...",boneless|chicken|skinless|thigh|mince|garlic|s...


In [32]:
ingredients = train["ing"]

In [33]:
ingredients[15000]

'white|wine|garlic|cloves|crush|red|pepper|flakes|spaghetti|kosher|salt|flat|leaf|parsley|cockles|extra-virgin|olive|oil'

In [34]:
num_words = 3200
max_recipe_len = 200
nb_classes = 20

In [35]:
tokenizer = Tokenizer(num_words=num_words,  split='|')

In [36]:
tokenizer.fit_on_texts(ingredients)

In [37]:
sequences = tokenizer.texts_to_sequences(ingredients)

In [38]:
x_train = pad_sequences(sequences, maxlen=max_recipe_len)

In [39]:
le = LabelEncoder()
le.fit(train["cuisine"])
train["label"] = le.transform(train["cuisine"])
train["label"].value_counts()

9     7838
13    6438
16    4320
7     3003
3     2673
5     2646
2     1546
18    1539
11    1423
6     1175
17     989
12     830
19     825
14     821
1      804
4      755
8      667
10     526
15     489
0      467
Name: label, dtype: int64

In [40]:
y_train = utils.to_categorical(train['label'], nb_classes)

In [41]:
model_lstm = Sequential()
model_lstm.add(Embedding(num_words, 256, input_length=max_recipe_len))
model_lstm.add(LSTM(
                    256, 
                    return_sequences=True, 
                    dropout=0.5
                    )
              )
model_lstm.add(GRU(64))
model_lstm.add(Dense(20, activation='softmax'))

In [42]:
model_lstm.compile(optimizer='adam', 
              loss='categorical_crossentropy', 
              metrics=['accuracy'])

In [43]:
history_lstm = model_lstm.fit(x_train, 
                              y_train, 
                              epochs=10,
                              batch_size=128,
                              validation_split=0.1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [45]:
demo_pipeline = Pipeline(stages=[
                            assembler,
                            regexTokenizer,
                            lemmatizer, 
                            normalizer
                            ]
                    )

demo_pipelinefit = demo_pipeline.fit(corpus)
demo_df = demo_pipelinefit.transform(df_tr)

In [46]:
demo_df.show(truncate=False)

+-----------+-----+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------

1 - 0.69046 on kaggle

2 - 0.70132 on kaggle

3 - 0.71771 on kaggle

In [None]:
# split_udf = F.udf(lambda value: value[1].item(), DoubleType())