<a href="https://colab.research.google.com/github/Verschworer/HSE_ML_final/blob/main/team8final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c whats-cooking  

mkdir: cannot create directory ‘/root/.kaggle’: File exists
Downloading test.json.zip to /content
  0% 0.00/426k [00:00<?, ?B/s]
100% 426k/426k [00:00<00:00, 63.7MB/s]
Downloading train.json.zip to /content
  0% 0.00/1.76M [00:00<?, ?B/s]
100% 1.76M/1.76M [00:00<00:00, 109MB/s]
Downloading sample_submission.csv.zip to /content
  0% 0.00/25.8k [00:00<?, ?B/s]
100% 25.8k/25.8k [00:00<00:00, 22.8MB/s]


### Data

In [1]:
! pip install -q pyspark

In [1]:
# import libraries

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import * #JVM типы данных


from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler, StringIndexer, RegexTokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, GBTClassifier, NaiveBayes, RandomForestClassifier

import numpy as np
import pandas as pd


import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [2]:
# spark up

spark = SparkSession.builder.master(
    "local[*]"
    ).appName(
    "team8final"
    ).config(
        "spark.sql.execution.arrow.pyspark.enabled", 
        "true").getOrCreate()

In [3]:
spark # spark is alive

In [None]:
# ! unzip '/content/train.json.zip'
# ! unzip '/content/test.json.zip'
# ! unzip '/content/sample_submission.csv.zip'

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [4]:
df = spark.read.json('train.json', multiLine=True)

In [5]:
print((df.count(), len(df.columns)))

(39774, 3)


In [6]:
df.show()

+-----------+-----+--------------------+
|    cuisine|   id|         ingredients|
+-----------+-----+--------------------+
|      greek|10259|[romaine lettuce,...|
|southern_us|25693|[plain flour, gro...|
|   filipino|20130|[eggs, pepper, sa...|
|     indian|22213|[water, vegetable...|
|     indian|13162|[black pepper, sh...|
|   jamaican| 6602|[plain flour, sug...|
|    spanish|42779|[olive oil, salt,...|
|    italian| 3735|[sugar, pistachio...|
|    mexican|16903|[olive oil, purpl...|
|    italian|12734|[chopped tomatoes...|
|    italian| 5875|[pimentos, sweet ...|
|    chinese|45887|[low sodium soy s...|
|    italian| 2698|[Italian parsley ...|
|    mexican|41995|[ground cinnamon,...|
|    italian|31908|[fresh parmesan c...|
|     indian|24717|[tumeric, vegetab...|
|    british|34466|[greek yogurt, le...|
|    italian| 1420|[italian seasonin...|
|       thai| 2941|[sugar, hot chili...|
| vietnamese| 8152|[soy sauce, veget...|
+-----------+-----+--------------------+
only showing top

In [7]:
df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns]).show() # check the NULL

+-------+---+-----------+
|cuisine| id|ingredients|
+-------+---+-----------+
|      0|  0|          0|
+-------+---+-----------+



In [8]:
# check the NaNs
df_temp = df.withColumn("ingredients", F.concat_ws(',' ,F.col("ingredients")))
df_temp.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df_temp.columns]).show()

+-------+---+-----------+
|cuisine| id|ingredients|
+-------+---+-----------+
|      0|  0|          0|
+-------+---+-----------+



In [9]:
len(df.select('cuisine').distinct().rdd.map(lambda r: r[0]).collect())

20

In [10]:
df.select('cuisine').distinct().rdd.map(lambda r: r[0]).collect()

['cajun_creole',
 'greek',
 'russian',
 'korean',
 'indian',
 'spanish',
 'french',
 'vietnamese',
 'italian',
 'mexican',
 'thai',
 'chinese',
 'jamaican',
 'british',
 'filipino',
 'southern_us',
 'moroccan',
 'irish',
 'japanese',
 'brazilian']

#PREPROCESSING

In [11]:
spaceDeleteUDF = F.udf(lambda s: s.replace(" ", ""), StringType())
df_temp = df_temp.withColumn("ingredients", spaceDeleteUDF("ingredients"))

In [12]:
df_temp.show()

+-----------+-----+--------------------+
|    cuisine|   id|         ingredients|
+-----------+-----+--------------------+
|      greek|10259|romainelettuce,bl...|
|southern_us|25693|plainflour,ground...|
|   filipino|20130|eggs,pepper,salt,...|
|     indian|22213|water,vegetableoi...|
|     indian|13162|blackpepper,shall...|
|   jamaican| 6602|plainflour,sugar,...|
|    spanish|42779|oliveoil,salt,med...|
|    italian| 3735|sugar,pistachionu...|
|    mexican|16903|oliveoil,purpleon...|
|    italian|12734|choppedtomatoes,f...|
|    italian| 5875|pimentos,sweetpep...|
|    chinese|45887|lowsodiumsoysauce...|
|    italian| 2698|Italianparsleylea...|
|    mexican|41995|groundcinnamon,fr...|
|    italian|31908|freshparmesanchee...|
|     indian|24717|tumeric,vegetable...|
|    british|34466|greekyogurt,lemon...|
|    italian| 1420|italianseasoning,...|
|       thai| 2941|sugar,hotchili,as...|
| vietnamese| 8152|soysauce,vegetabl...|
+-----------+-----+--------------------+
only showing top

In [13]:
#reg
regexTokenizer = RegexTokenizer(inputCol="ingredients", outputCol="w_ingredients", pattern="\\W")
# stop words
stopwords = ["salt", "sugar", 
             "water", "eggs"] 
             #"pepper", "natural", "boneless","fresh", "red", "orange", "yellow", "green", "blue", "purple"] 
stopwordsRemover = StopWordsRemover(inputCol="w_ingredients", outputCol="filtered").setStopWords(stopwords)
#tf-idf
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=100000)
idf = IDF(inputCol="rawFeatures", outputCol="features", minDocFreq=3) #minDocFreq: remove sparse terms
#labels
label_stringIdx = StringIndexer(inputCol = "cuisine", outputCol = "label")

In [33]:
pipeline = Pipeline(stages=[ 
                            regexTokenizer, 
                            stopwordsRemover, 
                            hashingTF, 
                            idf, 
                            label_stringIdx])

pipelineFit = pipeline.fit(df_temp)
next_df = pipelineFit.transform(df_temp)

In [15]:
next_df.show(truncate=False)

+-----------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+----------------------------------------------------------------------------------------------------------------------------------------

#ML FLOW

https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35

In [16]:
(trainingData, testData) = next_df.randomSplit([0.7, 0.3], seed = 8)

print("We have %d training examples and %d test examples." % (trainingData.count(), testData.count()))

We have 27961 training examples and 11813 test examples.


In [17]:
lr = LogisticRegression(maxIter=20, regParam=0.3, elasticNetParam=0)
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

In [18]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction")
evaluator.evaluate(predictions)

0.6781952559808299

In [23]:
sub_df = spark.read.json('test.json', multiLine=True)
sub_df_temp = sub_df.withColumn("ingredients", F.concat_ws(',' ,F.col("ingredients")))
sub_df_temp = sub_df_temp.withColumn("ingredients", spaceDeleteUDF("ingredients"))

In [24]:
sub_df_temp.show()

+-----+--------------------+
|   id|         ingredients|
+-----+--------------------+
|18009|bakingpowder,eggs...|
|28583|sugar,eggyolks,co...|
|41580|sausagelinks,fenn...|
|29752|meatcuts,filepowd...|
|35687|groundblackpepper...|
|38527|bakingpowder,all-...|
|19666|grapejuice,orange...|
|41217|groundginger,whit...|
|28753|dicedonions,tacos...|
|22659|eggs,cherries,dat...|
|21749|pasta,oliveoil,cr...|
|44967|water,butter,grou...|
|42969|currypowder,groun...|
|44883|pasta,marinarasau...|
|20827|salt,custardpowde...|
|23196|vegetableoilcooki...|
|35387|vanillaicecream,b...|
|33780|molasses,hotsauce...|
|19001|choppedgreenchili...|
|16526|coldwater,chicken...|
+-----+--------------------+
only showing top 20 rows



In [26]:
pipeline_sub = Pipeline(stages=[ 
                            regexTokenizer, 
                            stopwordsRemover, 
                            hashingTF, 
                            idf])#, 
                            #label_stringIdx])

pipelineFit_sub = pipeline_sub.fit(sub_df_temp)
next_df_sub = pipelineFit_sub.transform(sub_df_temp)

In [27]:
lrModel2 = lr.fit(next_df)
predictions = lrModel.transform(next_df_sub)

In [28]:
from pyspark.ml.feature import IndexToString

In [34]:
idx_to_string = IndexToString(
    inputCol="prediction", outputCol="cuisine", labels=pipelineFit.stages[4].labels)


idx_to_string.transform(predictions).show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+------------+
|   id|         ingredients|       w_ingredients|            filtered|         rawFeatures|            features|       rawPrediction|         probability|prediction|     cuisine|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+------------+
|18009|bakingpowder,eggs...|[bakingpowder, eg...|[bakingpowder, al...|(100000,[6372,276...|(100000,[6372,276...|[1.43560682080981...|[0.15425482491723...|       2.0| southern_us|
|28583|sugar,eggyolks,co...|[sugar, eggyolks,...|[eggyolks, cornst...|(100000,[4989,649...|(100000,[4989,649...|[0.76118380093307...|[0.05243422879243...|       2.0| southern_us|
|41580|sausagelinks,fenn...|[sausagelinks, fe...|[sausagelinks, fe...|(100000,[15862,31...|(100000,[15862

In [35]:
predictions = idx_to_string.transform(predictions)

In [36]:
spark2sub_1 = predictions.select('id', 'cuisine')

spark2sub_1.toPandas().to_csv('spark2sub_1_lr_tfidf.csv', index=False)

In [37]:
! kaggle competitions submit -c whats-cooking -f spark2sub_1_lr_tfidf.csv -m "1st try"

100% 139k/139k [00:00<00:00, 302kB/s]
Successfully submitted to What's Cooking?

0.69046 on kaggle

In [None]:
# split_udf = F.udf(lambda value: value[1].item(), DoubleType())

In [None]:
# predictions_gbt_2 = gbt_model_sub.transform(subData)

# predictions_gbt_2 = predictions_gbt_2.withColumn('c1', split_udf('probability'))

# spark_gbt_2 = predictions_gbt_2.select('id', 'c1').withColumnRenamed('c1', 'insomnia')

# spark_gbt_2.toPandas().to_csv('spark_gbt_sub2.csv', index=False)