<a href="https://colab.research.google.com/github/Verschworer/HSE_ML_final/blob/main/spark_lr_tfidf.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
! pip install -q kaggle
from google.colab import files
files.upload()
! mkdir ~/.kaggle
! cp kaggle.json ~/.kaggle/
! chmod 600 ~/.kaggle/kaggle.json
! kaggle competitions download -c whats-cooking  

Saving kaggle.json to kaggle.json
Downloading sample_submission.csv.zip to /content
  0% 0.00/25.8k [00:00<?, ?B/s]
100% 25.8k/25.8k [00:00<00:00, 22.4MB/s]
Downloading train.json.zip to /content
  0% 0.00/1.76M [00:00<?, ?B/s]
100% 1.76M/1.76M [00:00<00:00, 58.4MB/s]
Downloading test.json.zip to /content
  0% 0.00/426k [00:00<?, ?B/s]
100% 426k/426k [00:00<00:00, 128MB/s]


### Data

In [2]:
! pip install -q pyspark

[K     |████████████████████████████████| 281.3 MB 37 kB/s 
[K     |████████████████████████████████| 198 kB 65.3 MB/s 
[?25h  Building wheel for pyspark (setup.py) ... [?25l[?25hdone


In [23]:
# import libraries

import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import * #JVM типы данных


from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession

from pyspark.ml.feature import VectorAssembler, StringIndexer, RegexTokenizer, StopWordsRemover, HashingTF, IDF, IndexToString
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml import Pipeline

from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import LogisticRegression, GBTClassifier, NaiveBayes, RandomForestClassifier, OneVsRest

import numpy as np
import pandas as pd

from itertools import chain

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [4]:
# spark up

spark = SparkSession.builder.master(
    "local[5]"
    ).appName(
    "team8final"
    ).config(
        "spark.sql.execution.arrow.pyspark.enabled", 
        "true").getOrCreate()

In [5]:
spark # spark is alive

In [6]:
! unzip '/content/train.json.zip'
! unzip '/content/test.json.zip'
! unzip '/content/sample_submission.csv.zip'

Archive:  /content/train.json.zip
  inflating: train.json              
   creating: __MACOSX/
  inflating: __MACOSX/._train.json   
Archive:  /content/test.json.zip
  inflating: test.json               
  inflating: __MACOSX/._test.json    
Archive:  /content/sample_submission.csv.zip
  inflating: sample_submission.csv   


In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
df = spark.read.json('train.json', multiLine=True)

In [8]:
print((df.count(), len(df.columns)))

(39774, 3)


In [9]:
df.show()

+-----------+-----+--------------------+
|    cuisine|   id|         ingredients|
+-----------+-----+--------------------+
|      greek|10259|[romaine lettuce,...|
|southern_us|25693|[plain flour, gro...|
|   filipino|20130|[eggs, pepper, sa...|
|     indian|22213|[water, vegetable...|
|     indian|13162|[black pepper, sh...|
|   jamaican| 6602|[plain flour, sug...|
|    spanish|42779|[olive oil, salt,...|
|    italian| 3735|[sugar, pistachio...|
|    mexican|16903|[olive oil, purpl...|
|    italian|12734|[chopped tomatoes...|
|    italian| 5875|[pimentos, sweet ...|
|    chinese|45887|[low sodium soy s...|
|    italian| 2698|[Italian parsley ...|
|    mexican|41995|[ground cinnamon,...|
|    italian|31908|[fresh parmesan c...|
|     indian|24717|[tumeric, vegetab...|
|    british|34466|[greek yogurt, le...|
|    italian| 1420|[italian seasonin...|
|       thai| 2941|[sugar, hot chili...|
| vietnamese| 8152|[soy sauce, veget...|
+-----------+-----+--------------------+
only showing top

In [10]:
df.select([F.count(F.when(F.isnull(c), c)).alias(c) for c in df.columns]).show() # check the NULL

+-------+---+-----------+
|cuisine| id|ingredients|
+-------+---+-----------+
|      0|  0|          0|
+-------+---+-----------+



In [11]:
# check the NaNs
df_temp = df.withColumn("ingredients", F.concat_ws(',' ,F.col("ingredients")))
df_temp.select([F.count(F.when(F.isnan(c), c)).alias(c) for c in df_temp.columns]).show()


+-------+---+-----------+
|cuisine| id|ingredients|
+-------+---+-----------+
|      0|  0|          0|
+-------+---+-----------+



In [12]:
len(df.select('cuisine').distinct().rdd.map(lambda r: r[0]).collect())

20

In [13]:
df.groupBy('cuisine').count().orderBy('count', ascending=False).show()

+------------+-----+
|     cuisine|count|
+------------+-----+
|     italian| 7838|
|     mexican| 6438|
| southern_us| 4320|
|      indian| 3003|
|     chinese| 2673|
|      french| 2646|
|cajun_creole| 1546|
|        thai| 1539|
|    japanese| 1423|
|       greek| 1175|
|     spanish|  989|
|      korean|  830|
|  vietnamese|  825|
|    moroccan|  821|
|     british|  804|
|    filipino|  755|
|       irish|  667|
|    jamaican|  526|
|     russian|  489|
|   brazilian|  467|
+------------+-----+



In [None]:
# df.select('cuisine').distinct().rdd.map(lambda r: r[0]).collect()

#PREPROCESSING

In [14]:
df_temp = df_temp.withColumn("ingredients", F.lower(F.col("ingredients")))

In [15]:
spaceDeleteUDF = F.udf(lambda s: s.replace(" ", ""), StringType())
df_temp = df_temp.withColumn("ingredients", spaceDeleteUDF("ingredients"))

In [16]:
df_temp.show()

+-----------+-----+--------------------+
|    cuisine|   id|         ingredients|
+-----------+-----+--------------------+
|      greek|10259|romainelettuce,bl...|
|southern_us|25693|plainflour,ground...|
|   filipino|20130|eggs,pepper,salt,...|
|     indian|22213|water,vegetableoi...|
|     indian|13162|blackpepper,shall...|
|   jamaican| 6602|plainflour,sugar,...|
|    spanish|42779|oliveoil,salt,med...|
|    italian| 3735|sugar,pistachionu...|
|    mexican|16903|oliveoil,purpleon...|
|    italian|12734|choppedtomatoes,f...|
|    italian| 5875|pimentos,sweetpep...|
|    chinese|45887|lowsodiumsoysauce...|
|    italian| 2698|italianparsleylea...|
|    mexican|41995|groundcinnamon,fr...|
|    italian|31908|freshparmesanchee...|
|     indian|24717|tumeric,vegetable...|
|    british|34466|greekyogurt,lemon...|
|    italian| 1420|italianseasoning,...|
|       thai| 2941|sugar,hotchili,as...|
| vietnamese| 8152|soysauce,vegetabl...|
+-----------+-----+--------------------+
only showing top

In [33]:
#reg
regexTokenizer = RegexTokenizer(inputCol="ingredients", outputCol="w_ingredients", pattern="\\W")
# stop words
stopwords = ["salt", "sugar", 
             "water"] 
             #"pepper", "natural", "boneless","fresh", "red", "orange", "yellow", "green", "blue", "purple"] 
stopwordsRemover = StopWordsRemover(inputCol="w_ingredients", outputCol="filtered").setStopWords(stopwords)
#tf-idf
hashingTF = HashingTF(inputCol="filtered", outputCol="rawFeatures", numFeatures=6714)
idf = IDF(inputCol="rawFeatures", outputCol="features") #minDocFreq: remove sparse terms
#labels
label_stringIdx = StringIndexer(inputCol = "cuisine", outputCol = "label")

In [34]:
pipeline = Pipeline(stages=[ 
                            regexTokenizer, 
                            stopwordsRemover, 
                            hashingTF, 
                            idf, 
                            label_stringIdx])

pipelineFit = pipeline.fit(df_temp)
next_df = pipelineFit.transform(df_temp)

In [19]:
next_df.show(truncate=False)

+-----------+-----+----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+-------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------

#ML FLOW

https://towardsdatascience.com/multi-class-text-classification-with-pyspark-7d78d022ed35

In [35]:
label_collect = next_df.select("label").groupBy("label").count().collect()
unique_label = [x["label"] for x in label_collect]
total_label = sum([x["count"] for x in label_collect])
unique_label_count = len(label_collect)
bin_count = [x["count"] for x in label_collect]

class_weights_label = {i: ii for i, ii in zip(unique_label, total_label / (unique_label_count * np.array(bin_count)))}
print(class_weights_label)


{8.0: 1.3975404075895994, 0.0: 0.25372544016330695, 7.0: 1.2922027290448344, 18.0: 4.066871165644172, 1.0: 0.30890027958993477, 4.0: 0.7439955106621773, 11.0: 2.3960240963855424, 14.0: 2.473507462686567, 3.0: 0.6622377622377622, 19.0: 4.258458244111349, 2.0: 0.46034722222222224, 17.0: 3.7807984790874523, 10.0: 2.0108190091001013, 13.0: 2.4222898903775882, 6.0: 1.2863518758085382, 15.0: 2.634039735099338, 5.0: 0.7515873015873016, 9.0: 1.6925106382978723, 16.0: 2.981559220389805, 12.0: 2.4105454545454545}


In [40]:
mapping_expr = F.create_map([F.lit(x) for x in chain(*class_weights_label.items())])

next_df = next_df.withColumn("weight", mapping_expr.getItem(F.col("label")))



In [41]:
(trainingData, testData) = next_df.randomSplit([0.7, 0.3], seed = 8)

print("We have %d training examples and %d test examples." % (trainingData.count(), testData.count()))

We have 27961 training examples and 11813 test examples.


In [42]:
lr = LogisticRegression(maxIter=40, 
                        regParam=0.2, 
                        elasticNetParam=0, 
                        tol=1E-6, 
                        fitIntercept=True, 
                        weightCol="weight")
lrModel = lr.fit(trainingData)
predictions = lrModel.transform(testData)

In [43]:
evaluator = MulticlassClassificationEvaluator(predictionCol="prediction", metricName="accuracy") # metricName="accuracy"
evaluator.evaluate(predictions)

0.7128587149750275

In [46]:
lr_for_OvR = LogisticRegression(maxIter=40, 
                        regParam=0.2, 
                        elasticNetParam=0, 
                        tol=1E-6, 
                        fitIntercept=True)

ovr = OneVsRest(classifier=lr_for_OvR, weightCol="weight")

In [47]:
ovrModel = ovr.fit(trainingData)
ovr_predictions = ovrModel.transform(testData)
evaluator.evaluate(ovr_predictions)

0.7115042749513248

In [48]:
sub_df = spark.read.json('test.json', multiLine=True)
sub_df_temp = sub_df.withColumn("ingredients", F.concat_ws(',' ,F.col("ingredients")))
sub_df_temp = sub_df_temp.withColumn("ingredients", F.lower(F.col("ingredients")))
sub_df_temp = sub_df_temp.withColumn("ingredients", spaceDeleteUDF("ingredients"))

In [49]:
sub_df_temp.show()

+-----+--------------------+
|   id|         ingredients|
+-----+--------------------+
|18009|bakingpowder,eggs...|
|28583|sugar,eggyolks,co...|
|41580|sausagelinks,fenn...|
|29752|meatcuts,filepowd...|
|35687|groundblackpepper...|
|38527|bakingpowder,all-...|
|19666|grapejuice,orange...|
|41217|groundginger,whit...|
|28753|dicedonions,tacos...|
|22659|eggs,cherries,dat...|
|21749|pasta,oliveoil,cr...|
|44967|water,butter,grou...|
|42969|currypowder,groun...|
|44883|pasta,marinarasau...|
|20827|salt,custardpowde...|
|23196|vegetableoilcooki...|
|35387|vanillaicecream,b...|
|33780|molasses,hotsauce...|
|19001|choppedgreenchili...|
|16526|coldwater,chicken...|
+-----+--------------------+
only showing top 20 rows



In [50]:
pipeline_sub = Pipeline(stages=[ 
                            regexTokenizer, 
                            stopwordsRemover, 
                            hashingTF, 
                            idf])#, 
                            #label_stringIdx])

pipelineFit_sub = pipeline_sub.fit(sub_df_temp)
next_df_sub = pipelineFit_sub.transform(sub_df_temp)

In [74]:
lrModel2 = lr.fit(next_df)
predictions = lrModel.transform(next_df_sub)

In [51]:
ovrModel_sub = ovr.fit(next_df)
predictions_sub = ovrModel_sub.transform(next_df_sub)

In [53]:
idx_to_string = IndexToString(
    inputCol="prediction", outputCol="cuisine", labels=pipelineFit.stages[4].labels)


idx_to_string.transform(predictions_sub).show()

+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+------------+
|   id|         ingredients|       w_ingredients|            filtered|         rawFeatures|            features|       rawPrediction|prediction|     cuisine|
+-----+--------------------+--------------------+--------------------+--------------------+--------------------+--------------------+----------+------------+
|18009|bakingpowder,eggs...|[bakingpowder, eg...|[bakingpowder, eg...|(6714,[2035,2094,...|(6714,[2035,2094,...|[-3.3923268620506...|      14.0|     british|
|28583|sugar,eggyolks,co...|[sugar, eggyolks,...|[eggyolks, cornst...|(6714,[293,395,20...|(6714,[293,395,20...|[-3.7661119491585...|       2.0| southern_us|
|41580|sausagelinks,fenn...|[sausagelinks, fe...|[sausagelinks, fe...|(6714,[1654,2377,...|(6714,[1654,2377,...|[-2.8994937898562...|       9.0|       greek|
|29752|meatcuts,filepowd...|[meatcuts, filepo...|[me

In [54]:
predictions_sub = idx_to_string.transform(predictions_sub)

In [79]:
spark2sub_lr = predictions.select('id', 'cuisine')

spark2sub_lr.toPandas().to_csv('spark2sub_2_lr_tfidf.csv', index=False)

In [56]:
spark2sub_ovr = predictions_sub.select('id', 'cuisine')

spark2sub_ovr.toPandas().to_csv('spark2sub_3_ovr_tfidf.csv', index=False)

In [57]:
! kaggle competitions submit -c whats-cooking -f spark2sub_3_ovr_tfidf.csv -m "3nd try"

100% 138k/138k [00:01<00:00, 116kB/s]
Successfully submitted to What's Cooking?

1 - 0.69046 on kaggle

2 - 0.70132 on kaggle

3 - 0.71771 on kaggle

In [None]:
# split_udf = F.udf(lambda value: value[1].item(), DoubleType())

In [None]:
# predictions_gbt_2 = gbt_model_sub.transform(subData)

# predictions_gbt_2 = predictions_gbt_2.withColumn('c1', split_udf('probability'))

# spark_gbt_2 = predictions_gbt_2.select('id', 'c1').withColumnRenamed('c1', 'insomnia')

# spark_gbt_2.toPandas().to_csv('spark_gbt_sub2.csv', index=False)