
***
# Classification - ML Section

***


## Importing Packages 

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 36.7 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=84e5eff64b79760488cca6143bfe682ab812488887349c02fa468f5110ce5dae
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [2]:
#import libary
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import pyspark
from pyspark.sql import types
from pyspark.sql.functions import col, isnan, when, count, explode, array, lit


from pyspark.ml.feature import Imputer, VectorAssembler, StringIndexer
from pyspark.ml.regression import RandomForestRegressor, DecisionTreeRegressor, GBTRegressor, LinearRegression
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder

warnings.filterwarnings('ignore')

In [9]:
pandadf = pd.read_csv('cleaning_data 2.csv')

## connect to  spark 

In [4]:
from pyspark.sql import SparkSession


In [6]:
spark = pyspark.sql.SparkSession.builder.config("spark.executor.memory", "16g").config("spark.driver.memory", "16g").getOrCreate()

In [7]:
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")


In [10]:
df = spark.createDataFrame(pandadf)


## Loading Dataset 

In [11]:
#SHOW DATA
df

DataFrame[restaurant_name: string, country: string, region: string, city: string, latitude: double, longitude: double, claimed: string, popularity_detailed: string, popularity_generic: string, cuisines: string, vegetarian_friendly: bigint, vegan_options: bigint, gluten_free: bigint, open_days_per_week: bigint, open_hours_per_week: double, working_shifts_per_week: bigint, avg_rating: double, total_reviews_count: bigint, default_language: string, reviews_count_in_default_language: bigint, excellent: double, very_good: double, average: double, poor: double, terrible: double, food: double, service: double, value: double, atmosphere: double, price_range: string]

In [13]:
print('Shape of the dataset: ',(df.count(), len(df.columns)))

Shape of the dataset:  (749544, 30)


## drop non values

In [14]:
#drop all null valures
data=df.dropna()


***
# ML Classification Section 

***

## Feature Engineering and Feature Selection

In [17]:
data.printSchema()#show 

root
 |-- restaurant_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- claimed: string (nullable = true)
 |-- popularity_detailed: string (nullable = true)
 |-- popularity_generic: string (nullable = true)
 |-- cuisines: string (nullable = true)
 |-- vegetarian_friendly: long (nullable = true)
 |-- vegan_options: long (nullable = true)
 |-- gluten_free: long (nullable = true)
 |-- open_days_per_week: long (nullable = true)
 |-- open_hours_per_week: double (nullable = true)
 |-- working_shifts_per_week: long (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- total_reviews_count: long (nullable = true)
 |-- default_language: string (nullable = true)
 |-- reviews_count_in_default_language: long (nullable = true)
 |-- excellent: double (nullable = true)
 |-- very_good: double (nullable = true)
 |-- 

In [26]:
from pyspark.sql.types import FloatType 


spark = data.replace( "mid" , '0', subset=["price_range"]) #replace mid range to 0




In [27]:
spark = spark.replace('cheap' , '1', subset=["price_range"]) #replace  cheap range 1


In [20]:
from pyspark.sql.types import IntegerType 


In [21]:
spark = spark.withColumn("vegetarian_friendly", spark["vegetarian_friendly"].cast(IntegerType())) #convert vegetarian_friendly type to  int type


In [22]:
spark = spark.withColumn("avg_rating", spark["avg_rating"].cast(FloatType())) #convert avg_rating type to  int type


In [23]:
spark = spark.withColumn("vegan_options", spark["vegan_options"].cast(IntegerType())) #convert vegan_options type to  int type


In [24]:
spark = spark.withColumn("reviews_count_in_default_language", spark["reviews_count_in_default_language"].cast(IntegerType())) #convert reviews_count_in_default_language type to  int type


In [41]:

spark = spark.withColumn("price_range", spark["price_range"].cast(IntegerType())) #convert price_range type to  int type


In [28]:
spark.groupBy("price_range").count().show()


+-----------+------+
|price_range| count|
+-----------+------+
|          0|521534|
|          1|227990|
+-----------+------+



In [29]:
major_df = spark.filter(col("price_range") == 0)
minor_df = spark.filter(col("price_range") == 1)



#we will calculate the ratio to determine the difference between the number of price_range 0 and price_range 1 transactions.
ratio = int(major_df.count()/minor_df.count())
print("ratio: {}".format(ratio))

ratio: 2


In [42]:
combined_df = spark 

In [43]:
combined_df.printSchema()

root
 |-- restaurant_name: string (nullable = true)
 |-- country: string (nullable = true)
 |-- region: string (nullable = true)
 |-- city: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- claimed: string (nullable = true)
 |-- popularity_detailed: string (nullable = true)
 |-- popularity_generic: string (nullable = true)
 |-- cuisines: string (nullable = true)
 |-- vegetarian_friendly: long (nullable = true)
 |-- vegan_options: long (nullable = true)
 |-- gluten_free: long (nullable = true)
 |-- open_days_per_week: long (nullable = true)
 |-- open_hours_per_week: double (nullable = true)
 |-- working_shifts_per_week: long (nullable = true)
 |-- avg_rating: double (nullable = true)
 |-- total_reviews_count: long (nullable = true)
 |-- default_language: string (nullable = true)
 |-- reviews_count_in_default_language: long (nullable = true)
 |-- excellent: double (nullable = true)
 |-- very_good: double (nullable = true)
 |-- 

In [44]:
#This step will label encode all the categorical columns and store them in different columns with the same name + '_idx', 
#so category will become category_idx 
cat_cols = ['cuisines','claimed'] #cuisines and price_range

#StringIndexer() is equivalent to LabelEncoder()
for c in cat_cols: 
    indexer = StringIndexer(inputCol=c, outputCol=c+'_idx') #we pass the columns from the list as input one by one
    combined_df = indexer.fit(combined_df).transform(combined_df) #here we fit and transform the data altogether
    
final_df2 = combined_df.drop(*cat_cols) #we will drop all the categorical columns we defined earlier

## Split Data

In [45]:
#drop unneed columns

final_df=final_df2.drop('average','reviews_count_in_default_language','atmosphere','default_language','working_shifts_per_week','open_hours_per_week','open_days_per_week','popularity_generic','popularity_detailed','longitude','latitude','region','city','country','restaurant_name')

cols = final_df.columns #extract the column names from the dataframe
cols.remove('price_range') #remove price_range -> we need this to be our label

#vector assembler will take all the columns and convert them into one column called features
assembler = VectorAssembler(inputCols=cols, outputCol='features')

#the .transform will apply the changes here
final_df = assembler.transform(final_df)

In [46]:
#we can now see that features column will appear within the dataframe
final_df

DataFrame[vegetarian_friendly: bigint, vegan_options: bigint, gluten_free: bigint, avg_rating: double, total_reviews_count: bigint, excellent: double, very_good: double, poor: double, terrible: double, food: double, service: double, value: double, price_range: int, cuisines_idx: double, claimed_idx: double, features: vector]


Lets Split

80% in training set and 20% is testing set.

In [47]:
# We will now create a new dataframe only consisting of the features column and the label column (actually stars column but renamed)
df_data = final_df.select(col('features'), col('price_range').alias('label'))

#simple data splitting
df_train, df_test = df_data.randomSplit([0.8, 0.2])

## Train Models

####  Note: minimum 3 models

In [48]:
# Import some classifiers and multiclass evaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

### 1st Classification Model 

In [49]:
# Decision Tree
dt = DecisionTreeClassifier(labelCol="label", featuresCol="features" ,maxBins=800000)
model_dt = dt.fit(df_train)

### 2nd Classification Model  

In [50]:
# Random Forest
rf = RandomForestClassifier(labelCol="label", featuresCol="features", numTrees=8,maxDepth=5 ,maxBins=800000)
model_rf = rf.fit(df_train)

### 3rd Classification Model  

In [51]:
# Logistic Regression
lr = LogisticRegression(maxIter=10, regParam=0.3, elasticNetParam=0.8, labelCol="label", featuresCol="features")
model_lr = lr.fit(df_train)

### 4 Classification Model  

In [54]:
#NaiveBayes
from pyspark.ml.classification import NaiveBayes
nb = NaiveBayes(smoothing=1.0, 
                modelType="gaussian", 
                featuresCol='features', labelCol='label')
model_niv = nb.fit(df_train)

## Model Evaluation

#### Note: This should include confusion matrix and classification report

### 1st Model  Evaluation

In [55]:
#  pyspark has .transform
pred_dt = model_dt.transform(df_test)


## report  and matrix


In [56]:
import sklearn 
from pyspark.ml.classification import RandomForestClassifier

y_true = pred_dt.select(['label']).collect()
y_pred = pred_dt.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

           0       0.71      0.99      0.82    104211
           1       0.67      0.07      0.12     45543

    accuracy                           0.71    149754
   macro avg       0.69      0.53      0.47    149754
weighted avg       0.70      0.71      0.61    149754



In [57]:
#matrix
from sklearn import metrics
matrix=metrics.confusion_matrix(y_true,y_pred)
print(matrix)

[[102701   1510]
 [ 42494   3049]]


### 2nd Model  Evaluation

In [58]:
#  pyspark has .transform
pred_rf = model_rf.transform(df_test)


## report  and matrix


In [59]:

y_true3 = pred_rf.select(['label']).collect()
y_pred3 = pred_rf.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true3, y_pred3))

              precision    recall  f1-score   support

           0       0.70      1.00      0.82    104211
           1       0.56      0.00      0.00     45543

    accuracy                           0.70    149754
   macro avg       0.63      0.50      0.41    149754
weighted avg       0.65      0.70      0.57    149754



In [60]:
from sklearn import metrics
matrix=metrics.confusion_matrix(y_true3,y_pred3)
print(matrix)

[[104171     40]
 [ 45492     51]]


### 3rd Model  Evaluation

In [61]:
#  pyspark has .transform
pred_lr = model_lr.transform(df_test)


## report  and matrix


In [62]:
y_true4 = pred_lr.select(['label']).collect()
y_pred4 = pred_lr.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true4, y_pred4))

              precision    recall  f1-score   support

           0       0.70      1.00      0.82    104211
           1       0.00      0.00      0.00     45543

    accuracy                           0.70    149754
   macro avg       0.35      0.50      0.41    149754
weighted avg       0.48      0.70      0.57    149754



In [63]:
from sklearn import metrics
matrix=metrics.confusion_matrix(y_true4,y_pred4)
print(matrix)

[[104211      0]
 [ 45543      0]]


### 4th Model  Evaluation


In [64]:
pred_nb = model_niv.transform(df_test)

## report  and matrix


In [65]:
y_true6 = pred_nb.select(['label']).collect()
y_pred6 = pred_nb.select(['prediction']).collect()

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(y_true6, y_pred6))

              precision    recall  f1-score   support

           0       0.86      0.20      0.32    104211
           1       0.33      0.92      0.49     45543

    accuracy                           0.42    149754
   macro avg       0.60      0.56      0.41    149754
weighted avg       0.70      0.42      0.37    149754



In [66]:
from sklearn import metrics
matrix=metrics.confusion_matrix(y_true6,y_pred6)
print(matrix)

[[20687 83524]
 [ 3491 42052]]


## Show ML Evaluation as Dataframe


In [70]:
# Accuracy Metric
evaluator_A = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="accuracy")

# F1 Metric
evaluator_F = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="f1")

# Weighted Precision
evaluator_P = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedPrecision")

# Weighted Recall
evaluator_R = MulticlassClassificationEvaluator(labelCol="label", predictionCol="prediction", metricName="weightedRecall")

# Our models
models = [pred_dt, pred_rf, pred_lr,pred_nb]

# Empty lists that will store the scores for each metric for each model.
accuracy = []
F1 = []
precision = []
recall = []

# Simple loop to populate the empty lists with scores of models for each metric.
for model in models:
    accuracy.append(evaluator_A.evaluate(model))
    F1.append(evaluator_F.evaluate(model))
    precision.append(evaluator_P.evaluate(model))
    recall.append(evaluator_R.evaluate(model))

In [71]:
# We will convert all lists created above into a dataframe for easy viewing.
df_ev = pd.DataFrame(list(zip(accuracy, F1, precision, recall)), 
                     columns = ['Accuracy', 'F1-Score', 'Weighted Precision', 'Weighted Recall'],
                     index = ['Decision Tree', 'Random Forest', 'Logistic Regression','nive'])

In [72]:
df_ev

Unnamed: 0,Accuracy,F1-Score,Weighted Precision,Weighted Recall
Decision Tree,0.706158,0.610118,0.695609,0.706158
Random Forest,0.695955,0.571756,0.654799,0.695955
Logistic Regression,0.695881,0.57109,0.484251,0.695881
nive,0.418947,0.373724,0.697246,0.418947


the best model is Decision Tree with 0.70 Accuracy , and the worest model is nive	 with 0.41 


### select model

Decision Tree is the best model with accurcy 0.70 and 


## Model Optimization - Hyperparameter Tuning 

In [73]:
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder, CrossValidatorModel

#initialize our grid -> we are using variation with only one parameter called maxIter (maximum iteration)
grid = ParamGridBuilder().addGrid(dt.maxDepth, [2]).build()

#CrossValidator will by default have 3 folds. We can explicitly specify that using numFolds = 3
cv = CrossValidator(estimator=dt, estimatorParamMaps=grid, evaluator=evaluator_A, parallelism=2)

#lets fit again on our training set
cvModel = cv.fit(df_train)

In [74]:
#average metrics on 4 different models 
cvModel.avgMetrics 


[0.6958035292181495]

In [75]:
#lets try to get the accuracy of our model on the testing set
evaluator_A.evaluate(cvModel.transform(df_test))

0.6958812452421972

In [None]:
# Add your steps here

# ML Pipeline for Best Model

In [76]:
from pyspark.ml import Pipeline


In [77]:
pipeline = Pipeline(stages=[rf])


In [78]:

# Fit the pipeline to training documents.
model = pipeline.fit(df_train)

In [79]:
#lets try to get the accuracy of our model on the testing set
evaluator_A.evaluate(model.transform(df_test))

0.6959546990397585