## Importing Packages 

In [1]:
!pip install pyspark

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting pyspark
  Downloading pyspark-3.3.1.tar.gz (281.4 MB)
[K     |████████████████████████████████| 281.4 MB 47 kB/s 
[?25hCollecting py4j==0.10.9.5
  Downloading py4j-0.10.9.5-py2.py3-none-any.whl (199 kB)
[K     |████████████████████████████████| 199 kB 63.0 MB/s 
[?25hBuilding wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.3.1-py2.py3-none-any.whl size=281845512 sha256=72ee22ea9a0bd5a7ce9e397e20153ec200778b26988da5f90a6f65f6d20a0575
  Stored in directory: /root/.cache/pip/wheels/43/dc/11/ec201cd671da62fa9c5cc77078235e40722170ceba231d7598
Successfully built pyspark
Installing collected packages: py4j, pyspark
Successfully installed py4j-0.10.9.5 pyspark-3.3.1


In [22]:
#import libary
import warnings
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

import pyspark
from pyspark.sql import types
from pyspark.sql.functions import col, isnan, when, count, explode, array, lit

# Import some classifiers and multiclass evaluator
from pyspark.ml.classification import DecisionTreeClassifier, RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.ml.classification import NaiveBayes

#feature libary
from pyspark.ml.feature import Imputer, VectorAssembler, StringIndexer
from pyspark.ml.classification import FMClassifier
#import evaluation
from pyspark.ml.evaluation import RegressionEvaluator
from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
#TO convert type
from pyspark.sql.types import IntegerType 
from pyspark.sql.types import FloatType 


warnings.filterwarnings('ignore')

## Loading Dataset 

In [25]:
# Load the cleaned CSV file into a pandas DataFrame.
pandadf = pd.read_csv(
    filepath_or_buffer='/content/drive/MyDrive/cleaning_data.csv',
)

## Connect to Spark

In [None]:
from pyspark.sql import SparkSession # import SparkSession


In [24]:
# Create (or reuse) the Spark session, requesting 16 GB of memory for both
# the executors and the driver.
spark = (
    pyspark.sql.SparkSession.builder
    .config("spark.executor.memory", "16g")
    .config("spark.driver.memory", "16g")
    .getOrCreate()
)

In [None]:
# Switch on the Arrow option for PySpark.
# NOTE(review): presumably meant to speed up the pandas -> Spark conversion that follows — confirm.
spark.conf.set("spark.sql.execution.arrow.pyspark.enabled", "true")


In [None]:
# Convert the pandas DataFrame (`pandadf`) into a Spark DataFrame.
df = spark.createDataFrame(pandadf)

## Feature Engineering and Feature Selection

In [108]:


# Encode the `claimed` column as the strings '0' / '1' so that it can be cast
# to a numeric type afterwards.
# NOTE(review): the original read from `data3`, which is never defined in this
# notebook (NameError on a fresh kernel). `df`, built from the CSV above, is the
# only Spark DataFrame available here — confirm it is the intended input.
spark_df = (
    df.replace("Unclaimed", "0", subset=["claimed"])   # Unclaimed -> '0'
      .replace("Claimed", "1", subset=["claimed"])     # Claimed   -> '1'
)




### Convert column types

In [110]:
# Cast the columns to numeric types and keep the result in `combined_df`.
# The result is deliberately NOT stored in a variable named `spark`: reusing
# that name replaced the SparkSession with a DataFrame, so any later
# `spark.createDataFrame(...)` / `spark.conf` call, or re-running an earlier
# cell, would fail.
combined_df = (
    spark_df
    .withColumn("claimed", col("claimed").cast(FloatType()))  # target column -> float
    .withColumn("vegetarian_friendly", col("vegetarian_friendly").cast(IntegerType()))
    .withColumn("avg_rating", col("avg_rating").cast(IntegerType()))
    .withColumn("vegan_options", col("vegan_options").cast(IntegerType()))
    .withColumn(
        "reviews_count_in_default_language",
        col("reviews_count_in_default_language").cast(IntegerType()),
    )
)


In [113]:
# Label-encode the categorical columns: every column `c` listed in `cat_cols`
# gets a numeric companion column named `c + '_idx'` (so `price_range` becomes
# `price_range_idx`), after which the original string columns are dropped.
cat_cols = ['price_range']

# Work on a separate variable so `combined_df` itself is never modified.
# Re-running this cell then starts from the un-indexed frame instead of failing
# because the `_idx` output column already exists.
indexed_df = combined_df

# StringIndexer is Spark's counterpart of scikit-learn's LabelEncoder.
for c in cat_cols:
    indexer = StringIndexer(inputCol=c, outputCol=c + '_idx')
    # Fit the indexer on the current frame and append the encoded column.
    indexed_df = indexer.fit(indexed_df).transform(indexed_df)

# Drop the original (string) categorical columns; only the `_idx` versions remain.
final_df2 = indexed_df.drop(*cat_cols)

## Split Data

In [115]:
# Remove the columns that are not used as model inputs.
final_df = final_df2.drop(
    'avg_rating',
    'average',
    'atmosphere',
    'default_language',
    'working_shifts_per_week',
    'open_hours_per_week',
    'open_days_per_week',
    'cuisines',
    'popularity_generic',
    'popularity_detailed',
    'longitude',
    'latitude',
    'region',
    'city',
    'country',
    'restaurant_name',
)

# Every remaining column except `claimed` (the label) is a feature.
cols = final_df.columns
cols.remove('claimed')

# Pack all feature columns into a single vector column called `features`.
assembler = VectorAssembler(outputCol='features', inputCols=cols)
final_df = assembler.transform(final_df)


Let's split the data:

80% goes to the training set and 20% to the test set.

In [117]:
# Build a DataFrame holding only the assembled `features` vector and the
# target column, with `claimed` renamed to `label`.
df_data = final_df.select(col('features'), col('claimed').alias('label'))

# 80/20 train/test split. The seed is fixed so that the split, and therefore
# every model and metric computed from it, is reproducible across runs.
df_train, df_test = df_data.randomSplit([0.8, 0.2], seed=42)

## Train Models



### 1st  Model 

In [119]:
# Decision tree classifier
dt = DecisionTreeClassifier(
    featuresCol="features",
    labelCol="label",
    maxBins=800000,
)
model_dt = dt.fit(df_train)  # train on the training split

### 2nd  Model  

In [120]:
# Random forest classifier: 20 trees with a maximum depth of 10.
# The seed is fixed so that the random sampling used to grow the trees
# gives the same forest on every run.
rf = RandomForestClassifier(
    labelCol="label",
    featuresCol="features",
    numTrees=20,
    maxDepth=10,
    maxBins=800000,
    seed=42,
)
model_rf = rf.fit(df_train)  # train on the training split

### 3rd  Model  

In [121]:
# Logistic regression with elastic-net regularisation
lr = LogisticRegression(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
    regParam=0.3,
    elasticNetParam=0.8,
)
model_lr = lr.fit(df_train)  # train on the training split

### 4th  Model  

In [None]:
# Gradient-boosted trees classifier
gbt = GBTClassifier(
    featuresCol="features",
    labelCol="label",
    maxIter=10,
)
model_gbt = gbt.fit(df_train)  # train on the training split

### 5th Model

In [None]:
# Gaussian Naive Bayes classifier
nb = NaiveBayes(
    featuresCol='features',
    labelCol='label',
    modelType="gaussian",
    smoothing=1.0,
)
model_niv = nb.fit(df_train)  # train on the training split

### 6th Model

In [None]:
# Factorization machines classifier.
# The seed is fixed so that the random initialisation is the same on every run.
fm = FMClassifier(
    labelCol="label",
    featuresCol="features",
    stepSize=0.001,
    seed=42,
)
# Fit the estimator defined above. The original called `em.fit`, an undefined
# name that raised a NameError.
model_fm = fm.fit(df_train)

## Model Evaluation



### 1st Model  Evaluation

In [122]:
# Decision tree: apply the fitted model to the test split to obtain predictions.
pred_dt = model_dt.transform(df_test)


## report  and matrix


In [123]:
# Classification report for the decision tree
import sklearn
from pyspark.ml.classification import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

# Fetch label and prediction together in ONE Spark job. Two separate
# `collect()` calls evaluate the pipeline twice and rely on both jobs returning
# rows in the same order. Unpacking the rows also hands sklearn flat lists of
# numbers instead of lists of single-field Row objects.
rows_dt = pred_dt.select('label', 'prediction').collect()
y_true = [row['label'] for row in rows_dt]
y_pred = [row['prediction'] for row in rows_dt]

print(classification_report(y_true, y_pred))

              precision    recall  f1-score   support

         0.0       0.63      0.63      0.63     65367
         1.0       0.71      0.71      0.71     84482

    accuracy                           0.68    149849
   macro avg       0.67      0.67      0.67    149849
weighted avg       0.68      0.68      0.68    149849



Accuracy for the Decision Tree is 0.68

In [None]:
# Confusion matrix for the decision tree (rows: true labels, columns: predictions)
from sklearn import metrics
matrix = metrics.confusion_matrix(y_true=y_true, y_pred=y_pred)
print(matrix)

[[106 167]
 [ 71 343]]


### 2nd Model  Evaluation

In [124]:
# Random forest: apply the fitted model to the test split to obtain predictions.
pred_rf = model_rf.transform(df_test)


## report  and matrix


In [125]:

# Classification report for the random forest
from sklearn.metrics import classification_report, confusion_matrix

# Fetch label and prediction together in ONE Spark job. Two separate
# `collect()` calls evaluate the pipeline twice and rely on both jobs returning
# rows in the same order. Unpacking the rows also hands sklearn flat lists of
# numbers instead of lists of single-field Row objects.
rows_rf = pred_rf.select('label', 'prediction').collect()
y_true3 = [row['label'] for row in rows_rf]
y_pred3 = [row['prediction'] for row in rows_rf]

print(classification_report(y_true3, y_pred3))

              precision    recall  f1-score   support

         0.0       0.63      0.68      0.65     65367
         1.0       0.74      0.70      0.72     84482

    accuracy                           0.69    149849
   macro avg       0.68      0.69      0.69    149849
weighted avg       0.69      0.69      0.69    149849



Accuracy for Random Forest is 0.69

In [None]:
# Confusion matrix for the random forest (rows: true labels, columns: predictions)
from sklearn import metrics
matrix = metrics.confusion_matrix(y_true=y_true3, y_pred=y_pred3)
print(matrix)

[[ 99 174]
 [ 72 342]]


### 3rd Model  Evaluation

In [126]:
# Logistic regression: apply the fitted model to the test split to obtain predictions.
pred_lr = model_lr.transform(df_test)


## report  and matrix


In [127]:
# Classification report for logistic regression
from sklearn.metrics import classification_report, confusion_matrix

# Fetch label and prediction together in ONE Spark job. Two separate
# `collect()` calls evaluate the pipeline twice and rely on both jobs returning
# rows in the same order. Unpacking the rows also hands sklearn flat lists of
# numbers instead of lists of single-field Row objects.
rows_lr = pred_lr.select('label', 'prediction').collect()
y_true4 = [row['label'] for row in rows_lr]
y_pred4 = [row['prediction'] for row in rows_lr]

print(classification_report(y_true4, y_pred4))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00     65367
         1.0       0.56      1.00      0.72     84482

    accuracy                           0.56    149849
   macro avg       0.28      0.50      0.36    149849
weighted avg       0.32      0.56      0.41    149849



accuracy for  Logistic Regression is 0.56

In [None]:
# Confusion matrix for logistic regression (rows: true labels, columns: predictions)
from sklearn import metrics
matrix = metrics.confusion_matrix(y_true=y_true4, y_pred=y_pred4)
print(matrix)

[[  0 273]
 [  0 414]]


### 4th Model  Evaluation


In [None]:
# Gradient-boosted trees: apply the fitted model to the test split to obtain predictions.

pred_gbt = model_gbt.transform(df_test)

## report  and matrix


In [None]:
# Classification report for gradient-boosted trees
from sklearn.metrics import classification_report, confusion_matrix

# Fetch label and prediction together in ONE Spark job. Two separate
# `collect()` calls evaluate the pipeline twice and rely on both jobs returning
# rows in the same order. Unpacking the rows also hands sklearn flat lists of
# numbers instead of lists of single-field Row objects.
rows_gbt = pred_gbt.select('label', 'prediction').collect()
y_true5 = [row['label'] for row in rows_gbt]
y_pred5 = [row['prediction'] for row in rows_gbt]

print(classification_report(y_true5, y_pred5))

              precision    recall  f1-score   support

           0       0.62      0.42      0.50       273
           1       0.69      0.83      0.75       414

    accuracy                           0.67       687
   macro avg       0.65      0.63      0.63       687
weighted avg       0.66      0.67      0.65       687




accuracy for  Gradient Boost is 0.67

In [None]:
# Confusion matrix for gradient-boosted trees (rows: true labels, columns: predictions)
from sklearn import metrics
matrix = metrics.confusion_matrix(y_true=y_true5, y_pred=y_pred5)
print(matrix)

[[116 157]
 [ 72 342]]


## 5th Model Evaluation

In [None]:
# Naive Bayes: apply the fitted model to the test split to obtain predictions.

pred_nb = model_niv.transform(df_test)

## report  and matrix


In [None]:
# Classification report for Naive Bayes
from sklearn.metrics import classification_report, confusion_matrix

# Fetch label and prediction together in ONE Spark job. Two separate
# `collect()` calls evaluate the pipeline twice and rely on both jobs returning
# rows in the same order. Unpacking the rows also hands sklearn flat lists of
# numbers instead of lists of single-field Row objects.
rows_nb = pred_nb.select('label', 'prediction').collect()
y_true6 = [row['label'] for row in rows_nb]
y_pred6 = [row['prediction'] for row in rows_nb]

print(classification_report(y_true6, y_pred6))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         0
         1.0       0.34      0.52      0.41       275
         2.0       0.00      0.00      0.00       420

    accuracy                           0.21       695
   macro avg       0.11      0.17      0.14       695
weighted avg       0.13      0.21      0.16       695




accuracy for NaiveBayes is 0.21

In [None]:
# Confusion matrix for Naive Bayes (rows: true labels, columns: predictions)
from sklearn import metrics
matrix = metrics.confusion_matrix(y_true=y_true6, y_pred=y_pred6)
print(matrix)

[[  0   0   0]
 [131 144   0]
 [140 280   0]]


## 6th Model Evaluation

In [None]:
# Factorization machines classifier: apply the fitted model to the test split to obtain predictions.
pred_fm = model_fm.transform(df_test)

## report  and matrix


In [None]:
# Classification report for the factorization machines classifier
from sklearn.metrics import classification_report, confusion_matrix

# Fetch label and prediction together in ONE Spark job. Two separate
# `collect()` calls evaluate the pipeline twice and rely on both jobs returning
# rows in the same order. Unpacking the rows also hands sklearn flat lists of
# numbers instead of lists of single-field Row objects.
rows_fm = pred_fm.select('label', 'prediction').collect()
y_true7 = [row['label'] for row in rows_fm]
y_pred7 = [row['prediction'] for row in rows_fm]

print(classification_report(y_true7, y_pred7))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       273
           1       0.60      1.00      0.75       414

    accuracy                           0.60       687
   macro avg       0.30      0.50      0.38       687
weighted avg       0.36      0.60      0.45       687



accuracy for Factorization is 0.60


accuracy for Factorization is 0.60

In [None]:
# Confusion matrix for the factorization machines classifier (rows: true labels, columns: predictions)
from sklearn import metrics
matrix = metrics.confusion_matrix(y_true=y_true7, y_pred=y_pred7)
print(matrix)

[[  0 273]
 [  1 413]]


### The best model is Random Forest with an accuracy of 0.69, and the worst model is Naive Bayes with 0.21
