<a href="https://colab.research.google.com/github/RifatMuhtasim/Apache_Spark/blob/main/1.07.PySpark_Multiple_Linear_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import gdown
import os
import zipfile

# Fetch the dataset archive from Google Drive (only if it is not already present)
# and unpack it next to the archive, i.e. in the current working directory.
# Later cells read the data with a relative path, so the archive must be extracted
# into the same directory the notebook runs from.

# Path where the downloaded archive is saved (relative to the working directory)
output_path = 'dataset.zip'

if os.path.exists(output_path):
    print("File exists!")

else:
    print("File does not exist.")
    # Google Drive ID of the dataset archive
    file_id = '1matQHY1Q4xHT4m_jxVSZ8Oefgk6ZiDi2'
    downloaded = gdown.download(f'https://drive.google.com/uc?id={file_id}', output_path, quiet=False)
    # NOTE(review): some gdown versions return None instead of raising when the
    # download fails; stop here with a clear message rather than at the unzip step.
    if downloaded is None:
        raise RuntimeError(f"Download failed for Google Drive file id {file_id}")

    # Derive both paths from output_path so the download location, the archive that
    # is opened and the extraction directory can never drift apart.
    zip_file_path = os.path.abspath(output_path)
    extract_dir = os.path.dirname(zip_file_path)

    # Extract the contents of the .zip file next to the archive
    with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)

    # List the contents of the extraction directory (pure Python instead of `!ls`,
    # so the cell also runs outside IPython and on any platform)
    extracted_files = sorted(os.listdir(extract_dir))
    print("Files extracted to root directory:", extracted_files)

File exists!


In [2]:
# Start a Spark session for this notebook, or reuse one that is already running
from pyspark.sql import SparkSession

session_builder = SparkSession.builder.appName("MyApp")
spark = session_builder.getOrCreate()
# Last expression: display the session summary as the cell output
spark

# Load Dataset

In [3]:
# Read the tips CSV: the first row is the header, column types are inferred by Spark
csv_options = {"header": True, "inferSchema": True}
ds = spark.read.csv("tips.csv", **csv_options)
ds.show()

+----------+----+------+------+---+------+----+
|total_bill| tip|   sex|smoker|day|  time|size|
+----------+----+------+------+---+------+----+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|
|     26.88|3.12|  Male|    No|Sun|Dinner|   4|
|     15.04|1.96|  Male|    No|Sun|Dinner|   2|
|     14.78|3.23|  Male|    No|Sun|Dinner|   2|
|     10.27|1.71|  Male|    No|Sun|Dinner|   2|
|     35.26| 5.0|Female|    No|Sun|Dinner|   4|
|     15.42|1.57|  Male|    No|Sun|Dinner|   2|
|     18.43| 3.0|  Male|    No|Sun|Dinner|   4|
|     14.83|3.02|Female|    No|Sun|Dinner|   2|
|     21.58|3.92|  Male|    No|Sun|Dinner|   2|
|     10.33|1.67|Female|    No|Sun|Dinner|   3|
|     16.29|3.71|  Male|    No|Sun|Dinne

In [4]:
# Display the column names of the loaded dataset
ds.columns

['total_bill', 'tip', 'sex', 'smoker', 'day', 'time', 'size']

# Feature Engineering

## Handling Categorical Features

In [5]:
from pyspark.ml.feature import StringIndexer

# String-valued columns to convert into numeric index columns;
# each result is stored in a new column named "<column>_indexed".
categorical_cols = ["sex", "smoker", "day", "time"]
indexer = StringIndexer(
    inputCols=categorical_cols,
    outputCols=[f"{col_name}_indexed" for col_name in categorical_cols],
)
indexer_model = indexer.fit(ds)
ds_indexed = indexer_model.transform(ds)
ds_indexed.show()

+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|total_bill| tip|   sex|smoker|day|  time|size|sex_indexed|smoker_indexed|day_indexed|time_indexed|
+----------+----+------+------+---+------+----+-----------+--------------+-----------+------------+
|     16.99|1.01|Female|    No|Sun|Dinner|   2|        1.0|           0.0|        1.0|         0.0|
|     10.34|1.66|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     21.01| 3.5|  Male|    No|Sun|Dinner|   3|        0.0|           0.0|        1.0|         0.0|
|     23.68|3.31|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|
|     24.59|3.61|Female|    No|Sun|Dinner|   4|        1.0|           0.0|        1.0|         0.0|
|     25.29|4.71|  Male|    No|Sun|Dinner|   4|        0.0|           0.0|        1.0|         0.0|
|      8.77| 2.0|  Male|    No|Sun|Dinner|   2|        0.0|           0.0|        1.0|         0.0|


## VectorAssembler

In [6]:
from pyspark.ml.feature import VectorAssembler

# Input columns packed into a single vector column: the two numeric columns first,
# followed by the index-encoded categorical columns.
feature_cols = ['total_bill', 'size', 'sex_indexed', 'smoker_indexed', 'day_indexed', 'time_indexed']
feature_assembler = VectorAssembler(inputCols=feature_cols, outputCol="Independent Features")
output = feature_assembler.transform(ds_indexed)

In [7]:
# Preview only the assembled feature-vector column
features_preview = output.select('Independent Features')
features_preview.show()

+--------------------+
|Independent Features|
+--------------------+
|[16.99,2.0,1.0,0....|
|[10.34,3.0,0.0,0....|
|[21.01,3.0,0.0,0....|
|[23.68,2.0,0.0,0....|
|[24.59,4.0,1.0,0....|
|[25.29,4.0,0.0,0....|
|[8.77,2.0,0.0,0.0...|
|[26.88,4.0,0.0,0....|
|[15.04,2.0,0.0,0....|
|[14.78,2.0,0.0,0....|
|[10.27,2.0,0.0,0....|
|[35.26,4.0,1.0,0....|
|[15.42,2.0,0.0,0....|
|[18.43,4.0,0.0,0....|
|[14.83,2.0,1.0,0....|
|[21.58,2.0,0.0,0....|
|[10.33,3.0,1.0,0....|
|[16.29,3.0,0.0,0....|
|[16.97,3.0,1.0,0....|
|(6,[0,1],[20.65,3...|
+--------------------+
only showing top 20 rows



In [8]:
# Final modelling dataset: the assembled feature vector plus the "tip" column,
# which the regression cell uses as its label.
model_columns = ["Independent Features", "tip"]
finalized_data = output.select(*model_columns)
finalized_data.show()

+--------------------+----+
|Independent Features| tip|
+--------------------+----+
|[16.99,2.0,1.0,0....|1.01|
|[10.34,3.0,0.0,0....|1.66|
|[21.01,3.0,0.0,0....| 3.5|
|[23.68,2.0,0.0,0....|3.31|
|[24.59,4.0,1.0,0....|3.61|
|[25.29,4.0,0.0,0....|4.71|
|[8.77,2.0,0.0,0.0...| 2.0|
|[26.88,4.0,0.0,0....|3.12|
|[15.04,2.0,0.0,0....|1.96|
|[14.78,2.0,0.0,0....|3.23|
|[10.27,2.0,0.0,0....|1.71|
|[35.26,4.0,1.0,0....| 5.0|
|[15.42,2.0,0.0,0....|1.57|
|[18.43,4.0,0.0,0....| 3.0|
|[14.83,2.0,1.0,0....|3.02|
|[21.58,2.0,0.0,0....|3.92|
|[10.33,3.0,1.0,0....|1.67|
|[16.29,3.0,0.0,0....|3.71|
|[16.97,3.0,1.0,0....| 3.5|
|(6,[0,1],[20.65,3...|3.35|
+--------------------+----+
only showing top 20 rows



# Machine Learning

In [9]:
# Train/test split: roughly 75% of the rows for training, 25% for testing.
# A fixed seed keeps the split (and therefore the metrics below) repeatable when the
# notebook is re-run on the same data; without it every run samples a different split.
train_data, test_data = finalized_data.randomSplit([0.75, 0.25], seed=42)

In [10]:
from pyspark.ml.regression import LinearRegression

# Configure the estimator: features come from the assembled vector column,
# the label is the "tip" column.
lr_estimator = LinearRegression(featuresCol='Independent Features', labelCol='tip')
# Fit on the training split; `regressor` holds the fitted model used by later cells.
regressor = lr_estimator.fit(train_data)

## Predictions

In [11]:
# Evaluate the fitted model on the held-out split and preview its predictions
pred_results = regressor.evaluate(test_data)
predictions_df = pred_results.predictions
predictions_df.show()

+--------------------+----+------------------+
|Independent Features| tip|        prediction|
+--------------------+----+------------------+
|(6,[0,1],[9.55,2.0])|1.45| 1.933698562798512|
|(6,[0,1],[12.69,2...| 2.0| 2.216816695894769|
|(6,[0,1],[17.59,3...|2.64|2.8113423541825227|
|(6,[0,1],[18.24,2...|3.76|  2.71723250439293|
|(6,[0,1],[19.82,2...|3.18|2.8596932210464607|
|(6,[0,1],[20.23,2...|2.01| 2.896660875367947|
|(6,[0,1],[20.65,3...|3.35|3.0872472864355625|
|(6,[0,1],[48.33,4...| 9.0| 5.735731884829282|
|[7.56,2.0,0.0,0.0...|1.44|1.8283679312775032|
|[8.35,2.0,1.0,0.0...| 1.5|1.9702063676216728|
|[8.58,1.0,0.0,1.0...|1.92|1.6921308774493269|
|[8.77,2.0,0.0,0.0...| 2.0|1.9004126821400136|
|[9.78,2.0,0.0,0.0...|1.73|2.0285342546767673|
|[9.94,2.0,0.0,0.0...|1.56|2.0059057444720585|
|[10.09,2.0,1.0,1....| 2.0| 2.051605228023176|
|[10.33,2.0,1.0,0....| 2.0| 2.148733088491287|
|[10.34,3.0,0.0,0....|1.66|2.1946888553288706|
|[10.63,2.0,1.0,1....| 2.0|1.9891537666980166|
|[11.59,2.0,0

## Performance Metrics

In [12]:
# Displayed as a tuple in this order: R², mean absolute error, mean squared error
pred_results.r2, pred_results.meanAbsoluteError, pred_results.meanSquaredError

(0.5422114966028786, 0.6543397271207358, 0.9119344249568091)