<a href="https://colab.research.google.com/github/Vivek-afk81/pyspark-learning-notes/blob/main/pySparkML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Using MLlib


In [11]:
import os
import pyspark
import pandas as pd

In [12]:
from google.colab import drive

# Attach Google Drive so files stored there are reachable from the notebook
drive_mount_point = '/content/drive'
drive.mount(drive_mount_point)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [13]:
# Switch the working directory to the project folder on the mounted Drive
project_dir = '/content/drive/My Drive/pyspark'
os.chdir(project_dir)

# Print the resulting working directory as a sanity check
print(os.getcwd())

/content/drive/My Drive/pyspark


In [14]:
# List the files in the current working directory
!ls

missing_values_practice.csv  sample_pyspark_data.xlsx	Untitled0.ipynb
pySpark1.ipynb		     student_practice_data.csv
pyspark_mllib_practice.csv   test1.csv


In [16]:
from pyspark.sql import SparkSession

# Build the session in two steps: configure the builder, then reuse an
# already-running session if one exists or start a new one otherwise.
session_builder = SparkSession.builder.appName('ML_practice')
spark = session_builder.getOrCreate()

In [18]:
# Re-check the current working directory
print(os.getcwd())

/content/drive/MyDrive/pyspark


In [19]:
# Load the practice dataset: the first row holds the column names, and Spark
# infers each column's type instead of reading everything as strings.
csv_path = '/content/drive/My Drive/pyspark/pyspark_mllib_practice.csv'
training = spark.read.csv(
    csv_path,
    header=True,
    inferSchema=True,
)

In [20]:
# Print the first rows of the loaded DataFrame as a text table
training.show()

+-------------+---+----------+------+---------+------------+----------+-----------------+----------+---------+
|         name|age|experience|salary|education|  department|is_manager|performance_score|hired_date| location|
+-------------+---+----------+------+---------+------------+----------+-----------------+----------+---------+
| Aarav Sharma| 28|         3| 45000|Bachelors|Data Science|         0|              4.2|2021-06-15|    Delhi|
|   Diya Patel| 32|         7| 68000|  Masters|Data Science|         0|              4.6|2018-03-01|   Mumbai|
|  Karan Verma| 40|        15|120000|  Masters| Engineering|         1|              4.8|2010-11-20|Bangalore|
|   Meera Iyer| 26|         2| 42000|Bachelors|     Product|         0|              3.9|2022-01-10|  Chennai|
|  Rohit Singh| 35|        10| 90000|Bachelors| Engineering|         1|              4.1|2014-05-05|     Pune|
|    Sneha Rao| 29|         5| 60000|  Masters|Data Science|         0|              4.3|2019-09-23|Hyderabad|
|

In [21]:
# List the column names available as candidate features / label
training.columns

['name',
 'age',
 'experience',
 'salary',
 'education',
 'department',
 'is_manager',
 'performance_score',
 'hired_date',
 'location']

In [22]:
# Rather than feeding each value to the model separately, we pack them into a
# single vector column that serves as the model's input:
#   [age, experience, performance_score] ---> one vector column ---> independent feature

'Instead of looking at these values one by one, \nwe combine them and treat them as one input for the ML model.\n [age,experience,performance_score]---> new feature --->independent feature'

In [24]:
from pyspark.ml.feature import VectorAssembler

# Numeric columns that get packed, in this order, into one vector column
feature_columns = ['age', 'experience', 'performance_score']
feature_assembler = VectorAssembler(inputCols=feature_columns, outputCol="Independent_features")

In [25]:
# Apply the assembler: returns the input rows plus the new vector column
output=feature_assembler.transform(training)
# Preview the result, including the appended Independent_features column
output.show()

+-------------+---+----------+------+---------+------------+----------+-----------------+----------+---------+--------------------+
|         name|age|experience|salary|education|  department|is_manager|performance_score|hired_date| location|Independent_features|
+-------------+---+----------+------+---------+------------+----------+-----------------+----------+---------+--------------------+
| Aarav Sharma| 28|         3| 45000|Bachelors|Data Science|         0|              4.2|2021-06-15|    Delhi|      [28.0,3.0,4.2]|
|   Diya Patel| 32|         7| 68000|  Masters|Data Science|         0|              4.6|2018-03-01|   Mumbai|      [32.0,7.0,4.6]|
|  Karan Verma| 40|        15|120000|  Masters| Engineering|         1|              4.8|2010-11-20|Bangalore|     [40.0,15.0,4.8]|
|   Meera Iyer| 26|         2| 42000|Bachelors|     Product|         0|              3.9|2022-01-10|  Chennai|      [26.0,2.0,3.9]|
|  Rohit Singh| 35|        10| 90000|Bachelors| Engineering|         1|     

In [26]:
# Confirm the assembled vector column now sits alongside the original columns
output.columns

['name',
 'age',
 'experience',
 'salary',
 'education',
 'department',
 'is_manager',
 'performance_score',
 'hired_date',
 'location',
 'Independent_features']

In [28]:
# Keep only what the regression needs: the feature vector and the label
model_columns = ['Independent_features', 'salary']
final_data = output.select(*model_columns)
final_data.show()

+--------------------+------+
|Independent_features|salary|
+--------------------+------+
|      [28.0,3.0,4.2]| 45000|
|      [32.0,7.0,4.6]| 68000|
|     [40.0,15.0,4.8]|120000|
|      [26.0,2.0,3.9]| 42000|
|     [35.0,10.0,4.1]| 90000|
|      [29.0,5.0,4.3]| 60000|
|     [45.0,20.0,4.9]|150000|
|      [31.0,6.0,4.0]| 70000|
|      [24.0,1.0,3.6]| 35000|
|     [38.0,12.0,4.4]| 98000|
|      [27.0,4.0,3.8]| 52000|
|     [50.0,27.0,4.7]|170000|
|      [34.0,9.0,4.1]| 82000|
|      [30.0,6.0,3.9]| 61000|
|      [23.0,0.0,3.2]| 32000|
|     [37.0,11.0,4.5]| 95000|
|     [42.0,18.0,4.6]|135000|
|      [29.0,5.0,3.7]| 58000|
|      [33.0,8.0,4.0]| 76000|
|      [28.0,4.0,4.1]| 54000|
+--------------------+------+
only showing top 20 rows


In [29]:
from pyspark.ml.regression import LinearRegression

# Split roughly 75% / 25% into train and test sets. The fixed seed makes the
# split -- and therefore the coefficients and metrics reported below --
# repeatable across "Restart & Run All"; without it every run samples
# different rows.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so it is not confused with
# the fitted model it produces.
lr_estimator = LinearRegression(featuresCol='Independent_features', labelCol='salary')

# `regressor` is the fitted LinearRegressionModel; later cells read its
# coefficients and intercept and call evaluate() on it.
regressor = lr_estimator.fit(train_data)

In [32]:
# Display the fitted model's repr (its uid and number of features)
regressor

LinearRegressionModel: uid=LinearRegression_c6c1aa638d9c, numFeatures=3


#### Model Coefficients
Each coefficient represents how much the salary changes
when that feature increases by 1 unit, keeping others constant.



In [38]:
# Print the raw coefficient vector first, then one "name: value" line per feature.
print(regressor.coefficients)

# Read the feature names from the assembler rather than repeating the list by
# hand: the coefficient order follows the assembler's input column order, and a
# hand-copied list would silently mislabel values if the inputs ever changed.
feature_names = feature_assembler.getInputCols()
for feature, coef in zip(feature_names, regressor.coefficients):
    print(f"{feature}: {coef}")


age: 1787.3501662422125
experience: 3731.5447276926425
performance_score: -357.49574720290803


#### Intercept
The intercept represents the base salary when all features are zero.
It is a mathematical starting point, not a real-world salary.


In [41]:
# Show the intercept term of the fitted model
print("Intercept:", regressor.intercept)


Intercept: -9803.78814767789


In [35]:
# Evaluate the fitted model on the held-out test split. The returned summary
# is used below for both the per-row predictions (.predictions) and the error
# metrics (.meanSquaredError, .rootMeanSquaredError, .r2).
pred_results=regressor.evaluate(test_data)

In [36]:
# Show each test row's features and actual salary next to the predicted salary
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent_features|salary|        prediction|
+--------------------+------+------------------+
|      [26.0,2.0,3.9]| 42000| 42736.17221591358|
|      [30.0,6.0,3.9]| 61000|64811.751791653005|
|      [31.0,6.0,4.0]| 70000| 66563.35238317492|
|      [33.0,8.0,4.0]| 76000| 77601.14217104463|
+--------------------+------+------------------+



#### Evaluation metrics
With respect to our data:

MSE → average squared error

RMSE → error in salary units (₹)

R² → how well the model explains salary (1 is a perfect fit; usually 0–1, but it can be negative on test data)

In [42]:
# Collect the test-set error metrics as (label, value) pairs and print one per line
metric_lines = [
    ("MSE:", pred_results.meanSquaredError),
    ("RMSE:", pred_results.rootMeanSquaredError),
    ("R2 Score:", pred_results.r2),
]
for label, value in metric_lines:
    print(label, value)


MSE: 7361401.086695009
RMSE: 2713.190204665904
R2 Score: 0.9554360887676427


#### Residuals (prediction error)

In [43]:
# Residual = actual salary minus predicted salary, shown beside each test row
predictions = pred_results.predictions
residual = (predictions.salary - predictions.prediction).alias("residual")
predictions.select("Independent_features", "salary", "prediction", residual).show()


+--------------------+------+------------------+-------------------+
|Independent_features|salary|        prediction|           residual|
+--------------------+------+------------------+-------------------+
|      [26.0,2.0,3.9]| 42000| 42736.17221591358| -736.1722159135825|
|      [30.0,6.0,3.9]| 61000|64811.751791653005| -3811.751791653005|
|      [31.0,6.0,4.0]| 70000| 66563.35238317492|  3436.647616825081|
|      [33.0,8.0,4.0]| 76000| 77601.14217104463|-1601.1421710446302|
+--------------------+------+------------------+-------------------+

