<a href="https://colab.research.google.com/github/Vivek-afk81/pyspark-learning-notes/blob/main/pySparkML.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Using MLlib


In [None]:
import os
import pyspark
import pandas as pd

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Change to a specific directory
os.chdir('/content/drive/My Drive/pyspark')

# Verify current directory
print(os.getcwd())

/content/drive/My Drive/pyspark


In [None]:
!ls

missing_values_practice.csv  sample_pyspark_data.xlsx	Untitled0.ipynb
pySpark1.ipynb		     student_practice_data.csv
pyspark_mllib_practice.csv   test1.csv


In [None]:
# Create a spark session

from pyspark.sql import SparkSession

spark=SparkSession.builder.appName('ML_practice').getOrCreate()

In [None]:
print(os.getcwd())

/content/drive/MyDrive/pyspark


In [None]:
# Read the dataset

training=spark.read.csv('/content/drive/My Drive/pyspark/pyspark_mllib_practice.csv',header=True,inferSchema=True)

In [None]:
training.show()

+-------------+---+----------+------+---------+------------+----------+-----------------+----------+---------+
|         name|age|experience|salary|education|  department|is_manager|performance_score|hired_date| location|
+-------------+---+----------+------+---------+------------+----------+-----------------+----------+---------+
| Aarav Sharma| 28|         3| 45000|Bachelors|Data Science|         0|              4.2|2021-06-15|    Delhi|
|   Diya Patel| 32|         7| 68000|  Masters|Data Science|         0|              4.6|2018-03-01|   Mumbai|
|  Karan Verma| 40|        15|120000|  Masters| Engineering|         1|              4.8|2010-11-20|Bangalore|
|   Meera Iyer| 26|         2| 42000|Bachelors|     Product|         0|              3.9|2022-01-10|  Chennai|
|  Rohit Singh| 35|        10| 90000|Bachelors| Engineering|         1|              4.1|2014-05-05|     Pune|
|    Sneha Rao| 29|         5| 60000|  Masters|Data Science|         0|              4.3|2019-09-23|Hyderabad|
|

In [None]:
training.columns

['name',
 'age',
 'experience',
 'salary',
 'education',
 'department',
 'is_manager',
 'performance_score',
 'hired_date',
 'location']

In [None]:
'''Instead of looking at these values one by one,
we combine them and treat them as one input for the ML model.
 [age,experience,performance_score]---> new feature --->independent feature'''

'Instead of looking at these values one by one, \nwe combine them and treat them as one input for the ML model.\n [age,experience,performance_score]---> new feature --->independent feature'

In [None]:
from pyspark.ml.feature import VectorAssembler
feature_assembler=VectorAssembler(
    inputCols=['age','experience','performance_score'],
    outputCol="Independent_features"
)

In [None]:
output=feature_assembler.transform(training)
output.show()

+-------------+---+----------+------+---------+------------+----------+-----------------+----------+---------+--------------------+
|         name|age|experience|salary|education|  department|is_manager|performance_score|hired_date| location|Independent_features|
+-------------+---+----------+------+---------+------------+----------+-----------------+----------+---------+--------------------+
| Aarav Sharma| 28|         3| 45000|Bachelors|Data Science|         0|              4.2|2021-06-15|    Delhi|      [28.0,3.0,4.2]|
|   Diya Patel| 32|         7| 68000|  Masters|Data Science|         0|              4.6|2018-03-01|   Mumbai|      [32.0,7.0,4.6]|
|  Karan Verma| 40|        15|120000|  Masters| Engineering|         1|              4.8|2010-11-20|Bangalore|     [40.0,15.0,4.8]|
|   Meera Iyer| 26|         2| 42000|Bachelors|     Product|         0|              3.9|2022-01-10|  Chennai|      [26.0,2.0,3.9]|
|  Rohit Singh| 35|        10| 90000|Bachelors| Engineering|         1|     

In [None]:
output.columns

['name',
 'age',
 'experience',
 'salary',
 'education',
 'department',
 'is_manager',
 'performance_score',
 'hired_date',
 'location',
 'Independent_features']

In [None]:
final_data=output.select('Independent_features','salary')
final_data.show()

+--------------------+------+
|Independent_features|salary|
+--------------------+------+
|      [28.0,3.0,4.2]| 45000|
|      [32.0,7.0,4.6]| 68000|
|     [40.0,15.0,4.8]|120000|
|      [26.0,2.0,3.9]| 42000|
|     [35.0,10.0,4.1]| 90000|
|      [29.0,5.0,4.3]| 60000|
|     [45.0,20.0,4.9]|150000|
|      [31.0,6.0,4.0]| 70000|
|      [24.0,1.0,3.6]| 35000|
|     [38.0,12.0,4.4]| 98000|
|      [27.0,4.0,3.8]| 52000|
|     [50.0,27.0,4.7]|170000|
|      [34.0,9.0,4.1]| 82000|
|      [30.0,6.0,3.9]| 61000|
|      [23.0,0.0,3.2]| 32000|
|     [37.0,11.0,4.5]| 95000|
|     [42.0,18.0,4.6]|135000|
|      [29.0,5.0,3.7]| 58000|
|      [33.0,8.0,4.0]| 76000|
|      [28.0,4.0,4.1]| 54000|
+--------------------+------+
only showing top 20 rows


In [None]:
from pyspark.ml.regression import LinearRegression

train_data,test_data=final_data.randomSplit([.75,.25])
regressor=LinearRegression(featuresCol='Independent_features', labelCol='salary')
regressor=regressor.fit(train_data)

In [None]:
regressor

LinearRegressionModel: uid=LinearRegression_c6c1aa638d9c, numFeatures=3


####Model Coefficients
Each coefficient represents how much the salary changes
when that feature increases by 1 unit, keeping others constant.



In [None]:
#coeff
print(regressor.coefficients)
for feature, coef in zip(['age','experience','performance_score'], regressor.coefficients):
    print(f"{feature}: {coef}")


age: 1787.3501662422125
experience: 3731.5447276926425
performance_score: -357.49574720290803


####Intercept
Intercept represents the base salary when all features are zero.
It is a mathematical starting point, not a real-world salary.


In [None]:
print("Intercept:", regressor.intercept)


Intercept: -9803.78814767789


In [None]:
#Prediction
pred_results=regressor.evaluate(test_data)

In [None]:
pred_results.predictions.show()

+--------------------+------+------------------+
|Independent_features|salary|        prediction|
+--------------------+------+------------------+
|      [26.0,2.0,3.9]| 42000| 42736.17221591358|
|      [30.0,6.0,3.9]| 61000|64811.751791653005|
|      [31.0,6.0,4.0]| 70000| 66563.35238317492|
|      [33.0,8.0,4.0]| 76000| 77601.14217104463|
+--------------------+------+------------------+



##  Regression Evaluation Metrics

### MAE (Mean Absolute Error)
MAE = (1 / N) * Σ | y - y_hat |  
Average absolute prediction error (robust to outliers)



### MSE (Mean Squared Error)
MSE = (1 / N) * Σ ( y - y_hat )²  
Penalizes large errors heavily



### RMSE (Root Mean Squared Error)
RMSE = √[ (1 / N) * Σ ( y - y_hat )² ]  
Error in same units as target



### R² (Coefficient of Determination)
R² = 1 − [ Σ ( y − y_hat )² / Σ ( y − y_mean )² ]  
Explained variance (1 = perfect)



### Adjusted R²
Adjusted R² = 1 − (1 − R²) * (n − 1) / (n − p − 1)  
Penalizes unnecessary features



### MAPE (Mean Absolute Percentage Error)
MAPE = (100 / N) * Σ | ( y − y_hat ) / y |  
Scale-independent ( undefined when y = 0)


### Metric Selection
- Outliers present → MAE  
- Large errors costly → MSE / RMSE  
- Model comparison → RMSE, R²  
- Different scales → MAPE  

---


In [None]:
print("MSE:", pred_results.meanSquaredError)
print("RMSE:", pred_results.rootMeanSquaredError)
print("R2 Score:", pred_results.r2)


MSE: 7361401.086695009
RMSE: 2713.190204665904
R2 Score: 0.9554360887676427


####Residuals (prediction error)

In [None]:
pred_results.predictions.select(
    "Independent_features",
    "salary",
    "prediction",
    (pred_results.predictions.salary - pred_results.predictions.prediction).alias("residual")
).show()


+--------------------+------+------------------+-------------------+
|Independent_features|salary|        prediction|           residual|
+--------------------+------+------------------+-------------------+
|      [26.0,2.0,3.9]| 42000| 42736.17221591358| -736.1722159135825|
|      [30.0,6.0,3.9]| 61000|64811.751791653005| -3811.751791653005|
|      [31.0,6.0,4.0]| 70000| 66563.35238317492|  3436.647616825081|
|      [33.0,8.0,4.0]| 76000| 77601.14217104463|-1601.1421710446302|
+--------------------+------+------------------+-------------------+



### Classification model

Instead of predicting exact salary, we will predict a class:

for example: “Is this employee High Salary or Not High Salary?”

In [None]:
from pyspark.ml import classification
from pyspark.sql.functions import when

classification_data=output.withColumn("label",when(output.salary>=80000,1).otherwise(0))
classification_data.select("salary","label").show()

+------+-----+
|salary|label|
+------+-----+
| 45000|    0|
| 68000|    0|
|120000|    1|
| 42000|    0|
| 90000|    1|
| 60000|    0|
|150000|    1|
| 70000|    0|
| 35000|    0|
| 98000|    1|
| 52000|    0|
|170000|    1|
| 82000|    1|
| 61000|    0|
| 32000|    0|
| 95000|    1|
|135000|    1|
| 58000|    0|
| 76000|    0|
| 54000|    0|
+------+-----+
only showing top 20 rows


In [None]:
# Independent features remains same
final_clf_data=classification_data.select(
    "Independent_features",
    "label"
)

In [None]:
# TRain Test Split

train_clf,test_clf=final_clf_data.randomSplit([.75,.25],seed=42)

In [None]:
from pyspark.ml.classification import LogisticRegression

LR_clf=LogisticRegression(
    featuresCol="Independent_features",
    labelCol="label"
)

LR_model=LR_clf.fit(train_clf)

In [None]:
LR_model

LogisticRegressionModel: uid=LogisticRegression_8e4240ef955e, numClasses=2, numFeatures=3

In [None]:
#Predictions

predictions=LR_model.transform(test_clf)
predictions.select("label","prediction","probability").show()

+-----+----------+--------------------+
|label|prediction|         probability|
+-----+----------+--------------------+
|    0|       0.0|           [1.0,0.0]|
|    0|       0.0|           [1.0,0.0]|
|    0|       0.0|[0.99999999997949...|
|    0|       0.0|           [1.0,0.0]|
|    0|       1.0|[0.00115280442558...|
|    1|       1.0|[2.63073090138506...|
|    1|       1.0|[7.67812909265611...|
+-----+----------+--------------------+



### Classification Evaluation Metrics

### Confusion Matrix
- **TP**: Predicted +, Actual +  
- **TN**: Predicted −, Actual −  
- **FP**: Predicted +, Actual −  
- **FN**: Predicted −, Actual +  


### Accuracy

Overall correctness ( unreliable for imbalanced data)


### Precision
Prediction quality for positive class (controls **FP**)


### Recall (Sensitivity / TPR)
Coverage of actual positives (controls **FN**)


### F1-Score

Balance between Precision & Recall (robust to imbalance)



### Specificity (TNR)
Correctly identified negatives



### ROC–AUC
- **ROC**: TPR vs FPR across thresholds  
- **AUC**: Class separability (0.5=random, 1=perfect)



### Metric Selection
- **Balanced data** → Accuracy  
- **Imbalanced data** → F1 / ROC–AUC  
- **Avoid FP** → Precision  
- **Avoid FN** → Recall  

---


In [53]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
evaluator = BinaryClassificationEvaluator()
print("AUC Score:", evaluator.evaluate(predictions))


AUC Score: 1.0


###Feature Scaling


In [54]:
from pyspark.ml.feature import StandardScaler

scaler = StandardScaler(
    inputCol="Independent_features",
    outputCol="Scaled_features"
)

scaler_model = scaler.fit(classification_data)
scaled_data = scaler_model.transform(classification_data)

scaled_data.select("Scaled_features").show()


+--------------------+
|     Scaled_features|
+--------------------+
|[3.87979380895428...|
|[4.43405006737632...|
|[5.54256258422040...|
|[3.60266567974326...|
|[4.84974226119285...|
|[4.01835787355979...|
|[6.23538290724795...|
|[4.29548600277081...|
|[3.32553755053224...|
|[5.26543445500938...|
|[3.74122974434877...|
|[6.92820323027550...|
|[4.71117819658734...|
|[4.15692193816530...|
|[3.18697348592673...|
|[5.12687039040387...|
|[5.81969071343142...|
|[4.01835787355979...|
|[4.57261413198183...|
|[3.87979380895428...|
+--------------------+
only showing top 20 rows


In [56]:
#using scaled features
final_scaled_data = scaled_data.select(
    "Scaled_features",
    "label"
)

train_scaled, test_scaled = final_scaled_data.randomSplit([0.75, 0.25], seed=42)


In [57]:
lr_scaled = LogisticRegression(
    featuresCol="Scaled_features",
    labelCol="label"
)

lr_scaled_model = lr_scaled.fit(train_scaled)


Notice that the AUC Score is 1

Small datasets are easy to overfit, and our dataset ony has 25 rows with no noise or contradictions


In [58]:
scaled_predictions = lr_scaled_model.transform(test_scaled)
print("Scaled AUC:", evaluator.evaluate(scaled_predictions))


Scaled AUC: 1.0


In [59]:
## Using RandomForest Classifier
from pyspark.ml.classification import RandomForestClassifier

rf = RandomForestClassifier(
    featuresCol="Scaled_features",
    labelCol="label",
    numTrees=50
)

rf_model = rf.fit(train_scaled)
rf_preds = rf_model.transform(test_scaled)

print("Random Forest AUC:", evaluator.evaluate(rf_preds))


Random Forest AUC: 1.0
