<a href="https://colab.research.google.com/github/Rajaanthonysamy/pyspark/blob/main/06_pyspark_mllib.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np

# Build a small synthetic "people" dataset (20 rows) and persist it as a CSV.
np.random.seed(42)  # fixed seed so the generated values are repeatable

# All three numeric columns draw from the same seeded global stream, so the
# order of these calls (age, experience, salary) determines the values.
ages = np.random.randint(20, 60, 20).tolist()
experiences = np.random.randint(0, 30, 20).tolist()
salaries = np.random.randint(30000, 120000, 20).tolist()
names = [f'Person_{idx}' for idx in range(1, 21)]

data = {
    'name': names,
    'age': ages,
    'experience': experiences,
    'salary': salaries,
}

# Tabulate the columns as a pandas DataFrame.
df = pd.DataFrame(data)

# Write the table to disk without the pandas index column.
csv_file_path = 'sample_data.csv'
df.to_csv(csv_file_path, index=False)

print(f"CSV file '{csv_file_path}' created successfully!")

# Preview the first rows of the generated data.
display(df.head())

CSV file 'sample_data.csv' created successfully!


Unnamed: 0,name,age,experience,salary
0,Person_1,58,1,114654
1,Person_2,48,27,65773
2,Person_3,34,20,97435
3,Person_4,27,0,86886
4,Person_5,40,11,96803


In [2]:
from pyspark.sql import SparkSession

In [3]:
# Obtain the Spark session used by every later cell (an already-running
# session is reused if there is one).
spark_session = (
    SparkSession.builder
    .appName("CSV to Parquet")
    .getOrCreate()
)

In [5]:
# Load the CSV into a Spark DataFrame: the first row supplies the column
# names and the column types are inferred from the data.
df = (
    spark_session.read
    .options(header=True, inferSchema=True)
    .csv("sample_data.csv")
)

In [6]:
# Print the column names and the types Spark inferred for them.
df.printSchema()

root
 |-- name: string (nullable = true)
 |-- age: integer (nullable = true)
 |-- experience: integer (nullable = true)
 |-- salary: integer (nullable = true)



In [7]:
# List the column names of the loaded DataFrame.
df.columns

['name', 'age', 'experience', 'salary']

In [8]:
from pyspark.ml.feature import VectorAssembler

# Assembler that packs the two numeric predictors into a single vector
# column, which is the feature column the regressor is later pointed at.
vector = VectorAssembler(
    inputCols=['age', 'experience'],
    outputCol='Independent Features',
)

In [9]:
# Apply the assembler: `output` is `df` plus the 'Independent Features' vector column.
output = vector.transform(df)

In [10]:
# Display the rows together with their assembled feature vectors.
output.show()

+---------+---+----------+------+--------------------+
|     name|age|experience|salary|Independent Features|
+---------+---+----------+------+--------------------+
| Person_1| 58|         1|114654|          [58.0,1.0]|
| Person_2| 48|        27| 65773|         [48.0,27.0]|
| Person_3| 34|        20| 97435|         [34.0,20.0]|
| Person_4| 27|         0| 86886|          [27.0,0.0]|
| Person_5| 40|        11| 96803|         [40.0,11.0]|
| Person_6| 58|        25| 61551|         [58.0,25.0]|
| Person_7| 38|        21| 41394|         [38.0,21.0]|
| Person_8| 42|        28| 99092|         [42.0,28.0]|
| Person_9| 30|        11| 33890|         [30.0,11.0]|
|Person_10| 30|        24| 71606|         [30.0,24.0]|
|Person_11| 43|        16|110038|         [43.0,16.0]|
|Person_12| 55|        26|117313|         [55.0,26.0]|
|Person_13| 59|        26| 40627|         [59.0,26.0]|
|Person_14| 43|         9| 38792|          [43.0,9.0]|
|Person_15| 22|        27|103969|         [22.0,27.0]|
|Person_16

In [11]:
# Keep only the columns the model uses: the assembled feature vector and the
# 'salary' label.
df_input_data = output.select(['Independent Features', 'salary'])

In [12]:
from pyspark.ml.regression import LinearRegression
# Randomly split the rows into roughly 75% training and 25% test data.
# The explicit seed pins the random assignment so that, for the same input
# data, the fitted coefficients and the error metrics below do not change
# from one run of the notebook to the next.
train_data, test_data = df_input_data.randomSplit([0.75, 0.25], seed=42)

# Linear regression estimator: features come from the assembled vector
# column, the target is 'salary'.
regressor = LinearRegression(featuresCol='Independent Features', labelCol='salary')

In [14]:
# Fit the linear model on the training split.
reg = regressor.fit(train_data)

In [15]:
# Learned weights, one per assembled feature (age, experience).
reg.coefficients

DenseVector([-943.1726, 1899.6857])

In [16]:
# Fitted intercept term of the model.
reg.intercept

74862.48596035271

In [20]:
# Evaluate the fitted model on the held-out test split; the result exposes
# the per-row predictions and the error metrics read in the cells below.
pred_res = reg.evaluate(test_data)

In [21]:
# Show each test row's actual salary next to the model's prediction.
pred_res.predictions.show()

+--------------------+------+------------------+
|Independent Features|salary|        prediction|
+--------------------+------+------------------+
|         [22.0,27.0]|103969|105404.20216310205|
|          [27.0,0.0]| 86886| 49396.82513458326|
|         [30.0,24.0]| 71606| 92159.76407628492|
|         [43.0,16.0]|110038| 64701.03437124465|
|         [48.0,27.0]| 65773| 80881.71396050924|
|         [49.0,29.0]| 98148| 83737.91273826855|
|          [58.0,1.0]|114654|22058.159516575208|
|         [59.0,26.0]| 40627| 68607.12940509814|
+--------------------+------+------------------+



In [22]:
# Test-set error metrics as a tuple: (mean absolute error, mean squared error).
pred_res.meanAbsoluteError, pred_res.meanSquaredError


(31863.73473054033, 1709774609.5317976)