In [5]:
from pyspark.sql import SparkSession
import pandas as pd

In [3]:
# Start the Spark session for this notebook, or reuse one that is already
# running in this kernel (getOrCreate returns the existing session if any).
spark = (
    SparkSession.builder
    .appName('Practise')
    .getOrCreate()
)

In [4]:
# Bare expression: evaluate the session object so the notebook renders it
# as this cell's output (a quick check that the session exists).
spark

In [6]:
# Load the California housing test CSV.
#   header=True      -> treat the first row as column names
#   inferSchema=True -> let Spark infer each column's data type from the data
#                       instead of reading everything as strings
df = (
    spark.read
    .options(header=True, inferSchema=True)
    .csv('/content/sample_data/california_housing_test.csv')
)

In [34]:
# Print the first 3 rows as a text table to sanity-check the loaded columns.
df.show(3)

+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|longitude|latitude|housing_median_age|total_rooms|total_bedrooms|population|households|median_income|median_house_value|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
|  -122.05|   37.37|              27.0|     3885.0|         661.0|    1537.0|     606.0|       6.6085|          344700.0|
|   -118.3|   34.26|              43.0|     1510.0|         310.0|     809.0|     277.0|        3.599|          176500.0|
|  -117.81|   33.78|              27.0|     3589.0|         507.0|    1484.0|     495.0|       5.7934|          270500.0|
+---------+--------+------------------+-----------+--------------+----------+----------+-------------+------------------+
only showing top 3 rows



In [36]:
# Keep rows whose housing_median_age is at least 27, project just the two
# columns of interest, and print the first 3 matching rows.
aged_27_plus = df.where(df['housing_median_age'] >= 27)
aged_27_plus.select('latitude', 'housing_median_age').show(3)

+--------+------------------+
|latitude|housing_median_age|
+--------+------------------+
|   37.37|              27.0|
|   34.26|              43.0|
|   33.78|              27.0|
+--------+------------------+
only showing top 3 rows



In [37]:
# Load the dataset used by the group-by and regression cells that follow.
# NOTE: this rebinds `df`, replacing the housing DataFrame loaded earlier.
# The first row supplies the column names and column types are inferred.
df = spark.read.csv(
    '/content/test.csv',
    inferSchema=True,
    header=True,
)

In [38]:
# Print the first 3 rows of the newly loaded DataFrame to confirm its columns.
df.show(3)

+---+-------+---+------+---------+------+-----------+----------+-----------------+
| id|   name|age|gender|     city|salary| department| join_date|performance_score|
+---+-------+---+------+---------+------+-----------+----------+-----------------+
|  1|  Alice| 23|Female|   Mumbai| 55000|         HR|2022-03-15|              8.2|
|  2|    Bob| 34|  Male|    Delhi| 72000|Engineering|2021-07-22|              7.5|
|  3|Charlie| 29|  Male|Bangalore| 61000|  Marketing|2023-01-10|              6.9|
+---+-------+---+------+---------+------+-----------+----------+-----------------+
only showing top 3 rows



In [41]:
# Group-by operation: total salary per gender. Rows with a missing gender
# form their own NULL group in the result.
salary_by_gender = df.groupBy('gender').agg({'salary': 'sum'})
salary_by_gender.show()

+------+-----------+
|gender|sum(salary)|
+------+-----------+
|  NULL|       NULL|
|Female|     566000|
|  Male|     698000|
+------+-----------+



In [65]:
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import col, isnan

# The model inputs (age, salary) and the label (performance_score) must all
# hold real values, so discard any row where one of them is null...
model_columns = ["age", "salary", "performance_score"]
df = df.na.drop(subset=model_columns)

# ...and any row where one of them is NaN. Written as "every column is
# not-NaN", the De Morgan equivalent of "not (any column is NaN)".
df = df.filter(
    ~isnan(col("age"))
    & ~isnan(col("salary"))
    & ~isnan(col("performance_score"))
)

# Packs the two feature columns into a single vector column named 'X'.
featureassembler = VectorAssembler(
    outputCol='X',
    inputCols=['age', 'salary'],
)


In [66]:
# Apply the assembler: returns the DataFrame with the feature vector column
# 'X' (built from age and salary) appended to the existing columns.
output=featureassembler.transform(df)


In [67]:
# Print the first 3 rows to verify the new 'X' vector column.
output.show(3)

+---+-------+---+------+---------+------+-----------+----------+-----------------+--------------+
| id|   name|age|gender|     city|salary| department| join_date|performance_score|             X|
+---+-------+---+------+---------+------+-----------+----------+-----------------+--------------+
|  1|  Alice| 23|Female|   Mumbai| 55000|         HR|2022-03-15|              8.2|[23.0,55000.0]|
|  2|    Bob| 34|  Male|    Delhi| 72000|Engineering|2021-07-22|              7.5|[34.0,72000.0]|
|  3|Charlie| 29|  Male|Bangalore| 61000|  Marketing|2023-01-10|              6.9|[29.0,61000.0]|
+---+-------+---+------+---------+------+-----------+----------+-----------------+--------------+
only showing top 3 rows



In [68]:
# Reduce the frame to what the regression needs: the feature vector 'X' and
# the label 'performance_score', dropping any row that still contains a null.
final_data = (
    output
    .select('X', 'performance_score')
    .na.drop()
)

In [69]:
from pyspark.ml.regression import LinearRegression

# Seed the 75/25 split so the train/test partition, and every coefficient and
# metric derived from it, is reproducible across kernel restarts. Without a
# seed, randomSplit draws a different partition on every run.
train_data, test_data = final_data.randomSplit([0.75, 0.25], seed=42)

# Keep the unfitted estimator under its own name so that `regressor` only ever
# refers to the fitted model queried by the following cells.
lr_estimator = LinearRegression(featuresCol='X', labelCol='performance_score')
regressor = lr_estimator.fit(train_data)

In [70]:
# Show the intercept term learned by the fitted linear regression model.
regressor.intercept

9.593835463301682

In [76]:
# Score the fitted model on the held-out test split; the returned summary
# object carries the error metrics read in the next cell.
pred_results=regressor.evaluate(test_data)

In [77]:
# Display (mean absolute error, mean squared error) on the test split.
pred_results.meanAbsoluteError,pred_results.meanSquaredError

(0.4545144074693399, 0.29938206252284505)