## Pyspark Dataframes

In [1]:
#Testing Pyspark Installation
import findspark

findspark.init()
findspark.find()

import pyspark
findspark.find()

'C:\\spark'

In [2]:
#Filter Operations 
# & | ==
#intialize Spark Context
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
spark= SparkSession.builder.appName('Salary').getOrCreate()

In [3]:
salp=spark.read.csv('salary.csv',header=True,inferSchema=True)
salp.show(4)

+------+-----------+---------+-----------+------+--------+-----+
|salary|yearsworked|yearsrank|     market|degree|position|Field|
+------+-----------+---------+-----------+------+--------+-----+
| 53000|          0|        0|1.169999957|     1|       1|    3|
| 58000|          0|        0| 1.24000001|     1|       1|    2|
| 45500|          0|        0|1.210000038|     1|       1|    3|
| 35782|          2|        1| 0.99000001|     1|       1|    4|
+------+-----------+---------+-----------+------+--------+-----+
only showing top 4 rows



## Filer Operations

In [4]:
#Salary of the People Less = to 30000
salp.filter('salary<=30000').show(5)

+------+-----------+---------+-----------+------+--------+-----+
|salary|yearsworked|yearsrank|     market|degree|position|Field|
+------+-----------+---------+-----------+------+--------+-----+
| 30000|          0|        0|0.779999971|     0|       1|    3|
| 29500|          0|        0|0.790000021|     1|       1|    3|
| 29000|          0|        0|0.779999971|     0|       1|    3|
| 30000|          1|        0|0.790000021|     1|       1|    3|
| 30000|          0|        0|0.810000002|     1|       1|    3|
+------+-----------+---------+-----------+------+--------+-----+



In [5]:
salp.filter('salary<=30000').select(['salary','degree']).show(5)

+------+------+
|salary|degree|
+------+------+
| 30000|     0|
| 29500|     1|
| 29000|     0|
| 30000|     1|
| 30000|     1|
+------+------+



In [6]:
salp.filter((salp['salary']<=30000) & (salp['salary']>=20000)).show(5)

+------+-----------+---------+-----------+------+--------+-----+
|salary|yearsworked|yearsrank|     market|degree|position|Field|
+------+-----------+---------+-----------+------+--------+-----+
| 30000|          0|        0|0.779999971|     0|       1|    3|
| 29500|          0|        0|0.790000021|     1|       1|    3|
| 29000|          0|        0|0.779999971|     0|       1|    3|
| 30000|          1|        0|0.790000021|     1|       1|    3|
| 30000|          0|        0|0.810000002|     1|       1|    3|
+------+-----------+---------+-----------+------+--------+-----+



In [7]:
salp.filter(~(salp['salary']<=30000)).select(['salary','degree']).show(5)

+------+------+
|salary|degree|
+------+------+
| 53000|     1|
| 58000|     1|
| 45500|     1|
| 35782|     1|
| 34731|     1|
+------+------+
only showing top 5 rows



## PySpark GroupBy & Aggregate Funcions

In [8]:
salp.printSchema()

root
 |-- salary: integer (nullable = true)
 |-- yearsworked: integer (nullable = true)
 |-- yearsrank: integer (nullable = true)
 |-- market: double (nullable = true)
 |-- degree: integer (nullable = true)
 |-- position: integer (nullable = true)
 |-- Field: integer (nullable = true)



In [9]:
#Grouping
salp.groupby('position').sum().show(10)
#Grouped to find Maximum Salery

+--------+-----------+----------------+--------------+-------------+-----------+-------------+----------+
|position|sum(salary)|sum(yearsworked)|sum(yearsrank)|  sum(market)|sum(degree)|sum(position)|sum(Field)|
+--------+-----------+----------------+--------------+-------------+-----------+-------------+----------+
|       1|    5700818|             452|           417|136.850000977|        137|          143|       414|
|       3|   12714723|            4199|          2152| 197.74000091|        203|          624|       440|
|       2|    7505742|            1860|          1001| 149.26000059|        152|          318|       435|
+--------+-----------+----------------+--------------+-------------+-----------+-------------+----------+



In [10]:
salp.groupby('position').count().show(10) #Also Consisting Max, avg ,min

+--------+-----+
|position|count|
+--------+-----+
|       1|  143|
|       3|  208|
|       2|  159|
+--------+-----+



In [29]:
from pyspark.ml.feature import VectorAssembler
featureassembler=VectorAssembler(inputCols=['salary','Field'],
                                 outputCol="Independent Features")

In [30]:
output=featureassembler.transform(salp)

In [31]:
output.show(5)

+------+-----------+---------+-----------+------+--------+-----+--------------------+
|salary|yearsworked|yearsrank|     market|degree|position|Field|Independent Features|
+------+-----------+---------+-----------+------+--------+-----+--------------------+
| 53000|          0|        0|1.169999957|     1|       1|    3|       [53000.0,3.0]|
| 58000|          0|        0| 1.24000001|     1|       1|    2|       [58000.0,2.0]|
| 45500|          0|        0|1.210000038|     1|       1|    3|       [45500.0,3.0]|
| 35782|          2|        1| 0.99000001|     1|       1|    4|       [35782.0,4.0]|
| 34731|          2|        2|0.910000026|     1|       1|    4|       [34731.0,4.0]|
+------+-----------+---------+-----------+------+--------+-----+--------------------+
only showing top 5 rows



In [32]:
output.columns

['salary',
 'yearsworked',
 'yearsrank',
 'market',
 'degree',
 'position',
 'Field',
 'Independent Features']

In [33]:
output.select("Independent Features","salary").show(5)

+--------------------+------+
|Independent Features|salary|
+--------------------+------+
|       [53000.0,3.0]| 53000|
|       [58000.0,2.0]| 58000|
|       [45500.0,3.0]| 45500|
|       [35782.0,4.0]| 35782|
|       [34731.0,4.0]| 34731|
+--------------------+------+
only showing top 5 rows



In [34]:
final=output.select("Independent Features","salary")

In [35]:
from pyspark.ml.regression import LinearRegression
#Train Test Split
train_data,test_data=final.randomSplit([0.7, 0.3])
regressor=LinearRegression(featuresCol='Independent Features', labelCol='salary')
regressor=regressor.fit(train_data)

In [36]:
##Coefficients
regressor.coefficients

DenseVector([1.0, -0.0])

In [37]:
## Intercepts
regressor.intercept

1.2859587591397332e-10

In [38]:
#Predictions
pred=regressor.evaluate(test_data)

In [39]:
pred.predictions.show(5)

+--------------------+------+------------------+
|Independent Features|salary|        prediction|
+--------------------+------+------------------+
|       [29000.0,3.0]| 29000|29000.000000000015|
|       [29500.0,3.0]| 29500|29500.000000000015|
|       [30662.0,3.0]| 30662|30662.000000000015|
|       [30927.0,3.0]| 30927|30927.000000000015|
|       [31577.0,3.0]| 31577| 31577.00000000001|
+--------------------+------+------------------+
only showing top 5 rows



In [41]:
pred.meanAbsoluteError,pred.meanSquaredError

(2.3002872208815925e-11, 7.551285572086571e-22)