In [1]:
import os
import sys

# Run the PySpark workers with the same interpreter as this kernel, so the
# driver and the workers cannot end up on different Python installations.
# Fall back to the plain 'python' command if the interpreter path is unknown.
os.environ['PYSPARK_PYTHON'] = sys.executable or 'python'

In [3]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import avg
from pyspark.sql.functions import col

In [5]:
# Build the SparkSession for the DataFrame part of the notebook
# (getOrCreate returns an already-running session if there is one)
spark = (
    SparkSession.builder
    .appName("Average Salary Calculation")
    .getOrCreate()
)

# Report the Spark release this session runs on
print(spark.version)

3.4.1


In [7]:
# Input file holding the employee records (read below as tab-delimited text)
file_path = "employee_data.txt"

In [9]:
# Load the text file into a DataFrame: the first line supplies the column
# names and the fields on each line are separated by tabs
df = (
    spark.read
    .option("header", "true")
    .option("delimiter", "\t")
    .csv(file_path)
)

In [11]:
# Print the column names and the types Spark assigned to the loaded DataFrame
df.printSchema()

root
 |-- EmployeeID: string (nullable = true)
 |-- Name: string (nullable = true)
 |-- Age: string (nullable = true)
 |-- Department: string (nullable = true)
 |-- Salary: string (nullable = true)



In [13]:
# Preview the first five rows of the DataFrame
df.show(n=5)

+----------+-------------+---+----------+------+
|EmployeeID|         Name|Age|Department|Salary|
+----------+-------------+---+----------+------+
|    E00001|   Jane Davis| 34|        HR| 77511|
|    E00002|Jessica Brown| 53|     Sales| 52623|
|    E00003|   Emma Moore| 42| Marketing| 87842|
|    E00004|  Emily Jones| 61|   Finance| 76855|
|    E00005|Michael Davis| 51|        HR|134142|
+----------+-------------+---+----------+------+
only showing top 5 rows



In [15]:
# Replace the string-typed 'Salary' column with its integer-cast version
salary_as_int = col("Salary").cast("integer")
df = df.withColumn("Salary", salary_as_int)

# Aggregate over the whole DataFrame; the single result row holds the mean
average_row = df.agg(avg("Salary")).first()
average_salary = average_row[0]

print(f"The average salary is: ${average_salary:.2f}")

The average salary is: $90213.05


In [17]:
# Same average, computed with the RDD API via reduce.
# 'Salary' must already be an integer column: the cast in the previous cell
# has to be run before this one.
salaries_rdd = df.select("Salary").rdd \
    .filter(lambda row: row[0] is not None) \
    .map(lambda row: row[0])
# Pair every salary with a count of 1 so that a single reduce yields both the
# sum and the number of employees. Separate reduce() and count() actions would
# each recompute the uncached RDD from the source file.
total_salary, total_employees = salaries_rdd \
    .map(lambda salary: (salary, 1)) \
    .reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))
# Calculate the average salary
average_salary = total_salary / total_employees

print(f"The average salary is: ${average_salary:.2f}")

The average salary is: $90213.05


In [19]:
# Stop the session used above; the following cells create their own SparkContext
spark.stop()

In [21]:
from pyspark import SparkConf, SparkContext

In [23]:
# Application name for the RDD-only part of the notebook
conf = SparkConf().setAppName("Average Salary Calculation using RDD")
# getOrCreate reuses a live context instead of raising
# "Cannot run multiple SparkContexts at once" when this cell is re-run
# or when the earlier session was not stopped.
sc = SparkContext.getOrCreate(conf=conf)

In [25]:
# Same input file as in the DataFrame section, this time loaded as an RDD of text lines
file_path = "employee_data.txt"
lines = sc.textFile(name=file_path)

In [27]:
# Print a heading followed by the first five raw lines, one per output line
print("\nSample Data:", *lines.take(5), sep="\n")


Sample Data:
EmployeeID	Name	Age	Department	Salary
E00001	Jane Davis	34	HR	77511
E00002	Jessica Brown	53	Sales	52623
E00003	Emma Moore	42	Marketing	87842
E00004	Emily Jones	61	Finance	76855


In [29]:
# The first line of the file is treated as the header row
header = lines.first()
# Keep only the lines that differ from the header
data = lines.filter(lambda text: text != header)
# Break each remaining line into its tab-separated fields
columns = data.map(lambda text: text.split("\t"))

In [31]:
# The salary is the fifth tab-separated field (index 4); parse it as an int
salaries = columns.map(lambda fields: int(fields[4]))

In [33]:
# Pair each salary with a count of 1 so that one reduce produces both the
# total salary and the number of employees. Separate reduce() and count()
# actions would each re-read and re-parse the file, because the RDD is not cached.
total_salary, total_employees = salaries \
    .map(lambda salary: (salary, 1)) \
    .reduce(lambda x, y: (x[0] + y[0], x[1] + y[1]))

# Calculate the average salary
average_salary = total_salary / total_employees

print(f"\nThe average salary is: ${average_salary:.2f}")


The average salary is: $90213.05
