In [1]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m4.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=15b3e08f28e45c478f638999e7f202f762b50e4cb60aeec692c80d29f238faf6
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [5]:
import pyspark
from pyspark.sql import SparkSession
import pandas as pd

# Start (or attach to) the Spark session used by this cell
spark = (
    SparkSession.builder
    .appName("MySparkApp")
    .getOrCreate()
)

# Make sure the working database exists, then switch the session over to it
for statement in (
    "CREATE DATABASE IF NOT EXISTS my_database",
    "USE my_database",
):
    spark.sql(statement)

# List every database in the catalog so my_database can be seen among them
databases = spark.sql("SHOW DATABASES")
databases.show()

# Sample employee records, stored column-wise (one list per CSV column)
data = dict(
    name=["John", "Jane", "Mike", "Emily", "Alex"],
    age=[28, 32, 45, 23, 36],
    gender=["Male", "Female", "Male", "Female", "Male"],
    salary=[60000, 72000, 84000, 52000, 67000],
)

# Destination of the generated CSV file
csv_file_path = "/content/sample_people.csv"

# Build the table and write it out without the pandas index column
df = pd.DataFrame(data)
df.to_csv(csv_file_path, index=False)

# Report where the file was written
print(f"CSV file created at: {csv_file_path}")



+-----------+
|  namespace|
+-----------+
|    default|
|my_database|
+-----------+

CSV file created at: /content/sample_people.csv


In [7]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, when, avg, round

# Initialize Spark Session
spark = SparkSession.builder.appName("EmployeeSalaryETL").getOrCreate()

# Extract: load the employee CSV written earlier in this notebook
# (columns: name, age, gender, salary). inferSchema makes age/salary numeric.
csv_file_path = "/content/sample_people.csv"
df = spark.read.csv(csv_file_path, header=True, inferSchema=True)

# Transform
# 1. Filter: include only employees aged 30 and above
df_filtered = df.filter(col("age") >= 30)

# 2. Add new column: salary plus a 10% bonus.
#    `round` here is pyspark.sql.functions.round (imported in this cell), not
#    the Python builtin. Rounding to 2 decimals keeps binary floating-point
#    noise (e.g. 92400.00000000001) out of the stored values.
df_with_bonus = df_filtered.withColumn(
    "salary_with_bonus", round(col("salary") * 1.1, 2)
)

# 3. Aggregation: average salary per gender among the filtered employees.
#    agg() accepts only Column expressions, so the scale (2) is passed to
#    round() and the alias is applied to the aggregated Column itself.
avg_salary_by_gender = df_filtered.groupBy("gender").agg(
    round(avg("salary"), 2).alias("avg_salary")
)

# Load: save the transformed data to Parquet, replacing any previous run's output
output_path = "/content/employee_data.parquet"
df_with_bonus.write.mode("overwrite").parquet(output_path)

# Display results
print("Employees aged 30 and above with bonus:")
df_with_bonus.show()

print("\nAverage salary by gender:")
avg_salary_by_gender.show()


Employees aged 30 and above with bonus:
+----+---+------+------+-----------------+
|name|age|gender|salary|salary_with_bonus|
+----+---+------+------+-----------------+
|Jane| 32|Female| 72000|          79200.0|
|Mike| 45|  Male| 84000|          92400.0|
|Alex| 36|  Male| 67000|          73700.0|
+----+---+------+------+-----------------+


Average salary by gender:
+------+----------+
|gender|avg_salary|
+------+----------+
|Female|   72000.0|
|  Male|   75500.0|
+------+----------+



In [2]:

### **Exercise: PySpark Data Transformations on Movie Data**

#### **Objective:**
# You have a dataset containing movie details. The goal is to use PySpark to apply data transformations to derive insights.


### **Tasks**:

from pyspark.sql import SparkSession
from pyspark.sql.functions import col, year, avg, round

# Get (or create) the Spark session for the movie analysis
spark = (
    SparkSession.builder
    .appName("MovieDataAnalysis")
    .getOrCreate()
)


def _print_and_show(heading, frame):
    """Print `heading` on its own line, then display `frame` via `DataFrame.show()`."""
    print(heading)
    frame.show()


# 1. Load the Dataset (first row is the header; column types are inferred)
df = spark.read.csv("movie_data.csv", header=True, inferSchema=True)

# 2. Filter Movies by Genre
is_sci_fi = col("genre") == "Sci-Fi"
sci_fi_movies = df.filter(is_sci_fi)
_print_and_show("Sci-Fi Movies:", sci_fi_movies)

# 3. Top-Rated Movies: highest rating first, keep three rows
rating_descending = col("rating").desc()
top_rated_movies = df.orderBy(rating_descending).limit(3)
_print_and_show("Top 3 Highest-Rated Movies:", top_rated_movies)

# 4. Movies Released After 2010 (release year strictly greater than 2010)
release_year = year(col("date"))
movies_after_2010 = df.filter(release_year > 2010)
_print_and_show("Movies Released After 2010:", movies_after_2010)

# 5. Calculate Average Box Office Collection by Genre
mean_box_office = avg("box_office").alias("avg_box_office")
avg_box_office_by_genre = df.groupBy("genre").agg(mean_box_office)
_print_and_show("Average Box Office Collection by Genre:", avg_box_office_by_genre)

# 6. Add a New Column for Box Office in Billions (rounded to 2 decimals)
box_office_in_billions = round(col("box_office") / 10 ** 9, 2)
df_with_billions = df.withColumn("box_office_billions", box_office_in_billions)
_print_and_show("Movies with Box Office in Billions:", df_with_billions)

# 7. Sort Movies by Box Office Collection, largest first
box_office_descending = col("box_office").desc()
sorted_movies = df.orderBy(box_office_descending)
_print_and_show("Movies Sorted by Box Office Collection:", sorted_movies)

# 8. Count the Number of Movies per Genre
movies_per_genre = df.groupBy("genre").count()
_print_and_show("Number of Movies per Genre:", movies_per_genre)

# Stop the SparkSession
spark.stop()

Sci-Fi Movies:
+--------+------------+------+------+----------+----------+
|movie_id|       title| genre|rating|box_office|      date|
+--------+------------+------+------+----------+----------+
|       1|   Inception|Sci-Fi|   8.8| 830000000|2010-07-16|
|       3|Interstellar|Sci-Fi|   8.6| 677000000|2014-11-07|
+--------+------------+------+------+----------+----------+

Top 3 Highest-Rated Movies:
+--------+---------------+------+------+----------+----------+
|movie_id|          title| genre|rating|box_office|      date|
+--------+---------------+------+------+----------+----------+
|       2|The Dark Knight|Action|   9.0|1004000000|2008-07-18|
|       1|      Inception|Sci-Fi|   8.8| 830000000|2010-07-16|
|       3|   Interstellar|Sci-Fi|   8.6| 677000000|2014-11-07|
+--------+---------------+------+------+----------+----------+

Movies Released After 2010:
+--------+-----------------+---------+------+----------+----------+
|movie_id|            title|    genre|rating|box_office|  