In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=248c3d3e122b8ea5998df8e165f7bfb5ac0ca1558ba421c2d96160a80ea7c494
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession

from pyspark.sql.functions import col

# Initialize SparkSession

spark = SparkSession.builder \
.appName("PySpark DataFrame Example") \
.getOrCreate()

# Sample data representing employees

data = [

    ("John Doe", "Engineering", 75000),

    ("Jane Smith", "Marketing", 60000),

    ("Sam Brown", "Engineering", 80000),

    ("Emily Davis", "HR", 50000),

    ("Michael Johnson", "Marketing", 70000),

]

# Define schema for DataFrame

columns = ["Name", "Department", "Salary"]

# Create DataFrame

df = spark.createDataFrame(data, schema=columns)

# Show the DataFrame

df.show()



+---------------+-----------+------+
|           Name| Department|Salary|
+---------------+-----------+------+
|       John Doe|Engineering| 75000|
|     Jane Smith|  Marketing| 60000|
|      Sam Brown|Engineering| 80000|
|    Emily Davis|         HR| 50000|
|Michael Johnson|  Marketing| 70000|
+---------------+-----------+------+



In [None]:
# Filter: Select employees with a salary greater than 65,000
high_salary_df = df.filter(col("Salary") > 65000)
print("Employees with Salary > 65,000:")
# Show the filtered DataFrame
high_salary_df.show()

Employees with Salary > 65,000:
+---------------+-----------+------+
|           Name| Department|Salary|
+---------------+-----------+------+
|       John Doe|Engineering| 75000|
|      Sam Brown|Engineering| 80000|
|Michael Johnson|  Marketing| 70000|
+---------------+-----------+------+



In [None]:
# Group by Department and Calculate the average salary
avg_salary_by_df = df.groupBy("Department").avg("Salary")
print("Average Salary by Department:")
# Show the average salary by department
avg_salary_by_df.show()

Average Salary by Department:
+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|Engineering|    77500.0|
|  Marketing|    65000.0|
|         HR|    50000.0|
+-----------+-----------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Customer Transaction Analysis") \
    .getOrCreate()

# Sample data for customers
customers = [
    (1, "Ravi", "Mumbai"),
    (2, "Priya", "Delhi"),
    (3, "Vijay", "Bangalore"),
    (4, "Anita", "Chennai"),
    (5, "Raj", "Hyderabad"),
]

# Sample data for transactions
transactions = [
    (1, 1, 10000.50),
    (2, 2, 20000.75),
    (3, 1, 15000.25),
    (4, 3, 30000.00),
    (5, 2, 40000.50),
    (6, 4, 25000.00),
    (7, 5, 18000.75),
    (8, 1, 5000.00),
]

# Define schema for DataFrames
customer_columns = ["customer_id", "Name", "city"]
transaction_columns = ["Transaction_id", "customer_id", "Amount"]  # Changed CustomerId to customer_id

# Create DataFrames
customer_df = spark.createDataFrame(customers, schema=customer_columns)
transactions_df = spark.createDataFrame(transactions, schema=transaction_columns)

# Show the DataFrames
print("Customers DataFrame:")
customer_df.show()

print("Transactions DataFrame:")
transactions_df.show()

Customers DataFrame:
+-----------+-----+---------+
|customer_id| Name|     city|
+-----------+-----+---------+
|          1| Ravi|   Mumbai|
|          2|Priya|    Delhi|
|          3|Vijay|Bangalore|
|          4|Anita|  Chennai|
|          5|  Raj|Hyderabad|
+-----------+-----+---------+

Transactions DataFrame:
+--------------+-----------+--------+
|Transaction_id|customer_id|  Amount|
+--------------+-----------+--------+
|             1|          1| 10000.5|
|             2|          2|20000.75|
|             3|          1|15000.25|
|             4|          3| 30000.0|
|             5|          2| 40000.5|
|             6|          4| 25000.0|
|             7|          5|18000.75|
|             8|          1|  5000.0|
+--------------+-----------+--------+



In [None]:
# Join the DataFrames on customer_id
customer_transactions_df = customer_df.join(transactions_df, on="customer_id")
print("Customer Transactions DataFrame:")
customer_transactions_df.show()

# Calculate the total amount spent by each customer
total_spent_df = customer_transactions_df.groupBy("Name").sum("Amount").withColumnRenamed("sum(Amount)", "TotalSpent")
print("Total Amount Spent by Each Customer:")
total_spent_df.show()

# Find customers who have spent more than 30000
big_spender_df = total_spent_df.filter(col("TotalSpent") > 30000)
print("Customers who have spent more than 30000:")
big_spender_df.show()

# Count the number of transactions per customer
transaction_count_df = customer_transactions_df.groupBy("Name").count().withColumnRenamed("count", "TransactionCount")
print("Number of Transactions per Customer:")
transaction_count_df.show()

# Sort customers by total amount spent in descending order
sorted_spenders_df = total_spent_df.orderBy(col("TotalSpent").desc())
print("Customers Sorted by Total Amount Spent:")
sorted_spenders_df.show()

Customer Transactions DataFrame:
+-----------+-----+---------+--------------+--------+
|customer_id| Name|     city|Transaction_id|  Amount|
+-----------+-----+---------+--------------+--------+
|          1| Ravi|   Mumbai|             1| 10000.5|
|          1| Ravi|   Mumbai|             3|15000.25|
|          1| Ravi|   Mumbai|             8|  5000.0|
|          2|Priya|    Delhi|             2|20000.75|
|          2|Priya|    Delhi|             5| 40000.5|
|          3|Vijay|Bangalore|             4| 30000.0|
|          4|Anita|  Chennai|             6| 25000.0|
|          5|  Raj|Hyderabad|             7|18000.75|
+-----------+-----+---------+--------------+--------+

Total Amount Spent by Each Customer:
+-----+----------+
| Name|TotalSpent|
+-----+----------+
| Ravi|  30000.75|
|Priya|  60001.25|
|Vijay|   30000.0|
|Anita|   25000.0|
|  Raj|  18000.75|
+-----+----------+

Customers who have spent more than 30000:
+-----+----------+
| Name|TotalSpent|
+-----+----------+
| Ravi|  3

In [None]:
# ### **Exercise: Product Sales Analysis**

# #### **Step 1: Create DataFrames**

# You will create two DataFrames: one for products and another for sales transactions. Then, you’ll perform operations like joining these DataFrames and analyzing the data.

# ```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Product Sales Analysis") \
    .getOrCreate()

# Sample data for products
products = [
    (1, "Laptop", "Electronics", 50000),
    (2, "Smartphone", "Electronics", 30000),
    (3, "Table", "Furniture", 15000),
    (4, "Chair", "Furniture", 5000),
    (5, "Headphones", "Electronics", 2000),
]

# Sample data for sales transactions
sales = [
    (1, 1, 2),
    (2, 2, 1),
    (3, 3, 3),
    (4, 1, 1),
    (5, 4, 5),
    (6, 2, 2),
    (7, 5, 10),
    (8, 3, 1),
]

# Define schema for DataFrames
product_columns = ["ProductID", "ProductName", "Category", "Price"]
sales_columns = ["SaleID", "ProductID", "Quantity"]

# Create DataFrames
product_df = spark.createDataFrame(products, schema=product_columns)
sales_df = spark.createDataFrame(sales, schema=sales_columns)

# Show the DataFrames
print("Products DataFrame:")
product_df.show()

print("Sales DataFrame:")
sales_df.show()
# ```

# #### **Step 2: Perform the Following Tasks**

# 1. **Join the DataFrames:**
#    - Join the `product_df` and `sales_df` DataFrames on `ProductID` to create a combined DataFrame with product and sales data.
product_and_sales_df = product_df.join(sales_df, on="ProductID")
print("Product and Sales DataFrame:")
product_and_sales_df.show()

# 2. **Calculate Total Sales Value:**
#    - For each product, calculate the total sales value by multiplying the price by the quantity sold.
total_sales_df = product_and_sales_df.withColumn("TotalSales", col("Price") * col("Quantity"))
print("Total Sales DataFrame:")
total_sales_df.show()

# 3. **Find the Total Sales for Each Product Category:**
#    - Group the data by the `Category` column and calculate the total sales value for each product category.
total_sales_by_category_df = total_sales_df.groupBy("Category").sum("TotalSales").withColumnRenamed("sum(TotalSales)", "TotalSalesCategory")
print("Total Sales by Product Category:")
total_sales_by_category_df.show()

# 4. **Identify the Top-Selling Product:**
#    - Find the product that generated the highest total sales value.
top_selling_product_df = total_sales_df.groupBy("ProductName").sum("TotalSales").withColumnRenamed("sum(TotalSales)", "TotalSalesProduct")
top_selling_product_df

# 5. **Sort the Products by Total Sales Value:**
#    - Sort the products by total sales value in descending order.
sorted_products_df = top_selling_product_df.orderBy(col("TotalSalesProduct").desc())
print("Sorted Products by Total Sales Value:")
sorted_products_df.show()

# 6. **Count the Number of Sales for Each Product:**
#    - Count the number of sales transactions for each product.
sales_count_df = total_sales_df.groupBy("ProductName").count().withColumnRenamed("count", "SalesCount")
print("Number of Sales for Each Product:")
sales_count_df.show()

# 7. **Filter the Products with Total Sales Value Greater Than ₹50,000:**
#    - Filter out the products that have a total sales value greater than ₹50,000.
high_value_products_df = total_sales_df.filter(col("TotalSales") > 50000)
print("Products with Total Sales Value Greater Than ₹50,000:")
high_value_products_df.show()

Products DataFrame:
+---------+-----------+-----------+-----+
|ProductID|ProductName|   Category|Price|
+---------+-----------+-----------+-----+
|        1|     Laptop|Electronics|50000|
|        2| Smartphone|Electronics|30000|
|        3|      Table|  Furniture|15000|
|        4|      Chair|  Furniture| 5000|
|        5| Headphones|Electronics| 2000|
+---------+-----------+-----------+-----+

Sales DataFrame:
+------+---------+--------+
|SaleID|ProductID|Quantity|
+------+---------+--------+
|     1|        1|       2|
|     2|        2|       1|
|     3|        3|       3|
|     4|        1|       1|
|     5|        4|       5|
|     6|        2|       2|
|     7|        5|      10|
|     8|        3|       1|
+------+---------+--------+

Product and Sales DataFrame:
+---------+-----------+-----------+-----+------+--------+
|ProductID|ProductName|   Category|Price|SaleID|Quantity|
+---------+-----------+-----------+-----+------+--------+
|        1|     Laptop|Electronics|50000|   

In [None]:
from pyspark.sql import SparkSession

# Initialize SparkSession

spark = SparkSession.builder \
  .appName("RDD Transformation Example") \
    .getOrCreate()

#Get the SparkContext from the SparkSession
sc = spark.sparkContext
print("Spark Session Created")

data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
rdd = sc.parallelize(data)


# Print the original RDD
print("Original RDD:", rdd.collect())

Spark Session Created
Original RDD: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [None]:
rdd2 = rdd.map(lambda x: x* 2)

# Print the transformed RDD

print("RDD after map transformation (x2):", rdd2.collect())


rdd3 = rdd2.filter(lambda x: x % 2 == 0)

# Print the filtered RDD

print("RDD after filter transformation (even numbers):", rdd3.collect())



sentences = ["Hello world", "PySpark is great" "RDD transformations"]

rdd4 = sc.parallelize (sentences)

words_rdd = rdd4.flatMap(lambda sentence: sentence.split(" "))

#Print the flatMapped RDD

print("RDD after flatMap transformation (split into words):", words_rdd.collect())

RDD after map transformation (x2): [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
RDD after filter transformation (even numbers): [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
RDD after flatMap transformation (split into words): ['Hello', 'world', 'PySpark', 'is', 'greatRDD', 'transformations']


In [None]:
results = rdd3.collect()

print(results)


count = rdd3.count()

print(f"Number of elements: {count}")


total_sum =rdd.reduce(lambda x, y: x + y)
print("Total sum: (total_sum)")

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
Number of elements: 10
Total sum: (total_sum)


In [None]:
import pandas as pd

# Creating a DataFrame from a dictionary
data = {'Name': ['John', 'Emma', 'Alex'],
        'Age': [28, 32, 25],
        'City': ['New York', 'London', 'Paris']}
df = pd.DataFrame(data)

# Creating a DataFrame from a list of lists
data = [['John', 28, 'New York'],
        ['Emma', 32, 'London'],
        ['Alex', 25, 'Paris']]
df = pd.DataFrame(data, columns=['Name', 'Age', 'City'])

In [None]:
# Display the first few rows
print(df.head())

# Get basic information about the DataFrame
print(df.info())

# Get statistical summary of numerical columns
print(df.describe())

   Name  Age      City
0  John   28  New York
1  Emma   32    London
2  Alex   25     Paris
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes
None
             Age
count   3.000000
mean   28.333333
std     3.511885
min    25.000000
25%    26.500000
50%    28.000000
75%    30.000000
max    32.000000


In [None]:
# Select a single column
ages = df['Age']
# display the column
print(ages)

# Select multiple columns
subset = df[['Name', 'City']]
print(subset)
# Select rows based on index
first_two = df.loc[0:1]
print(first_two)

0    28
1    32
2    25
Name: Age, dtype: int64
   Name      City
0  John  New York
1  Emma    London
2  Alex     Paris
   Name  Age      City
0  John   28  New York
1  Emma   32    London


In [None]:
# Rename columns
df = df.rename(columns={'Name': 'Full Name', 'City': 'Location'})
print(df)

In [None]:
# Filter rows based on a condition
young_people = df[df['Age'] < 30]
print(young_people)
# Multiple conditions
young_new_yorkers = df[(df['Age'] < 30) & (df['City'] == 'New York')]
print(young_new_yorkers)

   Name  Age      City
0  John   28  New York
2  Alex   25     Paris
   Name  Age      City
0  John   28  New York


Sep 04

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize a Spark session
spark = SparkSession.builder \
    .appName("Employee Data Analysis") \
    .getOrCreate()

# Sample employee data
data = [
    (1, 'Arjun', 'IT', 75000),
    (2, 'Vijay', 'Finance', 85000),
    (3, 'Shalini', 'IT', 90000),
    (4, 'Sneha', 'HR', 50000),
    (5, 'Rahul', 'Finance', 60000),
    (6, 'Amit', 'IT', 55000)
]

# Define schema (columns)
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

# Create DataFrame
employee_df = spark.createDataFrame(data, columns)

# Show the DataFrame
employee_df.show()

# Filter employees who have a salary greater than 60,000
high_salary_df = employee_df.filter(col('Salary') > 60000)

# Calculate the average salary by department
avg_salary_df = employee_df.groupBy("Department").avg("Salary")
avg_salary_df.show()

# Sort employees in descending order of salary
sorted_df = employee_df.orderBy(col("Salary").desc())
sorted_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Shalini|        IT| 90000|
|         4|       Sneha|        HR| 50000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+-----------------+
|Department|      avg(Salary)|
+----------+-----------------+
|   Finance|          72500.0|
|        IT|73333.33333333333|
|        HR|          50000.0|
+----------+-----------------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         3|     Shalini|        IT| 90000|
|         2|       Vijay|   Finance| 85000|
|         1|       Arjun|        IT| 75000|
|         5|       Rahul|   Finance| 60000|
|         6|        Amit|        

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize Spark session
spark = SparkSession.builder \
    .appName("Employee Data Handling") \
    .getOrCreate()

# Sample employee data with null values
data = [
    (1, 'Arjun', 'IT', 75000),
    (2, 'Vijay', 'Finance', 85000),
    (3, None, 'IT', 90000),
    (4, 'Sneha', 'HR', None),
    (5, 'Rahul', None, 60000),
    (6, 'Amit', 'IT', 55000)
]

# Define schema (columns)
columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary']

# Create DataFrame
employee_df = spark.createDataFrame(data, columns)

# Show the DataFrame
employee_df.show()

# Fill null values in 'EmployeeName' and 'Department' with 'Unknown'
filled_df = employee_df.fillna({'EmployeeName': 'Unknown', 'Department': 'Unknown'})
filled_df.show()

# Drop rows where 'Salary' is null
dropped_null_salary_df = employee_df.dropna(subset=['Salary'])
dropped_null_salary_df.show()

# Fill null values in 'Salary' with 50000
salary_filled_df = employee_df.fillna({'Salary': 50000})
salary_filled_df.show()

# Check for null values in the entire DataFrame
null_counts = employee_df.select([col(c).isNull().alias(c) for c in employee_df.columns]).show()

# Replace all null values in the DataFrame with 'N/A'
na_filled_df = employee_df.na.fill('N/A')
na_filled_df.show()

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|        NULL|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|      NULL| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+----------+------+
|         1|       Arjun|        IT| 75000|
|         2|       Vijay|   Finance| 85000|
|         3|     Unknown|        IT| 90000|
|         4|       Sneha|        HR|  NULL|
|         5|       Rahul|   Unknown| 60000|
|         6|        Amit|        IT| 55000|
+----------+------------+----------+------+

+----------+------------+----------+------+
|EmployeeID|EmployeeName|Department|Salary|
+----------+------------+-----

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.window import Window
from pyspark.sql.functions import rank, col, sum

# Initialize a Spark Session
spark = SparkSession.builder\
    .appName("Advanced Dataframe Operations")\
    .getOrCreate()

# Sample data
data1 = [
    (1, 'Arjun', 'IT', 75000, '2022-01-15'),
    (2, 'Vijay', 'Finance', 85000, '2022-03-12'),
    (3, 'Shalini', 'IT', 90000, '2021-06-30')
]

data2 = [
    (4, 'Sneha', 'HR', 50000, '2022-05-01'),
    (5, 'Rahul', 'Finance', 60000, '2022-08-20'),
    (6, 'Amit', 'IT', 55000, '2021-12-15')
]

columns = ['EmployeeID', 'EmployeeName', 'Department', 'Salary', 'JoiningDate']

# Create DataFrames
employee_df1 = spark.createDataFrame(data=data1, schema=columns)
employee_df2 = spark.createDataFrame(data=data2, schema=columns)

employee_df1.show()
employee_df2.show()

# Union two DataFrames (removes duplicates)
union_df = employee_df1.union(employee_df2).dropDuplicates()
print("Union of two dataframes (Remove duplicates): ")
union_df.show()

# Union of two DataFrames (includes everything)
union_all_df = employee_df1.union(employee_df2)
print("Union of two dataframes: ")
union_all_df.show()

# Rank employees by salary within each department
window_spec = Window.partitionBy("Department").orderBy(col("Salary").desc())
ranked_df = union_all_df.withColumn("Rank", rank().over(window_spec))
ranked_df.show()

# Convert 'JoiningDate' from string to date type
date_converted_df = union_all_df.withColumn("JoiningDate", F.to_date(col("JoiningDate"), "yyyy-MM-dd"))
date_converted_df.show()

+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         1|       Arjun|        IT| 75000| 2022-01-15|
|         2|       Vijay|   Finance| 85000| 2022-03-12|
|         3|     Shalini|        IT| 90000| 2021-06-30|
+----------+------------+----------+------+-----------+

+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+
|         4|       Sneha|        HR| 50000| 2022-05-01|
|         5|       Rahul|   Finance| 60000| 2022-08-20|
|         6|        Amit|        IT| 55000| 2021-12-15|
+----------+------------+----------+------+-----------+

Union of two dataframes (Remove duplicates): 
+----------+------------+----------+------+-----------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|
+----------+------------+----------+------+-----------+


In [None]:
from pyspark.sql.window import Window
from pyspark.sql.functions import rank

# Define a window specification for cumulative sum of salaries within each department
window_spec_sum = Window.partitionBy("Department").orderBy("JoiningDate").rowsBetween(Window.unboundedPreceding, Window.currentRow)

# Calculate the running total of salaries
running_total_df = date_converted_df.withColumn("RunningTotal", sum(col("Salary")).over(window_spec_sum))
running_total_df.show()

# Calculate the number of years since joining
experience_df = running_total_df.withColumn("YearsOfExperience", F.round(F.datediff(F.current_date(), col("JoiningDate")) / 365, 2))
experience_df.show()

# Add a new column for next evaluation date (one year after joining)
eval_date_df = experience_df.withColumn("NextEvaluationDate", F.date_add(col("JoiningDate"), 365))
eval_date_df.show()

# Calculate average salary per department
avg_salary_df = union_all_df.groupBy("Department").agg(F.avg("Salary").alias("AverageSalary"))
avg_salary_df.show()

# Calculate the total number of employees
total_employees_df = union_all_df.agg(F.count("EmployeeID").alias("TotalEmployees"))
total_employees_df.show()

# Convert employee names to uppercase
uppercase_name_df = union_all_df.withColumn("EmployeeNameUpper", F.upper(col("EmployeeName")))
uppercase_name_df.show()


+----------+------------+----------+------+-----------+------------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|RunningTotal|
+----------+------------+----------+------+-----------+------------+
|         2|       Vijay|   Finance| 85000| 2022-03-12|       85000|
|         5|       Rahul|   Finance| 60000| 2022-08-20|      145000|
|         4|       Sneha|        HR| 50000| 2022-05-01|       50000|
|         3|     Shalini|        IT| 90000| 2021-06-30|       90000|
|         6|        Amit|        IT| 55000| 2021-12-15|      145000|
|         1|       Arjun|        IT| 75000| 2022-01-15|      220000|
+----------+------------+----------+------+-----------+------------+

+----------+------------+----------+------+-----------+------------+-----------------+
|EmployeeID|EmployeeName|Department|Salary|JoiningDate|RunningTotal|YearsOfExperience|
+----------+------------+----------+------+-----------+------------+-----------------+
|         2|       Vijay|   Finance| 85000| 2022

Sep 05

In [None]:
from pyspark.sql import SparkSession
spark = SparkSession.builder \
    .appName("DataIngesttion") \
    .getOrCreate()

csv_file_path = "/content/people.csv"
df_csv = spark.read.format("csv").option("header", "true").load(csv_file_path)
df_csv.show()

+----+----+-------+
|Name| Age| Gender|
+----+----+-------+
|John|  28|   Male|
|Jane|  32| Female|
+----+----+-------+



In [None]:
from pyspark.sql.types import StructType, StructField, StringType, IntegerType

# Define the schema for the JSON file (Note only after defining the schema it works fine on colab)
schema = StructType([
    StructField("name", StringType(), True),
    StructField("age", IntegerType(), True),
    StructField("gender", StringType(), True),
    StructField("address", StructType([
        StructField("street", StringType(), True),
        StructField("city", StringType(), True)
    ]), True)
])


# Load the complex JSON file with the correct path
json_file_path = "/content/Sample.json"

# Read the JSON file with schema
df_json_complex = spark.read.schema(schema).json(json_file_path)

# Read the file as text to inspect its contents
with open(json_file_path, 'r') as f:
    data = f.read()
    print(data)

[
  {
    "name": "John",
    "age": 28,
    "gender": "Male",
    "address": {
      "street": "123 Main St",
      "city": "New York"
    }
  },
  {
    "name": "Jane",
    "age": 32,
    "gender": "Female",
    "address": {
      "street": "456 Elm St",
      "city": "San Francisco"
    }
  }
]


In [None]:
import pandas as pd
from pyspark.sql import SparkSession

# Create a sample DataFrame
data = {
    "name": ["John", "Jane", "Mike", "Emily"],
    "age": [28, 32, 45, 23],
    "gender": ["Male", "Female", "Male", "Female"],
    "city": ["New York", "San Francisco", "Los Angeles", "Chicago"]
}
df = pd.DataFrame(data)

# Save the DataFrame to a CSV file in the Colab environment
csv_file_path = "/content/sample_people.csv"
df.to_csv(csv_file_path, index=False)

# Confirm the file has been created
print(f"CSV file created at: {csv_file_path}")

# Initialize Spark Session
spark = SparkSession.builder.appName("CreateViewExample").getOrCreate()

# Load the CSV file into a PySpark DataFrame
df_people = spark.read.format("csv").option("header", "true").option("inferSchema", "true").load(csv_file_path)

# Show the DataFrame
df_people.show()



CSV file created at: /content/sample_people.csv
+-----+---+------+-------------+
| name|age|gender|         city|
+-----+---+------+-------------+
| John| 28|  Male|     New York|
| Jane| 32|Female|San Francisco|
| Mike| 45|  Male|  Los Angeles|
|Emily| 23|Female|      Chicago|
+-----+---+------+-------------+

+----+---+------+-------------+
|name|age|gender|         city|
+----+---+------+-------------+
|Jane| 32|Female|San Francisco|
|Mike| 45|  Male|  Los Angeles|
+----+---+------+-------------+

+-----+---+--------+
| name|age|    city|
+-----+---+--------+
| John| 28|New York|
|Emily| 23| Chicago|
+-----+---+--------+



True

In [None]:
# Create a temporary view
df_people.createOrReplaceTempView("people_temp_view")

# Run an SQL query on the view
result_temp_view = spark.sql("SELECT name, age, gender, city FROM people_temp_view WHERE age > 30")

# Show the result
result_temp_view.show()



+----+---+------+-------------+
|name|age|gender|         city|
+----+---+------+-------------+
|Jane| 32|Female|San Francisco|
|Mike| 45|  Male|  Los Angeles|
+----+---+------+-------------+



In [None]:
# Create a global temporary view
df_people.createOrReplaceGlobalTempView("people_global_view")

# Query the global temporary view
result_global_view = spark.sql("SELECT name, age, city FROM global_temp.people_global_view WHERE age < 30")

# Show the result
result_global_view.show()



+-----+---+--------+
| name|age|    city|
+-----+---+--------+
| John| 28|New York|
|Emily| 23| Chicago|
+-----+---+--------+



In [None]:
# List all temporary views and tables
spark.catalog.listTables()



[Table(name='people_temp_view', catalog=None, namespace=[], description=None, tableType='TEMPORARY', isTemporary=True)]

In [None]:
# Drop the local temporary view
spark.catalog.dropTempView("people_temp_view")

# Drop the global temporary view
spark.catalog.dropGlobalTempView("people_global_view")

True

Afternoon

In [None]:
# Full refresh: Load the entire dataset
df_sales = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/content/sales_data.csv")

# Apply transformation (if necessary)
df_transformed = df_sales.withColumn("total_sales", df_sales["quantity"] * df_sales["price"])

# Full refresh: Partition the data by 'date' and overwrite the existing data
output_path = "/content/partitioned_data"
df_transformed.write.partitionBy("date").mode("overwrite").parquet(output_path)

# Verify partitioned data
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()

+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|       3|   30|2024-09-03 13:00:00|         90|2024-09-03|
|             3|        103|  Tablet|       1|  300|2024-09-02 10:00:00|        300|2024-09-02|
|             4|        104| Monitor|       2|  200|2024-09-02 11:00:00|        400|2024-09-02|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+



In [None]:
from pyspark.sql import functions as F

# Incremental load: Define the last ETL run timestamp (this should be tracked externally)
last_etl_run = "2024-01-01 00:00:00"

# Load only new or updated records since the last ETL run
df_incremental = spark.read.format("csv") \
    .option("header", "true") \
    .option("inferSchema", "true") \
    .load("/content/sales_data.csv") \
    .filter(F.col("updated_at") > last_etl_run)

# Apply transformations (if necessary)
df_transformed_incremental = df_incremental.withColumn("total_sales", df_incremental["quantity"] * df_incremental["price"])

# Incremental load: Append the new data to the existing partitioned dataset
output_path = "/content/partitioned_sales_data"
df_transformed_incremental.write.partitionBy("date").mode("append").parquet(output_path)

# Verify partitioned data after incremental load
partitioned_df = spark.read.parquet(output_path)
partitioned_df.show()

+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|transaction_id|customer_id| product|quantity|price|         updated_at|total_sales|      date|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+
|             1|        101|  Laptop|       1| 1000|2024-09-01 08:00:00|       1000|2024-09-01|
|             2|        102|   Phone|       2|  500|2024-09-01 09:00:00|       1000|2024-09-01|
|             5|        105|Keyboard|       1|   50|2024-09-03 12:00:00|         50|2024-09-03|
|             6|        106|   Mouse|       3|   30|2024-09-03 13:00:00|         90|2024-09-03|
|             3|        103|  Tablet|       1|  300|2024-09-02 10:00:00|        300|2024-09-02|
|             4|        104| Monitor|       2|  200|2024-09-02 11:00:00|        400|2024-09-02|
+--------------+-----------+--------+--------+-----+-------------------+-----------+----------+



In [None]:
# Install ipywidgets in Colab or Jupyter if needed
!pip install ipywidgets

Collecting jedi>=0.16 (from ipython>=4.0.0->ipywidgets)
  Using cached jedi-0.19.1-py2.py3-none-any.whl.metadata (22 kB)
Using cached jedi-0.19.1-py2.py3-none-any.whl (1.6 MB)
Installing collected packages: jedi
Successfully installed jedi-0.19.1


In [None]:
from pyspark.sql import SparkSession
import ipywidgets as widgets
from IPython.display import display

# Step 1: Initialize a Spark session
spark = SparkSession.builder.appName("PySpark with Widgets Example").getOrCreate()

# Step 2: Create a simple DataFrame
data = [
    ("John", 28, "Male", 60000),
    ("Jane", 32, "Female", 72000),
    ("Mike", 45, "Male", 84000),
    ("Emily", 23, "Female", 52000),
    ("Alex", 36, "Male", 67000)
]

df = spark.createDataFrame(data, ["name", "age", "gender", "salary"])

# Show the DataFrame
df.show()

+-----+---+------+------+
| name|age|gender|salary|
+-----+---+------+------+
| John| 28|  Male| 60000|
| Jane| 32|Female| 72000|
| Mike| 45|  Male| 84000|
|Emily| 23|Female| 52000|
| Alex| 36|  Male| 67000|
+-----+---+------+------+



In [None]:
# Step 3: Create widgets

# Dropdown widget to select column for filtering
column_dropdown = widgets.Dropdown(
    options=["age", "salary"],
    value="age",
    description="Filter By:",
)

# Slider widget to choose a value for filtering
slider = widgets.IntSlider(
    value=30,
    min=0,
    max=100,
    step=5,
    description="Threshold:",
    continuous_update=False
)

# Button to trigger filtering
button = widgets.Button(description="Apply Filter")

# Output area to show the results
output = widgets.Output()

# Display the widgets
display(column_dropdown, slider, button, output)

# Step 4: Define the function to apply filtering based on widget inputs
def apply_filter(b):
    column = column_dropdown.value
    threshold = slider.value

    # Clear previous output
    output.clear_output()

    # Filter the DataFrame based on widget values
    df_filtered = df.filter(df[column] > threshold)

    # Show the filtered DataFrame
    with output:
        print(f"Filtering by {column} > {threshold}")
        df_filtered.show()

# Step 5: Attach the function to the button click event
button.on_click(apply_filter)

Dropdown(description='Filter By:', options=('age', 'salary'), value='age')

IntSlider(value=30, continuous_update=False, description='Threshold:', step=5)

Button(description='Apply Filter', style=ButtonStyle())

Output()

In [None]:
import csv
import random
from datetime import datetime, timedelta

def generate_csv(filename, num_records=1000000):
    event_types = ["purchase", "view", "click"]
    start_time = datetime.now()

    with open(filename, 'w', newline='') as csvfile:
        writer = csv.writer(csvfile)
        writer.writerow(["event_time", "event_type", "user_id", "amount"])  # Header

        for i in range(num_records):
            event_time = start_time + timedelta(seconds=i)
            event_type = random.choice(event_types)
            user_id = f"user_{random.randint(1, 10000)}"
            amount = round(random.uniform(1, 100), 2)

            writer.writerow([event_time, event_type, user_id, amount])

            if i % 100000 == 0:
                print(f"Generated {i} records...")

    print(f"CSV file '{filename}' with {num_records} records has been generated.")

# Generate the CSV file
generate_csv("million_records.csv")

Generated 0 records...
Generated 100000 records...
Generated 200000 records...
Generated 300000 records...
Generated 400000 records...
Generated 500000 records...
Generated 600000 records...
Generated 700000 records...
Generated 800000 records...
Generated 900000 records...
CSV file 'million_records.csv' with 1000000 records has been generated.
