In [None]:
pip install pyspark

Collecting pyspark
  Downloading pyspark-3.5.2.tar.gz (317.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.3/317.3 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.2-py2.py3-none-any.whl size=317812365 sha256=b22b0a50c8b913231be7bbb6b91194bfe39fd8f83ad48ffd61f53f0d383e90e2
  Stored in directory: /root/.cache/pip/wheels/34/34/bd/03944534c44b677cd5859f248090daa9fb27b3c8f8e5f49574
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.2


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session used by the DataFrame examples below.
spark = (
    SparkSession.builder
    .appName("PySpark DataFrame Example")
    .getOrCreate()
)

# Employee records as (name, department, salary) tuples.
data = [
    ("John Doe", "Engineering", 75000),
    ("Jane Smith", "Marketing", 60000),
    ("Sam Brown", "Engineering", 80000),
    ("Emily Davis", "HR", 50000),
    ("Michael Johnson", "Marketing", 70000),
]

# Column names, listed in the same order as the tuple fields.
columns = ["Name", "Department", "Salary"]

# Build the DataFrame from the records and print it as a table.
df = spark.createDataFrame(data, columns)
df.show()



+---------------+-----------+------+
|           Name| Department|Salary|
+---------------+-----------+------+
|       John Doe|Engineering| 75000|
|     Jane Smith|  Marketing| 60000|
|      Sam Brown|Engineering| 80000|
|    Emily Davis|         HR| 50000|
|Michael Johnson|  Marketing| 70000|
+---------------+-----------+------+



In [None]:
# Keep only the employees whose salary is above 65,000.
salary_above_65k = col("Salary") > 65000
high_salary_df = df.where(salary_above_65k)

# Print a heading, then the filtered rows.
print("Employees with Salary > 65,000:")
high_salary_df.show()

Employees with Salary > 65,000:
+---------------+-----------+------+
|           Name| Department|Salary|
+---------------+-----------+------+
|       John Doe|Engineering| 75000|
|      Sam Brown|Engineering| 80000|
|Michael Johnson|  Marketing| 70000|
+---------------+-----------+------+



In [None]:
# Bucket the employees by department, then average the salaries in each bucket.
employees_by_department = df.groupBy("Department")
avg_salary_by_df = employees_by_department.avg("Salary")

# Print a heading, then one row per department.
print("Average Salary by Department:")
avg_salary_by_df.show()

Average Salary by Department:
+-----------+-----------+
| Department|avg(Salary)|
+-----------+-----------+
|Engineering|    77500.0|
|  Marketing|    65000.0|
|         HR|    50000.0|
+-----------+-----------+



In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Start (or reuse) the Spark session for the customer transaction example.
spark = (
    SparkSession.builder
    .appName("Customer Transaction Analysis")
    .getOrCreate()
)

# Customers as (customer_id, name, city) tuples.
customers = [
    (1, "Ravi", "Mumbai"),
    (2, "Priya", "Delhi"),
    (3, "Vijay", "Bangalore"),
    (4, "Anita", "Chennai"),
    (5, "Raj", "Hyderabad"),
]

# Transactions as (transaction_id, customer_id, amount) tuples.
transactions = [
    (1, 1, 10000.50),
    (2, 2, 20000.75),
    (3, 1, 15000.25),
    (4, 3, 30000.00),
    (5, 2, 40000.50),
    (6, 4, 25000.00),
    (7, 5, 18000.75),
    (8, 1, 5000.00),
]

# Column names for each DataFrame. Both share "customer_id" so the two
# DataFrames can be joined on that column.
customer_columns = ["customer_id", "Name", "city"]
transaction_columns = ["Transaction_id", "customer_id", "Amount"]

# Build the two DataFrames.
customer_df = spark.createDataFrame(customers, customer_columns)
transactions_df = spark.createDataFrame(transactions, transaction_columns)

# Print each DataFrame under its own heading.
print("Customers DataFrame:")
customer_df.show()

print("Transactions DataFrame:")
transactions_df.show()

Customers DataFrame:
+-----------+-----+---------+
|customer_id| Name|     city|
+-----------+-----+---------+
|          1| Ravi|   Mumbai|
|          2|Priya|    Delhi|
|          3|Vijay|Bangalore|
|          4|Anita|  Chennai|
|          5|  Raj|Hyderabad|
+-----------+-----+---------+

Transactions DataFrame:
+--------------+-----------+--------+
|Transaction_id|customer_id|  Amount|
+--------------+-----------+--------+
|             1|          1| 10000.5|
|             2|          2|20000.75|
|             3|          1|15000.25|
|             4|          3| 30000.0|
|             5|          2| 40000.5|
|             6|          4| 25000.0|
|             7|          5|18000.75|
|             8|          1|  5000.0|
+--------------+-----------+--------+



In [None]:
# Attach each transaction to its customer via the shared customer_id column.
customer_transactions_df = customer_df.join(transactions_df, "customer_id")
print("Customer Transactions DataFrame:")
customer_transactions_df.show()

# Total amount spent per customer (grouped by the Name column).
total_spent_df = (
    customer_transactions_df
    .groupBy("Name")
    .sum("Amount")
    .withColumnRenamed("sum(Amount)", "TotalSpent")
)
print("Total Amount Spent by Each Customer:")
total_spent_df.show()

# Customers whose total spend is strictly above 30000.
big_spender_df = total_spent_df.where(col("TotalSpent") > 30000)
print("Customers who have spent more than 30000:")
big_spender_df.show()

# Number of transactions per customer (grouped by the Name column).
transaction_count_df = (
    customer_transactions_df
    .groupBy("Name")
    .count()
    .withColumnRenamed("count", "TransactionCount")
)
print("Number of Transactions per Customer:")
transaction_count_df.show()

# Customers ordered from highest to lowest total spend.
sorted_spenders_df = total_spent_df.orderBy("TotalSpent", ascending=False)
print("Customers Sorted by Total Amount Spent:")
sorted_spenders_df.show()

Customer Transactions DataFrame:
+-----------+-----+---------+--------------+--------+
|customer_id| Name|     city|Transaction_id|  Amount|
+-----------+-----+---------+--------------+--------+
|          1| Ravi|   Mumbai|             1| 10000.5|
|          1| Ravi|   Mumbai|             3|15000.25|
|          1| Ravi|   Mumbai|             8|  5000.0|
|          2|Priya|    Delhi|             2|20000.75|
|          2|Priya|    Delhi|             5| 40000.5|
|          3|Vijay|Bangalore|             4| 30000.0|
|          4|Anita|  Chennai|             6| 25000.0|
|          5|  Raj|Hyderabad|             7|18000.75|
+-----------+-----+---------+--------------+--------+

Total Amount Spent by Each Customer:
+-----+----------+
| Name|TotalSpent|
+-----+----------+
| Ravi|  30000.75|
|Priya|  60001.25|
|Vijay|   30000.0|
|Anita|   25000.0|
|  Raj|  18000.75|
+-----+----------+

Customers who have spent more than 30000:
+-----+----------+
| Name|TotalSpent|
+-----+----------+
| Ravi|  3

In [None]:
# ### **Exercise: Product Sales Analysis**

# #### **Step 1: Create DataFrames**

# Create two DataFrames, one for products and one for sales transactions,
# then join them and analyze the combined data.

from pyspark.sql import SparkSession
from pyspark.sql.functions import col

# Initialize SparkSession
spark = SparkSession.builder \
    .appName("Product Sales Analysis") \
    .getOrCreate()

# Sample data for products: (ProductID, ProductName, Category, Price)
products = [
    (1, "Laptop", "Electronics", 50000),
    (2, "Smartphone", "Electronics", 30000),
    (3, "Table", "Furniture", 15000),
    (4, "Chair", "Furniture", 5000),
    (5, "Headphones", "Electronics", 2000),
]

# Sample data for sales transactions: (SaleID, ProductID, Quantity)
sales = [
    (1, 1, 2),
    (2, 2, 1),
    (3, 3, 3),
    (4, 1, 1),
    (5, 4, 5),
    (6, 2, 2),
    (7, 5, 10),
    (8, 3, 1),
]

# Define schema for DataFrames
product_columns = ["ProductID", "ProductName", "Category", "Price"]
sales_columns = ["SaleID", "ProductID", "Quantity"]

# Create DataFrames
product_df = spark.createDataFrame(products, schema=product_columns)
sales_df = spark.createDataFrame(sales, schema=sales_columns)

# Show the DataFrames
print("Products DataFrame:")
product_df.show()

print("Sales DataFrame:")
sales_df.show()

# #### **Step 2: Perform the Following Tasks**

# 1. **Join the DataFrames:**
#    - Join the `product_df` and `sales_df` DataFrames on `ProductID` to create a combined DataFrame with product and sales data.
product_and_sales_df = product_df.join(sales_df, on="ProductID")
print("Product and Sales DataFrame:")
product_and_sales_df.show()

# 2. **Calculate Total Sales Value:**
#    - For each sale, calculate the sales value by multiplying the price by the quantity sold.
#      Each row of `total_sales_df` is still a single sale transaction.
total_sales_df = product_and_sales_df.withColumn("TotalSales", col("Price") * col("Quantity"))
print("Total Sales DataFrame:")
total_sales_df.show()

# 3. **Find the Total Sales for Each Product Category:**
#    - Group the data by the `Category` column and calculate the total sales value for each product category.
total_sales_by_category_df = total_sales_df.groupBy("Category").sum("TotalSales").withColumnRenamed("sum(TotalSales)", "TotalSalesCategory")
print("Total Sales by Product Category:")
total_sales_by_category_df.show()

# 4. **Identify the Top-Selling Product:**
#    - Find the product that generated the highest total sales value.
# `top_selling_product_df` holds the summed sales value of EVERY product (one
# row per product); the name is kept so later steps that use it still work.
top_selling_product_df = total_sales_df.groupBy("ProductName").sum("TotalSales").withColumnRenamed("sum(TotalSales)", "TotalSalesProduct")
# A bare expression in the middle of a cell displays nothing, so show the
# answer explicitly: order by total descending and keep only the first row.
print("Top-Selling Product:")
top_selling_product_df.orderBy(col("TotalSalesProduct").desc()).limit(1).show()

# 5. **Sort the Products by Total Sales Value:**
#    - Sort the products by total sales value in descending order.
sorted_products_df = top_selling_product_df.orderBy(col("TotalSalesProduct").desc())
print("Sorted Products by Total Sales Value:")
sorted_products_df.show()

# 6. **Count the Number of Sales for Each Product:**
#    - Count the number of sales transactions for each product.
sales_count_df = total_sales_df.groupBy("ProductName").count().withColumnRenamed("count", "SalesCount")
print("Number of Sales for Each Product:")
sales_count_df.show()

# 7. **Filter the Products with Total Sales Value Greater Than ₹50,000:**
#    - Keep only the products whose total sales value is greater than ₹50,000.
# Filter the per-product totals, not the per-sale rows: a product whose
# individual sales are each at most ₹50,000 can still exceed it in total
# (e.g. Table: 45,000 + 15,000 = 60,000).
high_value_products_df = top_selling_product_df.filter(col("TotalSalesProduct") > 50000)
print("Products with Total Sales Value Greater Than ₹50,000:")
high_value_products_df.show()

Products DataFrame:
+---------+-----------+-----------+-----+
|ProductID|ProductName|   Category|Price|
+---------+-----------+-----------+-----+
|        1|     Laptop|Electronics|50000|
|        2| Smartphone|Electronics|30000|
|        3|      Table|  Furniture|15000|
|        4|      Chair|  Furniture| 5000|
|        5| Headphones|Electronics| 2000|
+---------+-----------+-----------+-----+

Sales DataFrame:
+------+---------+--------+
|SaleID|ProductID|Quantity|
+------+---------+--------+
|     1|        1|       2|
|     2|        2|       1|
|     3|        3|       3|
|     4|        1|       1|
|     5|        4|       5|
|     6|        2|       2|
|     7|        5|      10|
|     8|        3|       1|
+------+---------+--------+

Product and Sales DataFrame:
+---------+-----------+-----------+-----+------+--------+
|ProductID|ProductName|   Category|Price|SaleID|Quantity|
+---------+-----------+-----------+-----+------+--------+
|        1|     Laptop|Electronics|50000|   

In [None]:
from pyspark.sql import SparkSession

# Start (or reuse) the Spark session for the RDD examples.
spark = (
    SparkSession.builder
    .appName("RDD Transformation Example")
    .getOrCreate()
)

# RDDs are created through the SparkContext owned by the session.
sc = spark.sparkContext
print("Spark Session Created")

# Distribute the integers 1 through 10 as an RDD.
data = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
rdd = sc.parallelize(data)

# collect() brings the elements back to the driver so they can be printed.
original_elements = rdd.collect()
print("Original RDD:", original_elements)

Spark Session Created
Original RDD: [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [None]:
# map: apply a function to every element (here, double each number).
rdd2 = rdd.map(lambda x: x * 2)

# Print the transformed RDD
print("RDD after map transformation (x2):", rdd2.collect())

# filter: keep only the elements for which the predicate is true.
# Every element of rdd2 is a doubled integer, so this keeps all of them.
rdd3 = rdd2.filter(lambda x: x % 2 == 0)

# Print the filtered RDD
print("RDD after filter transformation (even numbers):", rdd3.collect())

# Three separate sentences. The commas matter: adjacent string literals with
# no comma between them are concatenated by Python into a single string.
sentences = ["Hello world", "PySpark is great", "RDD transformations"]

rdd4 = sc.parallelize(sentences)

# flatMap: map each sentence to a list of words, then flatten the lists.
words_rdd = rdd4.flatMap(lambda sentence: sentence.split(" "))

# Print the flatMapped RDD
print("RDD after flatMap transformation (split into words):", words_rdd.collect())

RDD after map transformation (x2): [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
RDD after filter transformation (even numbers): [2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
RDD after flatMap transformation (split into words): ['Hello', 'world', 'PySpark', 'is', 'greatRDD', 'transformations']


In [None]:
# collect: bring every element of rdd3 back to the driver as a list.
results = rdd3.collect()
print(results)

# count: number of elements in rdd3.
count = rdd3.count()
print(f"Number of elements: {count}")

# reduce: combine the elements pairwise with the given function.
# Note this sums the original `rdd`, not the transformed `rdd3`.
total_sum = rdd.reduce(lambda x, y: x + y)
# f-string with braces so the value is interpolated instead of printing the
# literal text "(total_sum)".
print(f"Total sum: {total_sum}")

[2, 4, 6, 8, 10, 12, 14, 16, 18, 20]
Number of elements: 10
Total sum: (total_sum)


In [None]:
import pandas as pd

# Column-oriented construction: each dictionary key becomes a column.
data = {
    'Name': ['John', 'Emma', 'Alex'],
    'Age': [28, 32, 25],
    'City': ['New York', 'London', 'Paris'],
}
df = pd.DataFrame(data)

# Row-oriented construction: each inner list is one row, and the column
# names are supplied separately. This replaces the `data` and `df` above.
data = [
    ['John', 28, 'New York'],
    ['Emma', 32, 'London'],
    ['Alex', 25, 'Paris'],
]
df = pd.DataFrame(data, columns=['Name', 'Age', 'City'])

In [None]:
# Display the first few rows
print(df.head())

# Get basic information about the DataFrame.
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() would add a stray "None" line after the report.
df.info()

# Get statistical summary of numerical columns
print(df.describe())

   Name  Age      City
0  John   28  New York
1  Emma   32    London
2  Alex   25     Paris
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    3 non-null      object
 1   Age     3 non-null      int64 
 2   City    3 non-null      object
dtypes: int64(1), object(2)
memory usage: 200.0+ bytes
None
             Age
count   3.000000
mean   28.333333
std     3.511885
min    25.000000
25%    26.500000
50%    28.000000
75%    30.000000
max    32.000000


In [None]:
# One column, selected by label for all rows, comes back as a Series.
ages = df.loc[:, 'Age']
print(ages)

# A list of labels selects several columns and yields a DataFrame.
subset = df.loc[:, ['Name', 'City']]
print(subset)

# Label-based row slice: .loc includes both endpoints, so 0:1 is two rows.
first_two = df.loc[0:1]
print(first_two)

0    28
1    32
2    25
Name: Age, dtype: int64
   Name      City
0  John  New York
1  Emma    London
2  Alex     Paris
   Name  Age      City
0  John   28  New York
1  Emma   32    London


In [None]:
# Rename columns.
# rename() returns a new DataFrame; store it under its own name instead of
# overwriting `df`, so cells that still use the 'Name' / 'City' columns keep
# working when the notebook is run top to bottom.
renamed_df = df.rename(columns={'Name': 'Full Name', 'City': 'Location'})
print(renamed_df)

In [None]:
# Boolean mask: True for the rows where Age is below 30.
is_under_30 = df['Age'] < 30
young_people = df[is_under_30]
print(young_people)

# Combine masks element-wise with & to require both conditions at once.
lives_in_new_york = df['City'] == 'New York'
young_new_yorkers = df[is_under_30 & lives_in_new_york]
print(young_new_yorkers)

   Name  Age      City
0  John   28  New York
2  Alex   25     Paris
   Name  Age      City
0  John   28  New York
