In [17]:
import pandas as pd
import numpy as np

# Example 1

In [57]:

# Toy frame with gaps: column A has one missing entry, column B has two.
df = pd.DataFrame(
    data={
        "A": [1, 2, None, 4, 5],
        "B": [5, None, None, 3, 2],
    }
)
df

Unnamed: 0,A,B
0,1.0,5.0
1,2.0,
2,,
3,4.0,3.0
4,5.0,2.0


In [54]:
# Impute column A: every NaN is swapped for the mean of the non-missing values.
df["A"].replace(
    to_replace=np.nan,
    value=df["A"].dropna().mean(),
)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
Name: A, dtype: float64

In [55]:
# Same imputation expressed with fillna: missing entries of A take the column mean.
df["A"].fillna(
    value=df["A"].dropna().mean(),
)

0    1.0
1    2.0
2    3.0
3    4.0
4    5.0
Name: A, dtype: float64

# Example 2

In [62]:
# Five sales records, each tagged with a category label.
df = pd.DataFrame(
    data={
        "category": ["A", "A", "B", "B", "C"],
        "sales": [100, 150, 200, 250, 300],
    }
)

### method 1

In [70]:
# Total sales per category (grouping by the column label).
df.groupby("category")["sales"].sum()


category
A    250
B    450
C    300
Name: sales, dtype: int64

In [69]:
# Average sales per category (grouping by the column label).
df.groupby("category")["sales"].mean()


category
A    125.0
B    225.0
C    300.0
Name: sales, dtype: float64

### method 2

In [73]:
# Sum and mean in a single pass; each keyword names an output column and maps
# it to a (source column, aggregation) pair.
df.groupby("category").agg(
    sum_sales=("sales", "sum"),
    avg_sales=("sales", "mean"),
)

Unnamed: 0_level_0,sum_sales,avg_sales
category,Unnamed: 1_level_1,Unnamed: 2_level_1
A,250,125.0
B,450,225.0
C,300,300.0


# Example 3

In [74]:
# Two small tables sharing the key column 'id'; ids 1 and 2 occur in both,
# id 3 only in df1 and id 4 only in df2.
df1 = pd.DataFrame(
    data={
        "id": [1, 2, 3],
        "name": ["Alice", "Bob", "Charlie"],
    }
)
df2 = pd.DataFrame(
    data={
        "id": [1, 2, 4],
        "age": [25, 30, 35],
    }
)
# Merge df1 and df2 on 'id'


In [75]:
# Inner join: keep only the ids present in both tables.
df1.merge(right=df2, how="inner", on="id")

Unnamed: 0,id,name,age
0,1,Alice,25
1,2,Bob,30


# Example 3 (PySpark): filter rows and select a column

In [80]:
# findspark is used here to locate the local Spark installation from Python.
import findspark
from pyspark.sql import SparkSession

# find() returns the detected Spark home as a string (displayed as this cell's output).
# NOTE(review): find() only reports the path; if pyspark were not already importable,
# findspark.init() would presumably have to run before the pyspark import above — confirm.
findspark.find()

'/opt/spark'

In [79]:
# Obtain (or reuse) the session for this app, then build a three-row demo frame.
spark = (
    SparkSession.builder
    .appName("ExampleApp")
    .getOrCreate()
)
df = spark.createDataFrame(
    data=[
        (1, "Alice", 25),
        (2, "Bob", 30),
        (3, "Charlie", 35),
    ],
    schema=["id", "name", "age"],
)
# Filter rows where age > 25 and select 'name' column


Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/06/11 08:06:54 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [81]:
# Keep rows with age above 25 and print only the name column.
df.where(df["age"] > 25).select("name").show()

                                                                                

+-------+
|   name|
+-------+
|    Bob|
|Charlie|
+-------+



# Example 4

In [82]:
# Five (category, sales) records loaded into a Spark DataFrame.
df = spark.createDataFrame(
    data=[
        ("A", 100),
        ("A", 150),
        ("B", 200),
        ("B", 250),
        ("C", 300),
    ],
    schema=["category", "sales"],
)
# Group by 'category' and calculate the sum of sales


In [86]:
# Sum sales per category, then rename Spark's default 'sum(sales)' column
# before printing the result.
(
    df.groupBy("category")
    .sum("sales")
    .withColumnRenamed("sum(sales)", "sales_sum")
    .show()
)

+--------+---------+
|category|sales_sum|
+--------+---------+
|       A|      250|
|       B|      450|
|       C|      300|
+--------+---------+



In [None]:
# Example 5