In [30]:
import pandas as pd
import numpy as np

- **Pandas Merge Function**

In [31]:
customers = pd.DataFrame({
    'customer_id': [101, 102, 103, 104],
    'name': ['Alice', 'Bob', 'Charlie', 'Diana'],
    'segment': ['Gold', 'Silver', 'Gold', 'Bronze']
})

transactions = pd.DataFrame({
    'txn_id': [1, 2, 3, 4, 5, 6],
    'customer_id': [101, 101, 102, 105, 103, 103],
    'amount': [120.0, 80.0, 50.0, 200.0, 35.0, 65.0],
    'date': pd.to_datetime(['2025-12-01','2025-12-05','2025-12-02','2025-12-03','2025-12-04','2025-12-10'])
})


In [32]:
customers

Unnamed: 0,customer_id,name,segment
0,101,Alice,Gold
1,102,Bob,Silver
2,103,Charlie,Gold
3,104,Diana,Bronze


In [33]:
transactions

Unnamed: 0,txn_id,customer_id,amount,date
0,1,101,120.0,2025-12-01
1,2,101,80.0,2025-12-05
2,3,102,50.0,2025-12-02
3,4,105,200.0,2025-12-03
4,5,103,35.0,2025-12-04
5,6,103,65.0,2025-12-10


- **Exercise task**
    - Task 1: Join transactions with customer details so each transaction shows the customer’s name and segment.
    - Task 2: Show all customers, even with no transactions (left join customers→transactions).
    - Task 3: Find customers that have transactions but aren’t in the customer list.
    - Task 4: Total spend per customer, then attach customer info.
    - Task 5: Total spend per segment.



In [34]:
task1 = pd.merge(transactions,customers,on="customer_id",how="inner")
task1

Unnamed: 0,txn_id,customer_id,amount,date,name,segment
0,1,101,120.0,2025-12-01,Alice,Gold
1,2,101,80.0,2025-12-05,Alice,Gold
2,3,102,50.0,2025-12-02,Bob,Silver
3,5,103,35.0,2025-12-04,Charlie,Gold
4,6,103,65.0,2025-12-10,Charlie,Gold


In [35]:
task2 = pd.merge(customers,transactions,on="customer_id",how="left")
task2

Unnamed: 0,customer_id,name,segment,txn_id,amount,date
0,101,Alice,Gold,1.0,120.0,2025-12-01
1,101,Alice,Gold,2.0,80.0,2025-12-05
2,102,Bob,Silver,3.0,50.0,2025-12-02
3,103,Charlie,Gold,5.0,35.0,2025-12-04
4,103,Charlie,Gold,6.0,65.0,2025-12-10
5,104,Diana,Bronze,,,NaT


In [36]:
task3 = pd.merge(customers[["customer_id"]],transactions,on="customer_id",how="right",indicator=True)
task3

Unnamed: 0,customer_id,txn_id,amount,date,_merge
0,101,1,120.0,2025-12-01,both
1,101,2,80.0,2025-12-05,both
2,102,3,50.0,2025-12-02,both
3,105,4,200.0,2025-12-03,right_only
4,103,5,35.0,2025-12-04,both
5,103,6,65.0,2025-12-10,both


In [37]:
task3[task3["_merge"] == "right_only"][["customer_id","txn_id","amount","date"]]

Unnamed: 0,customer_id,txn_id,amount,date
3,105,4,200.0,2025-12-03


In [38]:
# task 4
grouped_tr = transactions.groupby("customer_id")["amount"].sum().reset_index()
grouped_tr

Unnamed: 0,customer_id,amount
0,101,200.0
1,102,50.0
2,103,100.0
3,105,200.0


In [39]:
task4 = pd.merge(customers,grouped_tr,on="customer_id",how="left")
task4.sort_values(by="amount",na_position="first")

Unnamed: 0,customer_id,name,segment,amount
3,104,Diana,Bronze,
1,102,Bob,Silver,50.0
2,103,Charlie,Gold,100.0
0,101,Alice,Gold,200.0


In [40]:
task5 = pd.merge(customers,transactions,on="customer_id",how="inner")
task5

Unnamed: 0,customer_id,name,segment,txn_id,amount,date
0,101,Alice,Gold,1,120.0,2025-12-01
1,101,Alice,Gold,2,80.0,2025-12-05
2,102,Bob,Silver,3,50.0,2025-12-02
3,103,Charlie,Gold,5,35.0,2025-12-04
4,103,Charlie,Gold,6,65.0,2025-12-10


In [41]:
task5.groupby("segment")["amount"].sum().reset_index()

Unnamed: 0,segment,amount
0,Gold,300.0
1,Silver,50.0


## Data Integration
- Actors and Directors Who Cooperated At Least Three Times


In [42]:
actors_and_directors = pd.DataFrame({
    'actor_id': [1, 1, 1, 1, 1, 2, 2],
    'director_id': [1, 1, 1, 2, 2, 1, 1],
    'timestamp': [0, 1, 2, 3, 4, 5, 6]
})
actors_and_directors

Unnamed: 0,actor_id,director_id,timestamp
0,1,1,0
1,1,1,1
2,1,1,2
3,1,2,3
4,1,2,4
5,2,1,5
6,2,1,6


In [43]:
actors_and_directors1 = actors_and_directors.groupby(["actor_id","director_id"])["timestamp"].count().reset_index()

In [44]:
actors_and_directors1

Unnamed: 0,actor_id,director_id,timestamp
0,1,1,3
1,1,2,2
2,2,1,2


In [45]:
actors_and_directors1.loc[actors_and_directors1.timestamp >= 3,["actor_id","director_id"]]

Unnamed: 0,actor_id,director_id
0,1,1


- **Replace Employee ID With The Unique Identifier**

In [46]:
employees_data = [
    [1, 'Alice'],
    [7, 'Bob'],
    [11, 'Meir'],
    [90, 'Winston'],
    [3, 'Jonathan']
]
employees = pd.DataFrame(employees_data, columns=['id', 'name'])

employee_uni_data = [
    [3, 1],
    [11, 2],
    [90, 3]
]
employee_uni = pd.DataFrame(employee_uni_data, columns=['id', 'unique_id'])

In [47]:
employees

Unnamed: 0,id,name
0,1,Alice
1,7,Bob
2,11,Meir
3,90,Winston
4,3,Jonathan


In [52]:
employee_uni

Unnamed: 0,id,unique_id
0,3,1
1,11,2
2,90,3


In [54]:
res = pd.merge(employees,employee_uni,on="id",how="left")
res[["unique_id","name"]]

Unnamed: 0,unique_id,name
0,,Alice
1,,Bob
2,2.0,Meir
3,3.0,Winston
4,1.0,Jonathan
