# **第4章 Python在處理數據中的應用**

## 4.1 讀取數據

### 4.1.1 讀取本地文件數據(CSV、Excel、文本)

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import csv

# Column names of the CSV file, in output order
fields = ["Bill ID", "Bill Type", "Amount", "Due Date"]

# One tuple per bill, with values in the same order as `fields`
_bill_rows = [
    (1, "信用卡", 500.00, "2023-04-15"),
    (2, "房貸", 2000.00, "2023-04-20"),
    (3, "信用卡", 300.00, "2023-04-22"),
    (4, "水電費", 150.00, "2023-04-25"),
    (5, "房貸", 2000.00, "2023-04-30"),
]

# Bill records as dictionaries keyed by column name
bills = [dict(zip(fields, values)) for values in _bill_rows]

# Write the header row followed by one row per bill
with open('bank_bills.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fields)
    writer.writeheader()
    for bill in bills:
        writer.writerow(bill)

print("CSV 文件已生成。")

In [None]:
# Import the pandas library
import pandas as pd

# Path of the CSV file to load
file_path = "bank_bills.csv"

# Read the CSV file into a DataFrame with pandas' read_csv()
bills = pd.read_csv(file_path)

# Show the first five records (head() returns 5 rows by default)
print(bills.head())

### 4.1.2 讀取數據庫數據

In [None]:
# This cell needs a real SQLite database file (bank_customers.db) that
# contains a `customers` table.
import sqlite3
import pandas as pd

# Open the connection to the SQLite database
conn = sqlite3.connect('bank_customers.db')

try:
    # Load every row of the customers table into a DataFrame
    customers = pd.read_sql("SELECT * FROM customers", conn)

    # Print the result
    print(customers)
finally:
    # Always release the connection, even when the query above fails
    conn.close()

In [None]:
# This cell needs a reachable DB2 database and an installed IBM DB2 ODBC
# driver; replace the placeholder connection values with real ones.
import pyodbc
import pandas as pd

# Connect to the DB2 database. pyodbc.connect() expects ONE connection
# string, so the fragments below are adjacent string literals (no commas)
# that Python concatenates into a single string.
conn = pyodbc.connect(
    "Driver={IBM DB2 ODBC DRIVER};"
    "Database=mydb;"
    "Hostname=myhost;"
    "Port=50000;"
    "Protocol=TCPIP;"
    "Uid=myuser;"
    "Pwd=mypassword;"
)

try:
    # Load every row of the customers table into a DataFrame
    customers = pd.read_sql("SELECT * FROM customers", conn)

    # Print the result
    print(customers)
finally:
    # Always release the connection, even when the query above fails
    conn.close()

## 4.2 常用的數據處理方法

### 4.2.1 垂直合併(concatenate)

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import csv

# Header row shared by all three branch files
_ACCOUNT_HEADER = ["Account ID", "Account Type", "Balance"]

# Account rows of each branch; the first row of every list is the header
branch1_accounts = [
    list(_ACCOUNT_HEADER),
    ["1001", "儲蓄", "5000.00"],
    ["1002", "支票", "12000.00"],
    ["1003", "儲蓄", "8500.00"],
]

branch2_accounts = [
    list(_ACCOUNT_HEADER),
    ["2001", "儲蓄", "6300.00"],
    ["2002", "支票", "18000.00"],
    ["2003", "定期", "25000.00"],
]

branch3_accounts = [
    list(_ACCOUNT_HEADER),
    ["3001", "儲蓄", "4800.00"],
    ["3002", "支票", "15000.00"],
    ["3003", "定期", "30000.00"],
]


def write_csv(filename, data):
    """Write the rows in *data* to *filename* as a UTF-8 encoded CSV file.

    Args:
        filename: Path of the file to create or overwrite.
        data: Iterable of rows, each row an iterable of cell values.
    """
    with open(filename, mode='w', newline='', encoding='utf-8') as file:
        csv.writer(file).writerows(data)


# Create one CSV file per branch
for _target, _rows in (
    ('branch1_accounts.csv', branch1_accounts),
    ('branch2_accounts.csv', branch2_accounts),
    ('branch3_accounts.csv', branch3_accounts),
):
    write_csv(_target, _rows)

print("CSV 文件已生成。")

In [None]:
import pandas as pd

# Load the account data of each branch
branch1_accounts = pd.read_csv('branch1_accounts.csv')
branch2_accounts = pd.read_csv('branch2_accounts.csv')
branch3_accounts = pd.read_csv('branch3_accounts.csv')

# Stack the three tables vertically. ignore_index=True renumbers the rows
# 0..n-1; without it every branch keeps its own 0, 1, 2 labels and the
# combined index contains duplicates.
all_accounts = pd.concat(
    [branch1_accounts, branch2_accounts, branch3_accounts],
    ignore_index=True,
)

# Print the combined data
print(all_accounts)

### 4.2.2 水平合併(merge)

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import pandas as pd

# Customer data, column name -> values
customers_data = {
    'customer_id': [1001, 1002, 1003],
    'name': ['John Doe', 'Jane Smith', 'Bob Johnson'],
    'address': ['123 Main St, Anytown USA', '456 Oak Rd, Somewhere City', '789 Elm Ave, Otherplace'],
    'phone': ['555-1234', '555-5678', '555-9012'],
}

# Account data, column name -> values (customer 1003 has no account)
accounts_data = {
    'account_id': [10001, 10002, 10003, 10004],
    'customer_id': [1001, 1001, 1002, 1002],
    'account_type': ['儲蓄', '支票', '儲蓄', '定期'],
    'balance': [5000.00, 12000.00, 6300.00, 25000.00],
}

# Build both DataFrames
customers_df = pd.DataFrame(data=customers_data)
accounts_df = pd.DataFrame(data=accounts_data)

# Export each DataFrame to its CSV file, without the index column
for _frame, _target in ((customers_df, 'customers.csv'), (accounts_df, 'accounts.csv')):
    _frame.to_csv(_target, index=False)

In [None]:
import pandas as pd

# Load the customers' basic information
customers = pd.read_csv('customers.csv')

# Load the customers' account information
accounts = pd.read_csv('accounts.csv')

# Join accounts onto customers by customer_id; how='left' keeps every
# customer row, including customers that have no account
customer_accounts = pd.merge(customers, accounts, on='customer_id', how='left')

# Print the merged data
print(customer_accounts)

### 4.2.3 數據篩選

**單一篩選條件的例子**

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import pandas as pd

# Employee salary information, column name -> values
data_salary = {
    'employee_id': [1, 2, 3, 4, 5],
    'name': ['Alice', 'Bob', 'Charlie', 'David', 'Eve'],
    'salary': [4500, 5200, 6000, 4800, 7000]
}

df_salary = pd.DataFrame(data_salary)

In [None]:
# Keep only the employees whose salary is above 5,000; the comparison
# produces a boolean mask that selects the matching rows
high_salary = df_salary[df_salary['salary'] > 5000]
print(high_salary)

**布爾條件的概念**

In [None]:
# Generate the sample data used by the next cells (readers may skip over
# this setup code).
import pandas as pd

# Bank account information, column name -> values
data_account = {
    'account_id': [101, 102, 103, 104, 105],
    'account_type': ['儲蓄', '支票', '儲蓄', '儲蓄', '支票'],
    'balance': [500, 15000, 12000, 800, 7000]
}

df_account = pd.DataFrame(data_account)

In [None]:
# Each parenthesised comparison yields a boolean Series; & combines them
# element-wise, so the result is True only for rows whose account type is
# '儲蓄' and whose balance is above 10,000.
(df_account['account_type'] == '儲蓄') & (df_account['balance'] > 10000)

In [None]:
# Select the records whose account type is '儲蓄' and whose balance is
# either above 10,000 or below 1,000. The extra parentheses group the two
# balance tests so the | is evaluated before the &.
condition = (df_account['account_type'] == '儲蓄') & ((df_account['balance'] > 10000) | (df_account['balance'] < 1000))
filtered_df = df_account[condition]
print(filtered_df)

### 4.2.4 字典映射

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import csv

# Business records, one dictionary per row
data = [
    {"biz_id": 1001, "biz_type_code": 1, "amount": 5000.00},
    {"biz_id": 1002, "biz_type_code": 2, "amount": 12000.00},
    {"biz_id": 1003, "biz_type_code": 3, "amount": 20000.00}
]

# Name of the CSV file
file_name = 'bank_biz.csv'

# Column names, in output order
fields = ["biz_id", "biz_type_code", "amount"]

# Write the CSV file. The encoding is stated explicitly so the file does
# not depend on the platform default and matches what pandas.read_csv
# expects (UTF-8).
with open(file_name, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=fields)

    # Header row first, then every data row
    writer.writeheader()
    writer.writerows(data)

print(f'CSV 文件 "{file_name}" 已生成。')

In [None]:
import pandas as pd

# Lookup dictionary from business-type code to its label
biz_type_map = {
    1: '儲蓄',
    2: '支票',
    3: '定期'
}

# Load the bank business data
bank_biz = pd.read_csv('bank_biz.csv')

# Translate each code through the dictionary into a new biz_type column;
# a code that is not a key of the dictionary becomes NaN
bank_biz['biz_type'] = bank_biz['biz_type_code'].map(biz_type_map)

# Print the result
print(bank_biz)

### 4.2.5 改變列值

In [None]:
import pandas as pd

# Build the customer bill table
bills = pd.DataFrame({
    'bill_id': [1001, 1002, 1003, 1004, 1005],
    'bill_type': ['水費', '電費', '房租', '信用卡', '網費'],
    'amount': [150, 200, 2000, 500, 100],
})

# Boolean masks for the two non-default categories
is_high = bills['amount'] >= 1000
is_low = bills['amount'] < 100

# Every bill starts as '正常'; bills matching a mask are then relabelled
bills['status'] = '正常'
bills.loc[is_high, 'status'] = '高額'
bills.loc[is_low, 'status'] = '低額'

# Print the result
print(bills)

### 4.2.6 樞紐分析表（Pivot table）

In [None]:
import csv

# Header row of the CSV file
_tx_header = ["tx_id", "biz_type", "amount", "tx_date"]

# One row per transaction, in the same column order as the header
_tx_records = [
    [1001, "儲蓄", 5000.00, "2023-04-01"],
    [1002, "支票", 8000.00, "2023-04-15"],
    [1003, "定期", 12000.00, "2023-05-01"],
    [1004, "儲蓄", 6000.00, "2023-05-10"],
    [1005, "支票", 10000.00, "2023-06-01"],
]

# Everything written to the CSV file: the header followed by the records
data = [_tx_header, *_tx_records]

# Write the rows to the CSV file one at a time
with open('bank_transactions.csv', mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    for _record in data:
        writer.writerow(_record)

print("CSV文件生成完成")

In [None]:
import pandas as pd

# Load the bank transaction data
transactions = pd.read_csv('bank_transactions.csv')

# Add a month column: parse tx_date as a datetime and take its month number
transactions['month'] = pd.to_datetime(transactions['tx_date']).dt.month

# Build the amount report with pivot_table: one row per month, one column
# per business type, each cell the summed amount. fill_value=0 fills the
# month/type combinations that have no transactions.
tx_report = transactions.pivot_table(
    index='month',
    columns='biz_type',
    values='amount',
    aggfunc='sum',
    fill_value=0
)

# Print the report
print(tx_report)

## 4.3 清洗和轉換數據

### 4.3.1 常用的 DataFrame 統計方法

df.describe()

In [None]:
import pandas as pd

# Create a DataFrame with two numeric columns
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
})

# Print the summary statistics produced by describe()
print(df.describe())

df.shape

In [None]:
import pandas as pd

# Create a DataFrame with two numeric columns
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
})

# Print the (row count, column count) tuple of the DataFrame
print(df.shape)

df.info()

In [None]:
import pandas as pd

# Create a DataFrame with two numeric columns
df = pd.DataFrame({
    'A': [1, 2, 3, 4, 5],
    'B': [10, 20, 30, 40, 50]
})

# Show the basic information of the DataFrame. info() writes its report
# itself and returns None, so it must not be wrapped in print() (that
# would append a stray "None" line to the output).
df.info()

df.head()和df.tail()

In [None]:
import pandas as pd

# Create a ten-row DataFrame: A counts 1..10, B counts 10..100 in tens
df = pd.DataFrame({
    'A': list(range(1, 11)),
    'B': list(range(10, 101, 10)),
})

# Show the first 5 rows
print(df.head(5))

# Show the last 5 rows
print(df.tail(5))

df.isnull().sum()

In [None]:
import pandas as pd

# Create a DataFrame that contains missing values (the None entries)
df = pd.DataFrame({
    'A': [1, 2, 3, None, 5],
    'B': [10, 20, None, 40, 50]
})

# Count the missing values in each column
print(df.isnull().sum())

### 4.3.2 處理缺失值

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code). NOTE: this overwrites the customers.csv written by
# the merge example earlier in the chapter.
import csv

# Customer records; the empty strings become missing values once the file
# is read back with pandas
data = [
    {"customer_id": 1001, "name": "John Doe", "address": "123 Main St, Anytown USA", "phone": "555-1234"},
    {"customer_id": 1002, "name": "Jane Smith", "address": "", "phone": "555-5678"},
    {"customer_id": 1003, "name": "Bob Johnson", "address": "789 Elm Ave, Otherplace", "phone": ""},
]

# Column names of the CSV file
fieldnames = ["customer_id", "name", "address", "phone"]

# Create and write the file. The encoding is stated explicitly so the file
# does not depend on the platform default and matches what pandas.read_csv
# expects (UTF-8).
with open("customers.csv", mode="w", newline="", encoding="utf-8") as file:
    writer = csv.DictWriter(file, fieldnames=fieldnames)

    # Header row first, then every data row
    writer.writeheader()
    writer.writerows(data)

In [None]:
import pandas as pd

# Load the customer information
customers = pd.read_csv('customers.csv')

# Count the missing values in each column
print(customers.isnull().sum())

# Fill missing addresses with 'Unknown' (only the address column is
# filled; missing values in other columns are left as they are)
customers['address'] = customers['address'].fillna('Unknown')

# Print the result
print(customers)

### 4.3.3 數據格式轉換

#### 日期時間格式轉換

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import csv

# Rows to write: the header row followed by one row per transaction
data = [
    ["trans_id", "trans_time", "amount", "account_id"],
    [1001, "2023-04-15 09:30:00", 500.00, 10001],
    [1002, "2023-04-16 14:45:00", 200.00, 10002],
    [1003, "2023-04-17 11:20:00", 300.00, 10003],
    [1004, "2023-04-18 16:55:00", 400.00, 10001],
    [1005, "2023-04-19 13:10:00", 150.00, 10002]
]

# Name of the CSV file
filename = "transactions.csv"

# Write the CSV file. The encoding is stated explicitly so the file does
# not depend on the platform default and matches what pandas.read_csv
# expects (UTF-8).
with open(filename, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.writer(file)
    writer.writerows(data)

print(f"CSV 檔案 '{filename}' 已成功生成。")

In [None]:
import pandas as pd

# Load the transaction records
transactions = pd.read_csv('transactions.csv')

# Convert the transaction time from text to datetime
transactions['trans_time'] = pd.to_datetime(transactions['trans_time'])

# Extract the date part into a new column
transactions['trans_date'] = transactions['trans_time'].dt.date

# Replace trans_time with its time-of-day part. This runs after the date
# extraction because it overwrites the datetime column.
transactions['trans_time'] = transactions['trans_time'].dt.time

# Print the result
print(transactions.head())

#### 數值格式轉換

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import csv

# Records to write, one dictionary per row
data = [
    {'id': 1, 'price': 9.99, 'quantity': 5},
    {'id': 2, 'price': 19.99, 'quantity': 10},
    {'id': 3, 'price': 29.99, 'quantity': 15}
]

# Name of the CSV file
csv_file = 'data.csv'

# Write the CSV file; the column names come from the keys of the first
# record. The encoding is stated explicitly so the file does not depend on
# the platform default and matches what pandas.read_csv expects (UTF-8).
with open(csv_file, mode='w', newline='', encoding='utf-8') as file:
    writer = csv.DictWriter(file, fieldnames=data[0].keys())
    writer.writeheader()
    writer.writerows(data)

print(f"CSV文件 '{csv_file}' 已經生成。")

In [None]:
import pandas as pd

# Load the data
data = pd.read_csv('data.csv')

# Convert the 'price' column to floating-point
data['price'] = data['price'].astype(float)

# Convert the 'quantity' column to integer
data['quantity'] = data['quantity'].astype(int)

# Print the result
print(data.head())

#### 字符串格式轉換

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code). NOTE: this overwrites the data.csv written by the
# numeric-conversion example above.
import csv

# Column names of the CSV file
fieldnames = ['id', 'name', 'address']

# Content of the CSV file; the names deliberately mix upper and lower case
rows = [
    {'id': 1, 'name': 'John Doe', 'address': '123 Main St'},
    {'id': 2, 'name': 'Jane SMITH', 'address': '456 Oak Ave'},
    {'id': 3, 'name': 'BOB Johnson', 'address': '789 Elm St'}
]

# Open the CSV file and write it. The encoding is stated explicitly so the
# file does not depend on the platform default and matches what
# pandas.read_csv expects (UTF-8).
with open('data.csv', 'w', newline='', encoding='utf-8') as csvfile:
    # Create the CSV writer object
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)

    # Write the column names
    writer.writeheader()

    # Write the data rows
    writer.writerows(rows)

print("CSV文件已生成：data.csv")

In [None]:
import pandas as pd

# Load the data
data = pd.read_csv('data.csv')

# Convert the 'name' column to lower case
data['name'] = data['name'].str.lower()

# Strip leading and trailing whitespace from the 'address' column
data['address'] = data['address'].str.strip()

# Print the result
print(data.head())

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import pandas as pd

# Content of data.xlsx
data_xlsx = {
    'ID': [1, 2, 3],
    'Name': ['John', 'Jane', 'Bob'],
    'Amount': [100.00, 200.50, 150.25]
}

df_xlsx = pd.DataFrame(data_xlsx)
df_xlsx.to_excel('data.xlsx', index=False)  # save as an XLSX file

# Content of data.csv (this overwrites the data.csv written by the
# earlier examples)
data_csv = {
    'Order_ID': [1001, 1002, 1003],
    'Product': ['A', 'B', 'C'],
    'Quantity': [10, 20, 15]
}

df_csv = pd.DataFrame(data_csv)
df_csv.to_csv('data.csv', index=False)  # save as a CSV file

In [None]:
import pandas as pd

# Read the Excel file and set column types: ID as str, Amount as float
excel_data = pd.read_excel('data.xlsx', dtype={'ID': str, 'Amount': float})

# Read the CSV file and set column types: Order_ID and Quantity as int
csv_data = pd.read_csv('data.csv', dtype={'Order_ID': int, 'Quantity': int})

# Print the results
print(excel_data.head())
print(csv_data.head())

## 4.4 比較數據

### 4.4.1 查找重複數據

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import csv

# Column names of the CSV file
fieldnames = ['customer_id', 'name', 'address', 'phone']

# Rows of the CSV file; customer 1002 appears twice on purpose so that the
# next cell has a duplicate to find
data = [
    {'customer_id': 1001, 'name': 'John Doe', 'address': '123 Main St, Anytown USA', 'phone': '555-1234'},
    {'customer_id': 1002, 'name': 'Jane Smith', 'address': '456 Oak Rd, Somewhere City', 'phone': '555-5678'},
    {'customer_id': 1003, 'name': 'Bob Johnson', 'address': '789 Elm Ave, Otherplace', 'phone': '555-9012'},
    {'customer_id': 1002, 'name': 'Jane Smith', 'address': '456 Oak Rd, Somewhere City', 'phone': '555-5678'},
]

# Create the CSV file: header first, then all data rows in one call
with open('customers.csv', 'w', newline='', encoding='utf-8') as csvfile:
    writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
    writer.writeheader()
    writer.writerows(data)

print("customers.csv 文件已成功生成。")

In [None]:
import pandas as pd

# Load the customer information
customers = pd.read_csv('customers.csv')

# Find the duplicated customer records; keep=False flags every occurrence
# of a duplicated row, not only the later ones
duplicate_customers = customers[customers.duplicated(subset=['customer_id', 'name', 'address', 'phone'], keep=False)]

# Print the duplicated records
print(duplicate_customers)

# Drop the duplicates in place (inplace=True modifies `customers` itself)
customers.drop_duplicates(subset=['customer_id', 'name', 'address', 'phone'], inplace=True)

# Print the cleaned customer information
print(customers)

### 4.4.2 對比歷史數據

In [None]:
# Generate the sample data used by the next cell (readers may skip over
# this setup code).
import pandas as pd

# Loan data of the previous quarter
prev_quarter_data = {
    'loan_id': [1001, 1002, 1003],
    'customer_id': [1001, 1002, 1003],
    'loan_amount': [50000.00, 80000.00, 60000.00],
    'loan_date': ['2023-01-15', '2023-02-10', '2023-03-05']
}

# Build the DataFrame and export it as a CSV file
prev_quarter_df = pd.DataFrame(prev_quarter_data)
prev_quarter_df.to_csv('prev_quarter_loans.csv', index=False)

# Loan data of the current quarter
curr_quarter_data = {
    'loan_id': [2001, 2002, 2003],
    'customer_id': [1001, 1002, 1004],
    'loan_amount': [60000.00, 90000.00, 75000.00],
    'loan_date': ['2023-04-20', '2023-05-15', '2023-06-01']
}

# Build the DataFrame and export it as a CSV file
curr_quarter_df = pd.DataFrame(curr_quarter_data)
curr_quarter_df.to_csv('curr_quarter_loans.csv', index=False)

print("CSV文件已經創建完成。")

In [None]:
import pandas as pd

# Load the loan data of the previous quarter
prev_quarter = pd.read_csv('prev_quarter_loans.csv')

# Load the loan data of the current quarter
curr_quarter = pd.read_csv('curr_quarter_loans.csv')

# Total loan amount of each quarter
prev_total = prev_quarter['loan_amount'].sum()
curr_total = curr_quarter['loan_amount'].sum()

# Growth rate in percent, relative to the previous quarter's total
growth_rate = (curr_total - prev_total) / prev_total * 100

# Print the results with two decimal places
print(f"上個季度貸款總額: {prev_total:.2f}")
print(f"當前季度貸款總額: {curr_total:.2f}")
print(f"貸款業務增長率: {growth_rate:.2f}%")

## 4.5 輸出數據

### 4.5.1 輸出CSV文件

In [None]:
# Import the pandas library
import pandas as pd

# Create an example DataFrame (the following export cells reuse `df`)
data = {
    'Name': ['John', 'Jane', 'Bob'],
    'Age': [28, 24, 35],
    'City': ['New York', 'Los Angeles', 'Chicago']
}
df = pd.DataFrame(data)

# Export the DataFrame as a CSV file; index=False omits the row index
df.to_csv('output.csv', index=False)

### 4.5.2 輸出Excel文件

In [None]:
# Export the DataFrame as an Excel file; index=False omits the row index
df.to_excel('output.xlsx', index=False)

### 4.5.3 輸出JSON文件

In [None]:
# Export the DataFrame as a JSON file; orient='records' together with
# lines=True writes one JSON object per row, one row per line
df.to_json('output.json', orient='records', lines=True)

### 4.5.4 輸出其他格式數據

#### 輸出為HTML

In [None]:
# Export the DataFrame as an HTML file; index=False omits the row index
df.to_html('output.html', index=False)

#### 輸出為SQL

In [None]:
# Import the required module
import sqlite3

# Open a connection to the SQLite database file
conn = sqlite3.connect('example.db')

try:
    # Export the DataFrame as an SQL table; if_exists='replace' drops and
    # recreates the table when it already exists
    df.to_sql('table_name', conn, if_exists='replace', index=False)
finally:
    # Always release the connection, even when the export fails
    conn.close()

#### 輸出到剪貼板

In [None]:
# Copy the DataFrame to the system clipboard; index=False omits the row index
df.to_clipboard(index=False)

### 4.5.6 綜合運用

In [None]:
# Import the modules used by this cell
import sqlite3

import pandas as pd

# Create an example DataFrame
data = {
    'Product': ['Laptop', 'Printer', 'Tablet'],
    'Price': [1200.99, 150.75, 300.50],
    'Stock': [10, 20, 15]
}
df = pd.DataFrame(data)

# Export the same table in several file formats
df.to_csv('products.csv', index=False)
df.to_excel('products.xlsx', index=False)
df.to_json('products.json', orient='records', lines=True)
df.to_html('products.html', index=False)

# Export to an SQLite database
conn = sqlite3.connect('products.db')
try:
    # if_exists='replace' drops and recreates the table when it already exists
    df.to_sql('products', conn, if_exists='replace', index=False)
finally:
    # Always release the connection, even when the export fails
    conn.close()

## 4.6 Python處理數據實戰案例

### 4.6.1 讀取銀行業務數據

In [None]:
# Generate the sample data used by the case study (readers may skip over
# this setup code).
import pandas as pd

# Build the accounts DataFrame: one account per customer 101-115
accounts = pd.DataFrame({
    'account_id': [1001, 1002, 1003, 1004, 1005, 1006, 1007, 1008, 1009, 1010, 1011, 1012, 1013, 1014, 1015],
    'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
    'account_type': ['儲蓄', '支票', '定期', '儲蓄', '支票', '定期', '儲蓄', '支票', '定期', '儲蓄', '儲蓄', '支票', '定期', '儲蓄', '支票'],
    'balance': [50000.00, 30000.00, 100000.00, 80000.00, 45000.00, 150000.00, 60000.00, 35000.00, 120000.00, 75000.00, 55000.00, 40000.00, 110000.00, 85000.00, 50000.00]
})

# Build the transactions DataFrame: one transaction for each of customers 101-110
transactions = pd.DataFrame({
    'trans_id': [2001, 2002, 2003, 2004, 2005, 2006, 2007, 2008, 2009, 2010],
    'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110],
    'trans_date': ['2023-04-15', '2023-04-20', '2023-04-25', '2023-05-01', '2023-05-05', '2023-05-10', '2023-05-15', '2023-05-20', '2023-05-25', '2023-06-01'],
    'amount': [2000.00, 5000.00, 3000.00, 1500.00, 8000.00, 10000.00, 4000.00, 7000.00, 6000.00, 5000.00],
    'trans_type': ['存款', '取款', '轉帳', '存款', '取款', '存款', '轉帳', '取款', '存款', '轉帳']
})

# Build the loans DataFrame: one loan per customer 101-115
loans = pd.DataFrame({
    'loan_id': [3001, 3002, 3003, 3004, 3005, 3006, 3007, 3008, 3009, 3010, 3011, 3012, 3013, 3014, 3015],
    'customer_id': [101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115],
    'loan_type': ['房貸', '信用貸款', '汽車貸款', '房貸', '信用貸款', '房貸', '汽車貸款', '信用貸款', '房貸', '汽車貸款', '房貸', '信用貸款', '汽車貸款', '房貸', '信用貸款'],
    'loan_amount': [300000.00, 50000.00, 150000.00, 400000.00, 80000.00, 500000.00, 180000.00, 70000.00, 420000.00, 160000.00, 350000.00, 60000.00, 190000.00, 380000.00, 75000.00],
    'loan_date': ['2023-01-15', '2023-02-01', '2023-03-01', '2023-04-01', '2023-04-15', '2023-05-01', '2023-06-01', '2023-06-15', '2023-07-01', '2023-07-15', '2023-01-30', '2023-02-15', '2023-03-15', '2023-04-30', '2023-05-15']
})

# Write the CSV files. NOTE: bank_transactions.csv replaces the file of
# the same name written by the pivot-table example earlier in the chapter.
accounts.to_csv('bank_accounts.csv', index = False)
transactions.to_csv('bank_transactions.csv', index = False)
loans.to_csv('bank_loans.csv', index = False)

print("所有CSV文件已成功生成")

In [None]:
import pandas as pd

# Load the bank customers' account information
accounts = pd.read_csv('bank_accounts.csv')

# Load the bank customers' transaction records
transactions = pd.read_csv('bank_transactions.csv')

# Load the bank's loan business information
loans = pd.read_csv('bank_loans.csv')

### 4.6.2 合併銀行業務數據

In [None]:
# Join transactions onto accounts by customer_id; how='left' keeps every
# account row, including accounts whose customer has no transaction
account_transactions = pd.merge(accounts, transactions, on='customer_id', how='left')

# Join accounts onto loans by customer_id, keeping every loan row
loans_data = pd.merge(loans, accounts, on='customer_id', how='left')

### 4.6.3 瀏覽和理解數據

In [None]:
# Show the first 5 rows of the merged account/transaction data
print(account_transactions.head())

# Show the first 5 rows of the merged loan data
print(loans_data.head())

# Show the basic information of the merged account/transaction data.
# info() writes its report itself and returns None, so it is not wrapped
# in print() (that would append a stray "None" line to the output).
account_transactions.info()

# Show the basic information of the merged loan data
loans_data.info()

# Summary statistics of the merged account/transaction data
print(account_transactions.describe())

# Summary statistics of the merged loan data
print(loans_data.describe())

### 4.6.4 分析銀行業務情況

**1. 分析銀行客戶賬戶情況**

In [None]:
# Count the rows of accounts for each account type
account_types = accounts.groupby('account_type').size()
print(account_types)

In [None]:
# Sum the balance of accounts for each account type
account_balances = accounts.groupby('account_type')['balance'].sum()
print(account_balances)

**2. 分析銀行客戶交易情況**

In [None]:
# Convert trans_date to datetime so the .dt accessor can be used
account_transactions['trans_date'] = pd.to_datetime(account_transactions['trans_date'] )

# Total transaction amount of the second quarter in account_transactions.
# NOTE: the filter compares only the quarter number, not the year.
curr_quarter_transactions = account_transactions[account_transactions['trans_date'].dt.quarter == 2]
total_transactions = curr_quarter_transactions['amount'].sum()
print(f"第二季度總交易額: {total_transactions:.2f}")

In [None]:
# Sum the transaction amount per customer in account_transactions, sort
# the totals from largest to smallest and print the first 10
customer_transactions = account_transactions.groupby('customer_id')['amount'].sum().sort_values(ascending=False)
print(customer_transactions.head(10))

**3. 分析銀行貸款業務**

In [None]:
# Sum the loan amount of loans_data for each loan type
loan_amounts = loans_data.groupby('loan_type')['loan_amount'].sum()
print(loan_amounts)

In [None]:
# Convert loan_date to datetime so the .dt accessor can be used
loans_data['loan_date'] = pd.to_datetime(loans_data['loan_date'])

# Quarter number (1-4) of every loan, computed once and reused by the
# filters below. NOTE: only the quarter number is compared, not the year.
loan_quarter = loans_data['loan_date'].dt.quarter

# Report the loan total of quarter 2 (with its growth rate over quarter 1)
# and then the loan total of quarter 1
for quarter in range(2, 0, -1):
    quarter_loans = loans_data[loan_quarter == quarter]
    quarter_total = quarter_loans['loan_amount'].sum()
    if quarter > 1:
        prev_quarter_loans = loans_data[loan_quarter == quarter - 1]
        prev_total = prev_quarter_loans['loan_amount'].sum()
        if prev_total == 0:
            # No baseline to compare against: a growth rate would be a
            # division by zero, so report it as unavailable
            print(f"第{quarter}季度貸款總額: {quarter_total:.2f}, 增長率: N/A")
        else:
            growth_rate = (quarter_total - prev_total) / prev_total * 100
            print(f"第{quarter}季度貸款總額: {quarter_total:.2f}, 增長率: {growth_rate:.2f}%")
    else:
        # The first quarter has no earlier quarter to compare with
        print(f"第{quarter}季度貸款總額: {quarter_total:.2f}")