### 데이터사이언스

In [11]:
import csv
import pandas as pd

from datetime import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [12]:
train_data = pd.read_csv('DS-main/datas/train.csv')
test_data = pd.read_csv('DS-main/datas/test.csv')

In [14]:


# 현재 날짜 계산 (Year, Month, Day 컬럼 사용)
train_data['Current Date'] = pd.to_datetime(train_data[['Year', 'Month', 'Day']])

# 새로운 변수들 생성

# 1. 신용 한도 변동성 (Credit Limit / Account Open Date)
train_data['Acct Open Date'] = pd.to_datetime(train_data['Acct Open Date'])
train_data['Account Duration'] = (train_data['Current Date'] - train_data['Acct Open Date']).dt.days
train_data['Credit Limit Volatility'] = train_data['Credit Limit'] / train_data['Account Duration']

# 2. 계좌 개설 후 소득 변동 (Per Capita Income / Account Open Date)
train_data['Income Change Since Account Open'] = train_data['Per Capita Income - Zipcode'] / train_data['Account Duration']

# 3. 사용자 신용 등급 그룹 (Credit Score / User)
train_data['Credit Score Group'] = pd.qcut(train_data['Credit Score'], q=4, labels=['Low', 'Medium', 'High', 'Very High'])

# 4. 보안 칩 사용 여부 (Security Chip Used / Has Chip)
train_data['Chip Used'] = ((train_data['Has Chip'] == 'Yes') & (train_data['Whether Security Chip is Used'] == 'Yes')).astype(int)

# 5. 보안 갱신 여부 (PIN Last Changed / Security Used)
train_data['Security Updated'] = ((train_data['Year PIN last Changed'].notnull()) | (train_data['Whether Security Chip is Used'] == 'Yes')).astype(int)

# 6. 신용 점수 대비 신용 한도 비율 (Credit Score / Credit Limit)
train_data['Credit Score to Limit Ratio'] = train_data['Credit Score'] / train_data['Credit Limit']

# 7. 보안 업데이트 상태 (Has Chip / PIN Last Changed)
train_data['Security Status'] = ((train_data['Has Chip'] == 'Yes') & (train_data['Year PIN last Changed'].notnull())).astype(int)

# 8. 은퇴까지 남은 기간 (Retirement Age / Current Age)
train_data['Years to Retirement'] = train_data['Retirement Age'] - train_data['Current Age']

# 9. 부채 대비 소득 비율 (Yearly Income / Total Debt)
train_data['Debt to Income Ratio'] = train_data['Total Debt'] / train_data['Yearly Income']

# Merchandise Code와 Price (Amount) 간의 평균 관계 생성
merchandise_avg_price = train_data.groupby('Merchandise Code')['Amount'].mean().reset_index()
merchandise_avg_price.columns = ['Merchandise Code', 'Avg Price per Merchandise Code']

# train_data에 병합하여 새로운 변수 추가
train_data = pd.merge(train_data, merchandise_avg_price, on='Merchandise Code', how='left')


# Is Fraud 컬럼을 포함하여 저장
columns_to_save = [ 'Is Fraud?',    'Current Date', 'Credit Limit Volatility', 
    'Income Change Since Account Open', 'Credit Score Group', 'Chip Used', 'Security Updated', 
    'Credit Score to Limit Ratio', 'Security Status', 'Years to Retirement', 'Debt to Income Ratio', 
    'Avg Price per Merchandise Code'
]
# 새로운 데이터를 train_user.csv로 저장
train_data[columns_to_save].to_csv('train_user.csv', index=False)



KeyError: "['IsFraud', 'Avg Price per Merchandise Code'] not in index"

In [None]:
# 데이터 불러오기
train_data = pd.read_csv('train_user.csv')

# 각 컬럼 간의 상관관계 계산
corr_matrix = train_data.corr()

# 히트맵 시각화
plt.figure(figsize=(12, 8))
sns.heatmap(corr_matrix, annot=True, fmt='.2f', cmap='coolwarm', cbar=True)

# 그래프 제목 추가
plt.title('Feature Correlation Heatmap')
plt.show()