# In [None]:
import torch
import torch.nn
import pandas
import numpy
import torch.nn.functional
from torch.autograd import Variable
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import LabelEncoder
## Prepare the data set
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists.
temp_data = pandas.read_csv(
    'D:/Data study/data analysis project/lending club/current_out_all_clean_data.csv'
)

# Notebook-style inspection: the value is discarded when run as a script
# (it still raises KeyError if the column is absent).
temp_data['purpose']

# Regression target: principal received so far divided by the funded amount.
y_temp = temp_data['total_rec_prncp'].div(temp_data['funded_amnt'])


def clean_data(x):
    """Replace non-finite entries of ``x`` with 0, in place.

    Positive infinity, negative infinity and NaN are all overwritten with 0.
    Such values appear, for example, when a ratio is computed with a zero
    denominator.

    Args:
        x: A pandas object supporting ``replace(..., inplace=True)`` (the
            caller in this file passes a Series). It is modified in place.

    Returns:
        None. The result is visible through ``x`` itself.
    """
    # ``numpy.nan`` is used instead of the ``numpy.NAN`` alias, which was
    # removed in NumPy 2.0.  ``-inf`` is cleaned as well so that no
    # non-finite value survives.
    x.replace([numpy.inf, -numpy.inf, numpy.nan], 0, inplace=True)

# Zero out the non-finite target values in place.
clean_data(y_temp)

# 'revol_util' is stored as text with a percent sign; strip the sign and
# convert the column to float.
temp_data['revol_util'] = (
    temp_data['revol_util']
    .str.replace('%', '')
    .astype(float)
)

# Drop the 'id' column from the frame.
temp_data.drop(columns='id', inplace=True)
# Names of the columns used as model inputs, one per line.
# NOTE(review): `total_rec_prncp` is also the numerator of the target
# `y_temp` computed earlier in this file -- possible target leakage; confirm
# that keeping it (and the other payment columns) as a feature is intended.
select_feature = '''
annual_inc
inq_last_6mths
sub_grade
installment
tot_cur_bal
avg_cur_bal
mo_sin_old_rev_tl_op
mo_sin_rcnt_rev_tl_op
mo_sin_rcnt_tl
mort_acc
num_il_tl
num_tl_120dpd_2m
num_tl_op_past_12m
revol_bal
total_bc_limit
dti
out_prncp
out_prncp_inv
total_pymnt
total_pymnt_inv
total_rec_prncp
total_rec_int
total_rec_late_fee
tot_hi_cred_lim
loan_amnt
term
emp_length
home_ownership
purpose
delinq_2yrs
revol_util
application_type
open_act_il
open_rv_24m
max_bal_bc
all_util
acc_open_past_24mths
num_op_rev_tl
num_rev_accts
num_tl_90g_dpd_24m
pct_tl_nvr_dlq
last_fico_range_high
last_fico_range_low
verification_status
open_acc
pub_rec_bankruptcies
pub_rec
tax_liens
chargeoff_within_12_mths
'''
# `emp_length` used to be listed twice, which selected the column twice and
# duplicated its encoded features.  dict.fromkeys keeps the first occurrence
# of every name in order, so an accidental repeat can no longer produce a
# duplicated column.
select_feature = list(dict.fromkeys(select_feature.split()))
x_temp = temp_data[select_feature]




def splitset(x, y):
    """Split features and target into train, validation and test parts.

    A first split holds out 20% of the rows as the test set; a second split
    takes 25% of the remainder as the validation set.  Both splits use
    ``random_state=42``.

    Args:
        x: Feature table accepted by ``train_test_split``.
        y: Target aligned with ``x``.

    Returns:
        A 6-tuple ``(x_train, y_train, x_valid, y_valid, x_test, y_test)``,
        each with its index reset to start from 0.
    """
    x_rest, x_test, y_rest, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )
    x_fit, x_val, y_fit, y_val = train_test_split(
        x_rest, y_rest, test_size=0.25, random_state=42
    )
    ordered_parts = (x_fit, y_fit, x_val, y_val, x_test, y_test)
    return tuple(part.reset_index(drop=True) for part in ordered_parts)

# Raw (not yet scaled/encoded) train, validation and test partitions.
(
    bx_train, y_train,
    bx_valid, y_valid,
    bx_test, y_test,
) = splitset(x_temp, y_temp)

## scaling, encoding
def onehot_train_valid(x_train, x_valid):
    """One-hot encode the object-dtype columns of a train/validation pair.

    For every object-dtype column of ``x_train`` an encoder is fitted on the
    training values only and then applied to the column at the same position
    among the object-dtype columns of ``x_valid``.

    A category that occurs in ``x_valid`` but was never seen in ``x_train``
    is encoded as an all-zero row (``handle_unknown='ignore'``) instead of
    raising during ``transform``.

    Args:
        x_train: Training feature DataFrame.
        x_valid: Validation feature DataFrame with the same object-dtype
            columns in the same order.

    Returns:
        ``(train_encoded, valid_encoded)``: two integer 0/1 DataFrames with a
        fresh 0-based index.  Columns are named ``str(category)``; the
        per-feature groups appear in reverse feature order (last feature
        first).  Two empty DataFrames are returned when there is no
        object-dtype column.

    Raises:
        IndexError: If ``x_valid`` has fewer object-dtype columns than
            ``x_train``.
    """
    cat_train = x_train.select_dtypes(include='object')
    cat_valid = x_valid.select_dtypes(include='object')

    train_blocks = []
    valid_blocks = []
    # items() walks the columns by position, so this also works when two
    # columns share a label.
    for position, (_, train_col) in enumerate(cat_train.items()):
        valid_col = cat_valid.iloc[:, position]

        encoder = OneHotEncoder(handle_unknown='ignore')
        encoder.fit(train_col.values.reshape(-1, 1))
        labels = [str(category) for category in encoder.categories_[0]]

        train_codes = encoder.transform(train_col.values.reshape(-1, 1))
        valid_codes = encoder.transform(valid_col.values.reshape(-1, 1))
        train_blocks.append(
            pandas.DataFrame(train_codes.toarray().astype(int), columns=labels)
        )
        valid_blocks.append(
            pandas.DataFrame(valid_codes.toarray().astype(int), columns=labels)
        )

    if not train_blocks:
        # Nothing to encode; pandas.concat would reject an empty list.
        return pandas.DataFrame(), pandas.DataFrame()

    # Each new group goes in front of the previous ones, i.e. the groups end
    # up in reverse feature order.  Concatenate once instead of once per column.
    train_final = pandas.concat(train_blocks[::-1], axis=1)
    valid_final = pandas.concat(valid_blocks[::-1], axis=1)
    return train_final.reset_index(drop=True), valid_final.reset_index(drop=True)

def stsc_train_valid(x_train, x_valid):
    """Standardise the non-object columns of a train/validation pair.

    A ``StandardScaler`` is fitted on the non-object columns of ``x_train``
    and then applied to both ``x_train`` and ``x_valid``.

    Args:
        x_train: Training feature DataFrame.
        x_valid: Validation feature DataFrame.

    Returns:
        ``(train_scaled, valid_scaled)``: DataFrames holding the scaled
        values, labelled with the non-object column names of ``x_train`` and
        indexed from 0.
    """
    numeric_train = x_train.select_dtypes(exclude='object')
    numeric_valid = x_valid.select_dtypes(exclude='object')
    column_names = numeric_train.columns
    width = len(column_names)

    scaler = StandardScaler()
    # Fit on the training rows only, then reuse the fitted scaler for the
    # validation rows.
    scaled_train = scaler.fit_transform(
        numpy.array(numeric_train).reshape(-1, width)
    )
    scaled_valid = scaler.transform(
        numpy.array(numeric_valid).reshape(-1, width)
    )

    train_frame = pandas.DataFrame(scaled_train, columns=column_names)
    valid_frame = pandas.DataFrame(scaled_valid, columns=column_names)
    return train_frame.reset_index(drop=True), valid_frame.reset_index(drop=True)

# Encode the categorical columns and standardise the remaining ones; both
# transformations are fitted on the training partition only.
one_train, one_valid = onehot_train_valid(x_train=bx_train, x_valid=bx_valid)
stsc_train, stsc_valid = stsc_train_valid(bx_train, bx_valid)

# Final design matrices: scaled columns first, one-hot columns after.
x_train = pandas.concat([stsc_train, one_train], axis=1)
x_valid = pandas.concat([stsc_valid, one_valid], axis=1)


# Convert to float32 tensors
x_train_tensor = torch.tensor(x_train.values, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32)

x_valid_tensor = torch.tensor(x_valid.values, dtype=torch.float32)
y_valid_tensor = torch.tensor(y_valid.values, dtype=torch.float32)

train_dataset = torch.utils.data.TensorDataset(x_train_tensor, y_train_tensor)
valid_dataset = torch.utils.data.TensorDataset(x_valid_tensor, y_valid_tensor)


# Hand the datasets to data loaders.
# The training loader reshuffles every epoch so the optimizer does not see
# the mini-batches in the same fixed order each time (DataLoader defaults to
# shuffle=False).  Validation order does not matter, so it stays unshuffled.
train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=100, shuffle=True)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=100)



## Neural network definition
# Number of input features = number of columns of the training matrix.
n_feature = len(x_train.columns)
class LCDNN(torch.nn.Module):
    """Fully connected regression network with a single output.

    Architecture: ``in_features -> 32 -> ReLU -> Dropout(0.25) -> 16 -> ReLU
    -> 1``.

    Args:
        in_features: Width of one input row.  When omitted, the module-level
            ``n_feature`` is used, so ``LCDNN()`` keeps working as before.
    """

    def __init__(self, in_features=None):
        super().__init__()
        if in_features is None:
            # Backward-compatible default: fall back to the global set by the
            # surrounding script.
            in_features = n_feature
        # Hidden layer
        self.fc1 = torch.nn.Linear(in_features=in_features, out_features=32, bias=True)
        # Dropout zeroes a fraction p of the values and scales the surviving
        # ones by 1 / (1 - p).
        self.drop = torch.nn.Dropout(0.25)
        self.fc2 = torch.nn.Linear(in_features=32, out_features=16, bias=True)
        self.fc3 = torch.nn.Linear(in_features=16, out_features=1, bias=True)

    def forward(self, input_data):
        """Return the prediction for ``input_data`` as a ``[rows, 1]`` tensor.

        The input is first reshaped to ``[-1, in_features]``.
        """
        # Use the layer's own width rather than a global, so the reshape
        # always agrees with the weights of fc1.
        out = input_data.view(-1, self.fc1.in_features)
        out = torch.nn.functional.relu(self.fc1(out))
        out = self.drop(out)
        out = torch.nn.functional.relu(self.fc2(out))
        out = self.fc3(out)
        return out

# backward() is normally not written by hand.

# GPU setup: use the first CUDA device when available, otherwise the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")

## Parameter definitions
learning_rate = 0.001
model = LCDNN().to(device)

criterion = torch.nn.MSELoss()
# Original note on this line: "Adamgrid!!"
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)



## optimize
num_epochs = 5  # TODO: look into early stopping
count = 0
loss_list, iteration_list = [], []

predictions_list, labels_list = [], []


#
# # train
# for epoch in range(num_epochs):
#     for feature, labels in train_loader :
#         feature, labels = feature.to(device), labels.to(device)
#         train=Variable(feature.view(feature.size(0) ,-1))
#         labels=Variable(labels)
#         outputs=model(train)
#         loss=criterion(outputs,labels)
#         optimizer.zero_grad()
#         loss.backward()
#         optimizer.step()
#         count +=1
#
#         if count % 500 == 0:
#             model.eval()  # Set the model to evaluation mode
#             total_loss = 0
#             total_mse = 0.0
#             with torch.no_grad():
#                 for inputs, targets in valid_loader:
#                     inputs, targets = inputs.to(device), targets.to(device)
#                     test = Variable(inputs.view(inputs.size(0), -1))  # Flatten the input
#                     outputs = model(test)
#                     loss = criterion(outputs, targets)
#                     total_loss += loss.item()
#                     mse = torch.nn.functional.mse_loss(outputs, targets)
#                     total_mse += mse.item()
#
#             average_loss = total_loss / len(valid_loader)
#             average_mse=total_mse/len(valid_loader)
#             print(f"Iteration: {count}, Validation Loss: {average_loss},Average_mse: {average_mse}")



count = 0  # Number of optimizer steps taken so far

# Guards the MAPE computation against division by (near-)zero targets.
epsilon = 1e-8

for epoch in range(num_epochs):
    model.train()  # Training mode for the batches of this epoch
    for feature, labels in train_loader:
        feature, labels = feature.to(device), labels.to(device)
        train = feature.view(feature.size(0), -1)  # Flatten each sample to one row
        outputs = model(train)

        # Make outputs and labels the same size: [batch_size, 1] -> [batch_size]
        if outputs.size(1) == 1:
            outputs = outputs.squeeze(1)

        loss = criterion(outputs, labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        count += 1

        # Every 500 optimizer steps, report metrics on the validation set.
        if count % 500 == 0:
            model.eval()  # Evaluation mode while validating
            total_loss = 0
            total_mse = 0.0
            total_mape = 0.0
            total_r2 = 0.0
            num_samples_mape = 0  # Batches that contributed a MAPE value
            num_batches_r2 = 0  # Batches that contributed an R² value

            with torch.no_grad():
                for inputs, targets in valid_loader:
                    inputs, targets = inputs.to(device), targets.to(device)
                    test = inputs.view(inputs.size(0), -1)  # Flatten the input
                    outputs = model(test)

                    # [batch_size, 1] -> [batch_size], as in training
                    if outputs.size(1) == 1:
                        outputs = outputs.squeeze(1)

                    loss = criterion(outputs, targets)
                    total_loss += loss.item()

                    # MSE
                    mse = torch.nn.functional.mse_loss(outputs, targets)
                    total_mse += mse.item()

                    # MAPE, computed only over targets that are not ~0
                    non_zero_targets = torch.abs(targets) > epsilon
                    if torch.sum(non_zero_targets) > 0:
                        mape = torch.mean(torch.abs((targets[non_zero_targets] - outputs[non_zero_targets]) /
                                                     (targets[non_zero_targets] + epsilon))) * 100
                        total_mape += mape.item()
                        num_samples_mape += 1

                    # R² score.  A batch whose targets are all equal has
                    # ss_tot == 0 and no defined R²; skip it (as MAPE skips
                    # its invalid batches) instead of letting a single NaN
                    # poison the running total.
                    y_mean = torch.mean(targets)
                    ss_tot = torch.sum((targets - y_mean) ** 2)
                    if ss_tot.item() != 0:
                        ss_res = torch.sum((targets - outputs) ** 2)
                        total_r2 += (1 - ss_res / ss_tot).item()
                        num_batches_r2 += 1

            # Switch back to training mode.  Without this, every remaining
            # batch of the epoch would be trained in evaluation mode (e.g.
            # with dropout layers inactive).
            model.train()

            num_batches = len(valid_loader)
            average_loss = total_loss / num_batches if num_batches > 0 else float('nan')
            average_mse = total_mse / num_batches if num_batches > 0 else float('nan')
            average_mape = total_mape / num_samples_mape if num_samples_mape > 0 else float('nan')
            average_r2 = total_r2 / num_batches_r2 if num_batches_r2 > 0 else float('nan')

            print(f"Iteration: {count}, Validation Loss: {average_loss:.4f}, Average MSE: {average_mse:.4f}, "
                  f"Average MAPE: {average_mape:.4f}%, Average R²: {average_r2:.4f}")
# Notebook-style inspection: the returned generator is discarded when run as a script.
model.parameters()