In [None]:
import pandas as pd
import numpy as np
import random
from tqdm import tqdm

from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder
from sklearn.model_selection import train_test_split

import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader, Dataset
import torchvision.datasets as ds
import torchvision.transforms as transforms

import lightning as L
from lightning.pytorch.trainer import Trainer
from lightning.pytorch.callbacks.early_stopping import EarlyStopping

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.express as px

import warnings

In [None]:
## Silence library warnings for the rest of the notebook
warnings.filterwarnings('ignore')

## Load the train and test data (files are cp949-encoded)
train_ = pd.read_csv('./data/train.csv', encoding='cp949')
test_ = pd.read_csv('./data/test.csv', encoding='cp949')

## Drop rows with missing values from the training data only.
## `train_` keeps the raw frame; `test` is just another name for `test_`.
## (An earlier alternative, kept here as a note, was to fill MonthlyRevenue /
## MonthlyMinutes with 0 instead of dropping rows.)
train = train_.dropna()
test = test_

## Number of rows
display(len(train))
## Column names
display(train.columns.unique())
## Class counts of the Churn column
display(train.Churn.value_counts())
## Missing values per column
display(train.isnull().sum())
## Number of duplicated rows
display(train.duplicated().sum())
## Summary statistics
display(train.describe())
## Frame info; `info()` prints by itself and returns None, so it is not
## wrapped in display()
train.info()

## Collect the columns whose dtype is object.
## Iterating over (name, dtype) pairs avoids integer-position lookups on the
## label-indexed `dtypes` Series, which pandas has deprecated.
objects = [column for column, dtype in train.dtypes.items() if dtype == object]
print(objects)

# Columns that convert_to_category casts to the pandas `category` dtype.
_CATEGORICAL_COLUMNS = (
    'Churn',
    'ServiceArea',
    'ChildrenInHH',
    'HandsetRefurbished',
    'HandsetWebCapable',
    'TruckOwner',
    'RVOwner',
    'Homeownership',
    'BuysViaMailOrder',
    'RespondsToMailOffers',
    'OptOutMailings',
    'NonUSTravel',
    'OwnsComputer',
    'NewCellphoneUser',
    'NotNewCellphoneUser',
    'OwnsMotorcycle',
    'HandsetPrice',
    'MadeCallToRetentionTeam',
    'CreditRating',
    'PrizmCode',
    'Occupation',
    'MaritalStatus',
)


def convert_to_category(dfs: pd.DataFrame) -> pd.DataFrame:
    """Cast the known categorical columns of *dfs* to the ``category`` dtype.

    The frame is modified in place and also returned, so both
    ``convert_to_category(df)`` and ``df = convert_to_category(df)`` work.

    Args:
        dfs: Frame holding some or all of the columns listed in
            ``_CATEGORICAL_COLUMNS``. Columns that are absent (for example
            ``Churn`` in an unlabeled frame) are skipped instead of raising.

    Returns:
        The same frame object, with the present columns converted.
    """
    for column in _CATEGORICAL_COLUMNS:
        # Skip columns the frame does not have so partial frames are accepted.
        if column in dfs.columns:
            dfs[column] = dfs[column].astype('category')

    return dfs

def convert_category_into_integer(df: pd.DataFrame, columns: list) -> tuple:
    """Label-encode the given columns of *df* as integer codes.

    A separate ``LabelEncoder`` is fitted per column. The frame is modified
    in place and also returned.

    Args:
        df: Frame whose columns are to be encoded.
        columns: Names of the columns to encode.

    Returns:
        A ``(df, label_encoders)`` pair, where ``label_encoders`` maps each
        column name to the encoder fitted on that column.
    """
    label_encoders = {}
    for column in columns:
        label_encoder = LabelEncoder()
        # Replace the whole column rather than writing through
        # ``df.loc[:, column]``: the ``.loc`` form tries to keep the column's
        # existing dtype (object / category), which newer pandas versions
        # warn about or reject when the new values are integer codes.
        df[column] = label_encoder.fit_transform(df[column])
        label_encoders[column] = label_encoder

    return df, label_encoders

## Cast the known categorical columns to the pandas category dtype
train = convert_to_category(train)
test = convert_to_category(test)

## Integer-encode the columns that had object dtype when the data was loaded
# NOTE(review): train and test are encoded with independently fitted encoders,
# so if the two frames contain different label values, the same label can map
# to different integer codes in each - confirm this is acceptable.
train, _ = convert_category_into_integer(train, objects)
test, _ = convert_category_into_integer(test, objects)

## Name of the target variable
# NOTE(review): the column is accessed as `Churn` elsewhere in this file;
# confirm the intended casing before using `target` as a column label.
target = 'churn'

## Split the loaded test frame into validation and test halves
seed = 0
valid, test = train_test_split(test, test_size=0.5, random_state=seed)
display(len(train), len(valid), len(test))

## Feature scaling
# The scaler is fitted on the training data only; validation and test are
# transformed with those same statistics so all three splits share one scale.
# NOTE(review): every column is scaled, including the encoded Churn column.
standard_scaler = StandardScaler()
train = standard_scaler.fit_transform(train)
valid = standard_scaler.transform(valid)
test = standard_scaler.transform(test)

# fit_transform returns an array; rebuild a frame using the raw train columns
train = pd.DataFrame(train, columns=train_.columns)
# df_train.corr()


