## 와인 데이터 분류
- 화이트 와인과 레드 와인을 분류

In [1]:
import pandas as pd

# Load the wine dataset (alcohol, sugar, pH, class columns) from the shortened URL.
wine = pd.read_csv(filepath_or_buffer='https://bit.ly/wine_csv_data')

In [2]:
# Preview the first five rows to check the column layout.
wine.head(n=5)

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [3]:
# Count the samples per class label to see how balanced the two classes are.
wine['class'].value_counts()

1.0    4898
0.0    1599
Name: class, dtype: int64

## 데이터셋 분리


In [7]:
# Independent variables (features): the three measurement columns as a NumPy array.
data = wine.loc[:, ['alcohol', 'sugar', 'pH']].to_numpy()

# Dependent variable (label): the class column as a NumPy array.
target = wine.loc[:, 'class'].to_numpy()

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the samples as the test set.
# The fixed random_state keeps the split reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    target,
    test_size=0.2,
    random_state=42,
)

# Display the shapes of the four resulting arrays.
tuple(arr.shape for arr in (X_train, X_test, y_train, y_test))

((5197, 3), (1300, 3), (5197,), (1300,))

In [9]:
# Split the training portion once more (80/20) to carve out a validation set,
# again with a fixed seed for reproducibility.
# NOTE: X_train / y_train are rebound here, so re-running this cell splits the
# already-reduced training set again.
X_train, X_val, y_train, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
)

# Display the shapes of the new training and validation arrays.
tuple(arr.shape for arr in (X_train, X_val, y_train, y_val))

((4157, 3), (1040, 3), (4157,), (1040,))

## 모형 만들기

In [10]:
from sklearn.tree import DecisionTreeClassifier

# Decision tree with default hyperparameters; the seed makes the fit reproducible.
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X=X_train, y=y_train)


DecisionTreeClassifier(random_state=42)

- 모형 성능 평가

In [12]:
# Accuracy on the data the tree was fitted on (training accuracy).
dt.score(X=X_train, y=y_train)

0.9971133028626413

In [13]:
# Accuracy on the held-out validation set; a large gap to the training
# accuracy indicates that the tree is overfitting.
dt.score(X=X_val, y=y_val)

0.864423076923077

## 교차검증

In [15]:
from sklearn.model_selection import cross_validate

# Cross-validate the tree on the training set with the default fold setup.
# The result is a dict holding per-fold 'fit_time', 'score_time' and 'test_score'.
scores = cross_validate(estimator=dt, X=X_train, y=y_train)
print(scores)

{'fit_time': array([0.00834727, 0.00605869, 0.00593829, 0.00841141, 0.00740552]), 'score_time': array([0.00157142, 0.00070739, 0.00064516, 0.00113273, 0.00101566]), 'test_score': array([0.84975962, 0.85456731, 0.85679904, 0.81588448, 0.84837545])}


In [16]:
import numpy as np

# Average the per-fold test scores into a single cross-validation score.
print(np.asarray(scores['test_score']).mean())

0.8450771776358419


## 구글 드라이브 연동  

In [17]:
# Mount Google Drive inside the Colab runtime so files under /content/drive
# can be read and written by the cells below.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


## 데이터 불러오기

In [37]:
import pandas as pd

# Folder on the mounted Google Drive that holds the Titanic CSV files.
DATA_PATH = '/content/drive/MyDrive/Colab Notebooks/2023/human/titanic'

# Load the train and test tables from that folder.
train_df = pd.read_csv(f'{DATA_PATH}/train.csv')
test_df = pd.read_csv(f'{DATA_PATH}/test.csv')

- 데이터를 확인

In [23]:
# Print column names, non-null counts and dtypes to spot columns with missing values.
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 891 entries, 0 to 890
Data columns (total 12 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PassengerId  891 non-null    int64  
 1   Survived     891 non-null    int64  
 2   Pclass       891 non-null    int64  
 3   Name         891 non-null    object 
 4   Sex          891 non-null    object 
 5   Age          714 non-null    float64
 6   SibSp        891 non-null    int64  
 7   Parch        891 non-null    int64  
 8   Ticket       891 non-null    object 
 9   Fare         891 non-null    float64
 10  Cabin        204 non-null    object 
 11  Embarked     889 non-null    object 
dtypes: float64(2), int64(5), object(5)
memory usage: 83.7+ KB


- 결측치의 비율 확인

In [38]:
# Fraction of missing values per column, largest first.
train_df.isna().sum().div(len(train_df)).sort_values(ascending=False)

Cabin          0.771044
Age            0.198653
Embarked       0.002245
PassengerId    0.000000
Survived       0.000000
Pclass         0.000000
Name           0.000000
Sex            0.000000
SibSp          0.000000
Parch          0.000000
Ticket         0.000000
Fare           0.000000
dtype: float64

In [39]:
# Summary statistics for every column; include='all' also covers the
# non-numeric columns (count / unique / top / freq).
train_df.describe(include='all')

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
count,891.0,891.0,891.0,891,891,714.0,891.0,891.0,891.0,891.0,204,889
unique,,,,891,2,,,,681.0,,147,3
top,,,,"Braund, Mr. Owen Harris",male,,,,347082.0,,B96 B98,S
freq,,,,1,577,,,,7.0,,4,644
mean,446.0,0.383838,2.308642,,,29.699118,0.523008,0.381594,,32.204208,,
std,257.353842,0.486592,0.836071,,,14.526497,1.102743,0.806057,,49.693429,,
min,1.0,0.0,1.0,,,0.42,0.0,0.0,,0.0,,
25%,223.5,0.0,2.0,,,20.125,0.0,0.0,,7.9104,,
50%,446.0,0.0,3.0,,,28.0,0.0,0.0,,14.4542,,
75%,668.5,1.0,3.0,,,38.0,1.0,0.0,,31.0,,


## 데이터 시각화
- 각 컬럼과 종속변수 사이의 관계를 확인
  + 박스플롯, 산점도, 히스토그램 등등 다양한 그래프를 그립니다.
- 탐색적 자료분석 (EDA: Exploratory Data Analysis)

## 모형 개발 코드 작성
- 파이프라인 구축
- pycaret으로 모형을 만들 때도 내부적으로는 파이프라인을 통해 구현됩니다.

In [40]:
from sklearn.pipeline import Pipeline

# Simplest way to fill NA / NULL / missing values: replace them with one statistic.
from sklearn.impute import SimpleImputer

# Numeric scaling: MinMaxScaler maps each feature onto the 0~1 range.
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer

num_features = ['Age', 'Fare', 'SibSp', 'Parch']
cat_features = ['Embarked', 'Sex', 'Pclass']
rem_features = ['Name', 'Cabin', 'Ticket', 'PassengerId']

# Build one transformer per data type.
# Numeric columns: missing values could be filled with the mean or the median;
# the median is used here, followed by min-max scaling.
num_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', MinMaxScaler()),
])

# Categorical columns: fill missing values with the most frequent category,
# then one-hot encode.
# handle_unknown='ignore' encodes a category that was not seen during fit as
# all zeros instead of raising an error at transform / predict time.
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Preprocessing pipeline: route each column group to its transformer.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric_features', num_transformer, num_features),  # numeric data
        ('category_features', cat_transformer, cat_features),  # categorical data
        ('remove_features', 'drop', rem_features),  # drop unneeded columns
    ]
)



### 모델 생성

In [41]:
# 독립변수, 종속변수 
target = train_df['Survived'].copy()
train_df = train_df.drop(['Survived'], axis = 1)

train_df.shape, target.shape

((891, 11), (891,))

In [42]:
# 로지스틱 회귀분석 (-- 딥러닝 교육할 때 다시 배울 예정)
from sklearn.linear_model import LogisticRegression 

# 모델 파이프라인 연결
model = Pipeline([
    ('preprocessor', preprocessor), 
    ('clf', LogisticRegression(solver='liblinear'))
])

model.fit(train_df, target)

Pipeline(steps=[('preprocessor',
                 ColumnTransformer(transformers=[('numeric_features',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='median')),
                                                                  ('scaler',
                                                                   MinMaxScaler())]),
                                                  ['Age', 'Fare', 'SibSp',
                                                   'Parch']),
                                                 ('category_features',
                                                  Pipeline(steps=[('imputer',
                                                                   SimpleImputer(strategy='most_frequent')),
                                                                  ('onehot',
                                                                   OneHotEncoder()

- 정확도

In [44]:
# Accuracy in percent (two decimals) on the same data the model was fitted on,
# i.e. a training accuracy rather than a held-out estimate.
accuray_score = round(100 * model.score(train_df, target), 2)
accuray_score

80.02

### 모형 예측 및 csv 파일 변환

In [49]:
# Predict survival for the test passengers.
y_pred = model.predict(test_df)

# Two-column submission table: PassengerId plus the predicted Survived label.
submission = test_df[['PassengerId']].assign(Survived=y_pred)

# Write the table next to the input data, without the index column.
submission.to_csv(f"{DATA_PATH}/submission.csv", index=False)