In [12]:
import seaborn as sns
import sweetviz as sv
import pandas as pd
from pandas_profiling import ProfileReport

PydanticImportError: `BaseSettings` has been moved to the `pydantic-settings` package. See https://docs.pydantic.dev/2.8/migration/#basesettings-has-moved-to-pydantic-settings for more details.

For further information visit https://errors.pydantic.dev/2.8/u/import-error

## Sweetviz
- EDA(Exploaratory Data Analysis)도구, 자동으로 빠르게 직관적인 탐색이 가능한 패키지
- 간단하게 확인해야 하는 경우
    - 결측치
    - 타겟에 대한 분석
    - 데이터에 대한 비교 등
    - 빠르고 간결하게 확인

In [8]:
df = sns.load_dataset('titanic')

In [9]:
report = sv.analyze(df)

                                             |          | [  0%]   00:00 -> (? left)

In [10]:
report.show_html('sweetviz_titanic_report.html')

Report sweetviz_titanic_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


0:182: execution error: 일부 대상체 파일을 발견할 수 없습니다. (-43)


In [13]:
df_iris =sns.load_dataset('iris')

In [24]:
profile_iris = sv.analyze(df_iris)

                                             |          | [  0%]   00:00 -> (? left)

In [25]:
report.show_html('sweetviz_iris_report.html')

Report sweetviz_iris_report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


0:179: execution error: 일부 대상체 파일을 발견할 수 없습니다. (-43)


In [26]:
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.model_selection import train_test_split, cross_val_score

## pipeline 패키지
from sklearn.pipeline import Pipeline

In [27]:
#타이타닉 데이터 불러오기
df =sns.load_dataset('titanic')

## 간단한 전처리
X=df.drop(columns=['survived'])
y=df['survived']

## 수치형변수, 범주형 변수 분리
numeric_features = ['age','fare','parch','sibsp']
categorical_features = ['embarked','sex','pclass']

# 파이프라인 구축
#수치형 변수에 대한 전처리 파이프
numeric_transformer =Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler',StandardScaler())
])

##범주형 변수 전처리 파이프라인
categorical_transformer =Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot',OneHotEncoder(handle_unknown='ignore'))
])

# 파이프라인을 통합할 수 있다. 

pre_processor=ColumnTransformer(
    transformers=[
        ('num',numeric_transformer,numeric_features),
        ('cat',categorical_transformer,categorical_features)
    ]
)


## 모델을 여러 개 사용해 보자!

models ={
    'Random Forest' : RandomForestClassifier(random_state=111),
    'SVC' : SVC(random_state=111),
    'KNN' : KNeighborsClassifier(),
    'Logistic Regression' : LogisticRegression(max_iter=1000, random_state=111)
}


## 모델의 성능을 비교해 보자!
## 모델들을 pipeline으로 합칠 수 있다.
## 교차검증까지 추가해서 실제 성능에 대해서 교차검증 결과를 출력 
results = {}
for model_name, model in models.items():
    #파이프라인 구성
    pipeline = Pipeline(steps=[
        ('pre_processor',pre_processor),
        ('models',model)
    ])
    #교차검증 진행
    cv_scores=cross_val_score(pipeline,X,y, cv=5, scoring='accuracy')
    results[model_name] = cv_scores.mean()

print('Model Comparison Results')
for model_name, accuracy in results.items():
    print(model_name, accuracy)


best_model_name =max(results, key=results.get)    
best_model = models[best_model_name]

Model Comparison Results
Random Forest 0.806973824618668
SVC 0.8271608813006089
KNN 0.8058627832527776
Logistic Regression 0.7901387232439896


In [28]:
##파이프라인을 통해 데이터를 train_test 분리해서 학습하고 결과도 출력해 보자!
pipeline = Pipeline(steps=[
        ('pre_processor',pre_processor)
    ,('models',model)])

In [29]:
X_train, X_test, y_train, y_test= train_test_split(X,y, test_size=0.2, random_state=111)

In [30]:
pipeline.fit(X_train, y_train)
y_pred = pipeline.predict(X_test)

In [31]:
y_pred

array([1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0,
       0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0,
       1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0])