# 데이터 불러오기

In [1]:
import sklearn.datasets as sd
from sklearn.model_selection import train_test_split
import pandas as pd

### 아이리스 데이터

In [2]:
iris = sd.load_iris()

In [3]:
# Build the iris feature table and attach the class label as a 'target' column.
iris_df = pd.DataFrame(iris.data, columns=iris.feature_names).assign(
    target=iris.target
)
# Show the first rows as the cell output.
iris_df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [4]:
iris_df.to_csv('static/upload/iris.csv',index=False)

In [6]:
# Stratified 75/25 split of the iris table. The frame passed as X still holds
# the 'target' column, so X_train / X_test carry the label next to the features.
X_train, X_test, y_train, y_test = train_test_split(
    iris_df,
    iris_df['target'],
    test_size=0.25,
    stratify=iris.target,
    random_state=2021,
)

In [8]:
X_train.to_csv('static/data/iris_train.csv', index=False)
X_test.to_csv('static/data/iris_test.csv', index=False)

### 와인 데이터

In [9]:
# Load the wine dataset into a DataFrame with the class label in 'target'.
wine = sd.load_wine()
wine_df = pd.DataFrame(wine.data, columns=wine.feature_names).assign(
    target=wine.target
)

# Stratified 75/25 split; the label column stays inside X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    wine_df,
    wine_df['target'],
    test_size=0.25,
    stratify=wine.target,
    random_state=2021,
)

# Export both splits without the pandas index.
X_train.to_csv('static/data/wine_train.csv', index=False)
X_test.to_csv('static/data/wine_test.csv', index=False)

### 보스턴 데이터

In [14]:
# Load the Boston housing data into a DataFrame with the regression target in
# 'target'.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in
# 1.2, so this cell only runs on older scikit-learn versions.
boston = sd.load_boston()
boston_df = pd.DataFrame(boston.data, columns=boston.feature_names).assign(
    target=boston.target
)

# Plain (non-stratified) 75/25 split; 'target' stays inside X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    boston_df,
    boston_df['target'],
    test_size=0.25,
    random_state=2021,
)

# Export both splits without the pandas index.
X_train.to_csv('static/data/boston_train.csv', index=False)
X_test.to_csv('static/data/boston_test.csv', index=False)

### 당뇨병 데이터

In [2]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
import pandas as pd
import joblib
# Load the raw diabetes table that the steps below clean, split and export.
diabetes_data = pd.read_csv('static/data/diabetes.csv')

# Columns whose 0 entries are replaced by the column mean.
zero_features = ['Glucose', 'BloodPressure', 'SkinThickness', 'Insulin', 'BMI']

# Replace every 0 in those columns with the mean of the same column. The mean
# is taken over the whole file before splitting, zeros included.
diabetes_data[zero_features] = diabetes_data[zero_features].replace(
    0, diabetes_data[zero_features].mean()
)

# X is the complete table, label included, so the exported CSVs keep the
# Outcome column. y is the last column (Outcome) and drives stratification.
X = diabetes_data.iloc[:, :]
y = diabetes_data.iloc[:, -1]

# X is already a DataFrame, so it is used directly as the frame to split.
diabetes_df = X
X_train, X_test, y_train, y_test = train_test_split(
    diabetes_df, y, test_size=0.25, random_state=2021, stratify=y
)

# NOTE(review): the scaler is fit on every column of X_train, which includes
# Outcome - confirm that the code loading diabetes_scale.pkl passes the same
# columns, or fit on the feature columns only.
scale = StandardScaler()
scale.fit(X_train)
joblib.dump(scale, 'static/model/diabetes_scale.pkl')

# Export both splits without the pandas index.
X_train.to_csv('static/data/diabetes_train.csv', index=False)
X_test.to_csv('static/data/diabetes_test.csv', index=False)

In [3]:
diabetes_data

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148.0,72.0,35.000000,79.799479,33.6,0.627,50,1
1,1,85.0,66.0,29.000000,79.799479,26.6,0.351,31,0
2,8,183.0,64.0,20.536458,79.799479,23.3,0.672,32,1
3,1,89.0,66.0,23.000000,94.000000,28.1,0.167,21,0
4,0,137.0,40.0,35.000000,168.000000,43.1,2.288,33,1
...,...,...,...,...,...,...,...,...,...
763,10,101.0,76.0,48.000000,180.000000,32.9,0.171,63,0
764,2,122.0,70.0,27.000000,79.799479,36.8,0.340,27,0
765,5,121.0,72.0,23.000000,112.000000,26.2,0.245,30,0
766,1,126.0,60.0,20.536458,79.799479,30.1,0.349,47,1


In [4]:
diabetes_data.iloc[:, -1]

0      1
1      0
2      1
3      0
4      1
      ..
763    0
764    0
765    0
766    1
767    0
Name: Outcome, Length: 768, dtype: int64

In [5]:
X_train

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
51,1,101.0,50.000000,15.000000,36.000000,24.200000,0.526,26,0
435,0,141.0,69.105469,20.536458,79.799479,42.400000,0.205,29,1
16,0,118.0,84.000000,47.000000,230.000000,45.800000,0.551,31,1
148,5,147.0,78.000000,20.536458,79.799479,33.700000,0.218,65,0
246,10,122.0,68.000000,20.536458,79.799479,31.200000,0.258,41,0
...,...,...,...,...,...,...,...,...,...
494,3,80.0,69.105469,20.536458,79.799479,31.992578,0.174,22,0
9,8,125.0,96.000000,20.536458,79.799479,31.992578,0.232,54,1
677,0,93.0,60.000000,20.536458,79.799479,35.300000,0.263,25,0
551,3,84.0,68.000000,30.000000,106.000000,31.900000,0.591,25,0


In [6]:
y_train

51     0
435    1
16     1
148    0
246    0
      ..
494    0
9      1
677    0
551    0
432    0
Name: Outcome, Length: 576, dtype: int64

In [62]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 192 entries, 41 to 136
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               192 non-null    int64  
 1   Glucose                   192 non-null    float64
 2   BloodPressure             192 non-null    float64
 3   SkinThickness             192 non-null    float64
 4   Insulin                   192 non-null    float64
 5   BMI                       192 non-null    float64
 6   DiabetesPedigreeFunction  192 non-null    float64
 7   Age                       192 non-null    int64  
 8   Outcome                   192 non-null    int64  
dtypes: float64(6), int64(3)
memory usage: 15.0 KB


### 유방암 데이터

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import StandardScaler
# Load the breast-cancer dataset into a DataFrame with the label in 'target'.
breast = load_breast_cancer()
breast_df = pd.DataFrame(breast.data, columns=breast.feature_names).assign(
    target=breast.target
)

# Plain (non-stratified) 75/25 split; 'target' stays inside X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    breast_df,
    breast_df['target'],
    test_size=0.25,
    random_state=2021,
)

# Export both splits without the pandas index.
X_train.to_csv('static/data/breast_train.csv', index=False)
X_test.to_csv('static/data/breast_test.csv', index=False)

In [3]:
# Fit a StandardScaler on the current training split.
# NOTE(review): X_train was split from breast_df after 'target' was added, so
# the label column appears to be part of the fit - confirm that the code
# loading the saved scaler passes the same set of columns.
scale = StandardScaler()
scale.fit(X_train)

StandardScaler()

In [4]:
import joblib

In [5]:
joblib.dump(scale, 'static/model/breast_scale.pkl')

['static/model/breast_scale.pkl']

### Digits

In [10]:
# Load the digits dataset into a DataFrame (one column per pixel) with the
# digit label in 'target'.
digits = sd.load_digits()
df = pd.DataFrame(digits.data, columns=digits.feature_names).assign(
    target=digits.target
)

# Stratified 75/25 split; 'target' stays inside X_train / X_test.
X_train, X_test, y_train, y_test = train_test_split(
    df,
    digits.target,
    test_size=0.25,
    stratify=digits.target,
    random_state=2021,
)

In [11]:
# Move the original row labels of the test split into a regular 'index'
# column (in place), so they are written to the CSV as data.
X_test.reset_index(inplace=True)
X_test.to_csv('static/data/digits_test.csv', index=False)
# Read the file back to inspect it. This rebinds `df`, which until now held
# the full digits table.
df = pd.read_csv('static/data/digits_test.csv')
df.head()

Unnamed: 0,index,pixel_0_0,pixel_0_1,pixel_0_2,pixel_0_3,pixel_0_4,pixel_0_5,pixel_0_6,pixel_0_7,pixel_1_0,...,pixel_6_7,pixel_7_0,pixel_7_1,pixel_7_2,pixel_7_3,pixel_7_4,pixel_7_5,pixel_7_6,pixel_7_7,target
0,667,0.0,0.0,4.0,16.0,15.0,4.0,0.0,0.0,0.0,...,0.0,0.0,0.0,3.0,12.0,14.0,11.0,0.0,0.0,1
1,971,0.0,0.0,5.0,15.0,14.0,3.0,0.0,0.0,0.0,...,1.0,0.0,0.0,4.0,11.0,13.0,16.0,11.0,0.0,9
2,914,0.0,0.0,7.0,14.0,9.0,0.0,0.0,0.0,0.0,...,3.0,0.0,0.0,5.0,15.0,16.0,16.0,12.0,1.0,9
3,1702,0.0,0.0,7.0,11.0,13.0,8.0,1.0,0.0,0.0,...,0.0,0.0,1.0,10.0,14.0,2.0,0.0,0.0,0.0,5
4,1356,0.0,0.0,0.0,12.0,14.0,5.0,0.0,0.0,0.0,...,0.0,0.0,0.0,1.0,10.0,14.0,15.0,2.0,0.0,9


In [12]:
# Export the digits splits without the pandas index.
# NOTE(review): digits_test.csv is also written by an earlier cell from the
# same X_test, so the second line looks redundant. Only X_test had
# reset_index applied, so the train file has no 'index' column - confirm
# that the asymmetry is intended.
X_train.to_csv('static/data/digits_train.csv', index=False)
X_test.to_csv('static/data/digits_test.csv', index=False)

In [71]:
titanic_df = pd.read_csv('../../Machine-Learning/00.data/titanic/train.csv')

In [72]:
# Stack the two Titanic frames into one table with a fresh 0..n-1 index.
# NOTE(review): titanic_train / titanic_test are not assigned anywhere before
# this cell in the notebook as saved; the only visible assignment is a later
# cell that derives them from titanic_df itself. The cell that originally
# loaded them probably needs to be restored - confirm.
titanic_df = pd.concat([titanic_train, titanic_test], ignore_index=True)

# Fill missing values by assigning the result back to the column. Calling
# fillna(inplace=True) on titanic_df[col] acts on an intermediate object: it
# is deprecated in pandas 2.x and leaves the frame untouched when
# Copy-on-Write is enabled.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')

In [73]:
titanic_df['Cabin']

0       3
1       7
2       7
3       7
4       7
       ..
1304    7
1305    7
1306    2
1307    7
1308    0
Name: Cabin, Length: 1309, dtype: int64

In [69]:
# Keep only the first character of each Cabin value, then label-encode the
# three categorical columns in place.
# NOTE: in the recorded run this cell raised
# "AttributeError: Can only use .str accessor with string values!" - the
# Cabin column shown just before it was already int64, so the cell appears to
# have been re-run on data that was already encoded. Because the import sits
# after the failing line, it is skipped when that line raises.
titanic_df['Cabin'] = titanic_df['Cabin'].str[:1]
from sklearn.preprocessing import LabelEncoder

features = ['Cabin', 'Sex', 'Embarked']
for feature in features:
    # A fresh encoder per column: each column gets its own label mapping.
    le = LabelEncoder()
    titanic_df[feature] = le.fit_transform(titanic_df[feature])

titanic_df.head(3)

AttributeError: Can only use .str accessor with string values!

In [48]:
# Missing-value handling
def fillna(df):
    """Fill missing values in the Titanic columns of ``df`` and return it.

    ``Age`` is filled with the column mean, ``Cabin`` and ``Embarked`` with the
    placeholder ``'N'``, and ``Fare`` with ``0``. The frame is modified in
    place and the same object is returned.

    Raises:
        KeyError: if any of the four columns is missing from ``df``.
    """
    # Assign the filled column back instead of calling fillna(inplace=True) on
    # df[col]: the latter acts on an intermediate object, is deprecated in
    # pandas 2.x, and leaves df untouched when Copy-on-Write is enabled.
    df['Age'] = df['Age'].fillna(df['Age'].mean())
    df['Cabin'] = df['Cabin'].fillna('N')
    df['Embarked'] = df['Embarked'].fillna('N')
    df['Fare'] = df['Fare'].fillna(0)
    return df

# Remove attributes the machine-learning algorithm does not need
def drop_features(df):
    """Drop the PassengerId, Name and Ticket columns from ``df`` in place.

    Returns the same (mutated) frame. Raises ``KeyError`` if any of the three
    columns is absent.
    """
    unused_columns = ['PassengerId', 'Name', 'Ticket']
    df.drop(columns=unused_columns, inplace=True)
    return df

# Apply label encoding
def format_features(df):
    """Label-encode the Cabin, Sex and Embarked columns of ``df`` in place.

    Cabin is first cut down to its first character. Each column is then
    replaced by the integer codes from its own ``LabelEncoder``. Returns the
    same (mutated) frame.
    """
    df['Cabin'] = df['Cabin'].str.slice(stop=1)
    for column in ('Cabin', 'Sex', 'Embarked'):
        # One encoder per column, so the mappings are independent.
        encoder = LabelEncoder()
        df[column] = encoder.fit_transform(df[column])
    return df

# Run the preprocessing helpers defined in this cell, in order
def transform_features(df):
    """Apply fillna, drop_features and format_features to ``df``, in that order.

    Returns whatever the last step returns.
    """
    return format_features(drop_features(fillna(df)))

In [49]:
# Fill missing values by assigning the result back to the column. Calling
# fillna(inplace=True) on titanic_df[col] acts on an intermediate object: it
# is deprecated in pandas 2.x and leaves the frame untouched when
# Copy-on-Write is enabled.
titanic_df['Age'] = titanic_df['Age'].fillna(titanic_df['Age'].mean())
titanic_df['Cabin'] = titanic_df['Cabin'].fillna('N')
titanic_df['Embarked'] = titanic_df['Embarked'].fillna('N')
# Total number of NaNs left in the frame; columns not filled above still count.
print('데이터 세트 Null 값 갯수 ', titanic_df.isnull().sum().sum())

데이터 세트 Null 값 갯수  419


In [50]:
# Derive an age-bucket label for each passenger from the numeric Age.
# NOTE(review): get_category is not defined in the visible part of this
# notebook - it must come from a cell that is not shown here.
titanic_df['Age_cat'] = titanic_df['Age'].apply(get_category)

In [51]:
# Separate the label from the features, then run the preprocessing pipeline
# on the feature frame only.
y_titanic_df = titanic_df['Survived']
X_titanic_df = transform_features(titanic_df.drop(columns='Survived'))

In [52]:
# Plain (non-stratified) 75/25 split of the preprocessed Titanic features
# and labels.
X_train, X_test, y_train, y_test = train_test_split(
    X_titanic_df,
    y_titanic_df,
    test_size=0.25,
    random_state=2021,
)

In [53]:
# Re-attach the label column to each split, side by side (aligned on the
# row index).
titanic_train = pd.concat((X_train, y_train), axis='columns')
titanic_test = pd.concat((X_test, y_test), axis='columns')

In [58]:
# NOTE: DataFrame.fillna returns a new frame and neither result is assigned,
# so titanic_train / titanic_test themselves keep their NaNs (the later
# display of titanic_test still shows empty Survived values). Only the second
# call's result is rendered as the cell output.
titanic_train.fillna(0)
titanic_test.fillna(0)

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_cat,Survived
1245,3,0,0.170000,1,2,20.5750,7,3,Baby,0.0
745,1,1,70.000000,1,1,71.0000,1,3,Elderly,0.0
544,1,1,50.000000,1,0,106.4250,2,0,Adult,0.0
1269,1,1,55.000000,0,0,50.0000,2,3,Adult,0.0
1040,2,1,30.000000,1,1,26.0000,7,3,Young Adult,0.0
...,...,...,...,...,...,...,...,...,...,...
697,3,0,29.881138,0,0,7.7333,7,2,Young Adult,1.0
448,3,0,5.000000,2,1,19.2583,7,0,Baby,1.0
351,1,1,29.881138,0,0,35.0000,2,3,Young Adult,0.0
476,2,1,34.000000,1,0,21.0000,7,3,Young Adult,0.0


In [60]:
titanic_test

Unnamed: 0,Pclass,Sex,Age,SibSp,Parch,Fare,Cabin,Embarked,Age_cat,Survived
1245,3,0,0.170000,1,2,20.5750,7,3,Baby,
745,1,1,70.000000,1,1,71.0000,1,3,Elderly,0.0
544,1,1,50.000000,1,0,106.4250,2,0,Adult,0.0
1269,1,1,55.000000,0,0,50.0000,2,3,Adult,
1040,2,1,30.000000,1,1,26.0000,7,3,Young Adult,
...,...,...,...,...,...,...,...,...,...,...
697,3,0,29.881138,0,0,7.7333,7,2,Young Adult,1.0
448,3,0,5.000000,2,1,19.2583,7,0,Baby,1.0
351,1,1,29.881138,0,0,35.0000,2,3,Young Adult,0.0
476,2,1,34.000000,1,0,21.0000,7,3,Young Adult,0.0


In [59]:
# Export the Titanic splits without the pandas index.
# NOTE(review): X_train / X_test were split from the frame that had
# 'Survived' dropped, so these files contain no label column, whereas
# titanic_train / titanic_test were built with the label attached - confirm
# which pair was meant to be written.
X_train.to_csv('static/data/titanic_train.csv', index=False)
X_test.to_csv('static/data/titanic_test.csv', index=False)