# Linear Regression

In [1]:
import numpy as np
import pandas as pd

## DATA

데이터 확인하기

- name : 이름
- height : 키
- gender : 성별

In [14]:
# Toy sample, one (name, height, gender) record per person.
# The last record's gender is None, i.e. a missing value.
records = [
    ('kevin', 178.2, 'male'),
    ('sally', 162.9, 'female'),
    ('hoyeon', 160.6, 'female'),
    ('lux', 156.2, None),
]

# Transpose the records into per-column lists; key order fixes the column order.
names, heights, genders = (list(column) for column in zip(*records))
raw_data = {'name': names, 'height': heights, 'gender': genders}

pd_data = pd.DataFrame(raw_data)
pd_data.head()

Unnamed: 0,name,height,gender
0,kevin,178.2,male
1,sally,162.9,female
2,hoyeon,160.6,female
3,lux,156.2,


## None 결측치 제거

In [15]:
# Keep only the rows with no missing value in any column
# (these are the dropna defaults, written out explicitly).
filtered_data = pd_data.dropna(axis='index', how='any')
filtered_data.head()

Unnamed: 0,name,height,gender
0,kevin,178.2,male
1,sally,162.9,female
2,hoyeon,160.6,female


이름은 feature로서의 가치가 없다고 판단하여 제거

In [16]:
# Remove the 'name' column; only height and gender are kept from here on.
filtered_data = filtered_data.drop('name', axis='columns')
filtered_data.head()

Unnamed: 0,height,gender
0,178.2,male
1,162.9,female
2,160.6,female


## 주어진 3개의 데이터 instance를 기반으로 데이터를 무작위로 생성

In [25]:
# Mean height of the rows labelled 'female'.
is_female = filtered_data['gender'] == 'female'
female_mean = np.mean(filtered_data.loc[is_female, 'height'].values)
print('female_mean :', female_mean)

# Mean height of the rows labelled 'male'.
is_male = filtered_data['gender'] == 'male'
male_mean = np.mean(filtered_data.loc[is_male, 'height'].values)
print('male_mean :', male_mean)


female_mean : 161.75
male_mean : 178.2


# 주어진 3개의 데이터 instance를 기반으로 데이터를 무작위로 생성

In [35]:
# Seed the global NumPy generator so the draws below are repeatable.
np.random.seed(0)

# NOTE: despite its name, this value scales a standard-normal draw,
# so it acts as the standard deviation of the generated heights.
variance = 3

# 200 draws per gender, centred on that gender's mean height.
# Female draws are taken first, then male, from the same seeded stream.
female_noise = np.random.standard_normal(size=200)
female_heights = female_mean + variance * female_noise

male_noise = np.random.standard_normal(size=200)
male_heights = male_mean + variance * male_noise

print('female_heights.shape :', female_heights.shape)
print('female_heights[:10] :', female_heights[:10])

female_heights.shape : (200,)
female_heights[:10] : [167.04215704 162.95047163 164.68621395 168.4726796  167.35267397
 158.81816636 164.60026525 161.29592838 161.44034344 162.98179551]


### 데이터 프레임 생성

In [40]:
# Synthetic rows: one gender label per generated height, females first.
# The label counts follow the array lengths instead of a hard-coded 200.
generated_data = {
    'gender': ['female'] * len(female_heights) + ['male'] * len(male_heights),
    'height': list(female_heights) + list(male_heights),
}

# Build the new rows with the same column order as the existing frame so the
# concatenation below never has to re-align or sort columns.
generated_df = pd.DataFrame(generated_data, columns=filtered_data.columns)

# DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
# pd.concat is the supported replacement. ignore_index=True renumbers the rows 0..n-1.
# NOTE: this rebinds filtered_data, so re-running the cell adds the generated
# rows again on top of the previous result.
filtered_data = pd.concat([filtered_data, generated_df], ignore_index=True)
filtered_data.head()

Unnamed: 0,height,gender
0,178.2,male
1,162.9,female
2,160.6,female
3,167.042157,female
4,162.950472,female


### 'gender' data attribute를 숫자 형태로 변환

In [46]:
from sklearn import preprocessing

# Learn the mapping from gender strings to integer codes.
le_gender = preprocessing.LabelEncoder()
le_gender.fit(filtered_data['gender'])

# final_data is a separate frame whose 'gender' column holds the integer codes;
# filtered_data itself is left untouched.
gender_codes = le_gender.transform(filtered_data['gender'])
final_data = filtered_data.assign(gender=gender_codes)
final_data.tail()

Unnamed: 0,height,gender
1598,175.915523,1
1599,180.773772,1
1600,181.623306,1
1601,182.599736,1
1602,180.757656,1


## 모델 적용

### kfold 5 적용

In [73]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression

# Columns used as model input; 'gender' (integer-encoded) is the target.
features = ['height']

# 5-fold split with shuffling. random_state is fixed so the fold assignment,
# and therefore the printed score, is the same every time the cell runs.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold scores. NOTE: for LinearRegression, score() returns the R^2
# coefficient of determination, not a classification accuracy; the variable
# names are kept as they were for compatibility with the rest of the notebook.
accrs = []

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(final_data), start=1):
    print('Fold : {}'.format(fold_idx))
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]

    train_y = train_d['gender']
    train_x = train_d[features]

    test_y = test_d['gender']
    test_x = test_d[features]

    # A fresh model per fold, fitted on the training part only.
    model = LinearRegression()
    model.fit(train_x, train_y)

    mean_accr = model.score(test_x, test_y)
    accrs.append(mean_accr)

# Average of the five per-fold scores.
print(np.average(accrs))


Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
0.8780975884189773


# Logistic Regression

In [77]:
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression

# Columns used as model input; 'gender' (integer-encoded) is the class label.
features = ['height']

# 5-fold split with shuffling. random_state is fixed so the fold assignment,
# and therefore the printed accuracy, is the same every time the cell runs.
kf = KFold(n_splits=5, shuffle=True, random_state=0)

# Per-fold scores; for a classifier, score() is the mean accuracy on the test fold.
accrs = []

for fold_idx, (train_idx, test_idx) in enumerate(kf.split(final_data), start=1):
    print('Fold : {}'.format(fold_idx))
    train_d, test_d = final_data.iloc[train_idx], final_data.iloc[test_idx]

    train_y = train_d['gender']
    train_x = train_d[features]

    test_y = test_d['gender']
    test_x = test_d[features]

    # A fresh model per fold, fitted on the training part only.
    model = LogisticRegression(solver='lbfgs')
    model.fit(train_x, train_y)

    mean_accr = model.score(test_x, test_y)
    accrs.append(mean_accr)

# Average of the five per-fold accuracies.
print(np.average(accrs))


Fold : 1
Fold : 2
Fold : 3
Fold : 4
Fold : 5
1.0
