# Pandas 데이터 컨트롤 베스트 프랙티스



In [2]:
import pandas as pd

# Load the mall-customers dataset from the local CSV file into a DataFrame.
data = pd.read_csv('./extrafiles/Mall_Customers.csv')

In [9]:
# Dimensions of the DataFrame as a (rows, columns) tuple.
data.shape

(200, 6)

In [4]:
# Preview the data: show the first rows of the DataFrame.
data.head()

Unnamed: 0,ID,Gender,Age,Income,Spend
0,1,Male,19,15,39
1,2,Male,21,15,81
2,3,Female,20,16,6
3,4,Female,23,16,77
4,5,Female,31,17,40


In [7]:
# Data transformation: add a numeric encoding of the Gender column
# (Male -> 1, Female -> 2) as a new column named 'Gender_value'.
gender_codes = {'Male': 1, 'Female': 2}
data['Gender_value'] = data['Gender'].map(gender_codes)
data.head()

Unnamed: 0,ID,Gender,Age,Income,Spend,Gender_value
0,1,Male,19,15,39,1
1,2,Male,21,15,81,1
2,3,Female,20,16,6,2
3,4,Female,23,16,77,2
4,5,Female,31,17,40,2


In [16]:
# Slice a sample of rows by position: iloc[50:80] takes positions 50 through 79
# (the end of an iloc slice is exclusive) and keeps every column.
# NOTE(review): the original note said "rows with ID 50 to 80", but ID starts at 1
# while positions start at 0, so this selects IDs 51-80 -- confirm which was intended.
data_sample = data.iloc[50:80, :]

In [20]:
# Round-trip: convert the DataFrame slice to a NumPy array, then rebuild a
# DataFrame from that array while reusing the original index and column labels.
# DataFrame.to_numpy() is used instead of np.array(...) because no
# `import numpy as np` is visible in this notebook, so `np` would be undefined.
npArr = data_sample.to_numpy()
pd.DataFrame(npArr, index=data_sample.index, columns=data_sample.columns).head()

Unnamed: 0,ID,Gender,Age,Income,Spend,Gender_value
50,51,Female,49,42,52,2
51,52,Male,33,42,60,1
52,53,Female,31,43,54,2
53,54,Male,59,43,60,1
54,55,Female,50,43,45,2


In [26]:
# Group the rows by gender and take the mean of every remaining column.
gb_data = data.groupby('Gender').mean()
print(gb_data)

# Note: the index of the grouped result is the gender label itself (Female / Male)!
gb_data.index

                ID        Age     Income      Spend  Gender_value
Gender                                                           
Female   97.562500  38.098214  59.250000  51.526786           2.0
Male    104.238636  39.806818  62.227273  48.511364           1.0


Index(['Female', 'Male'], dtype='object', name='Gender')

In [63]:
# Sorting: order the Age column from largest to smallest and keep the top ten.
# ascending=False requests descending (reverse) order.
ages_descending = data['Age'].sort_values(ascending=False)
sort_data = ages_descending.head(10)
print(sort_data)
# Selecting a single column and sorting it yields a Series, not a DataFrame.
print(type(sort_data))

70     70
60     70
57     69
90     68
67     68
108    68
82     67
10     67
102    67
62     67
Name: Age, dtype: int64
<class 'pandas.core.series.Series'>


In [41]:
# Mean of the Income column.
# Select the column first instead of calling data.mean() on the whole frame:
# the frame contains the string column 'Gender', so a frame-wide mean warns on
# older pandas and raises TypeError on pandas >= 2.0. The extra .mean() on the
# resulting scalar was redundant.
data['Income'].mean()

  """Entry point for launching an IPython kernel.


60.56

In [62]:
# drop() returns a new DataFrame; `data` itself is left unchanged because the
# result is assigned to a separate name and inplace is not requested.
return_data = data.drop(columns=['Gender_value'])
print(return_data)
# Show the type of the result to demonstrate that it is a DataFrame.
print(type(return_data))

      ID  Gender  Age  Income  Spend
0      1    Male   19      15     39
1      2    Male   21      15     81
2      3  Female   20      16      6
3      4  Female   23      16     77
4      5  Female   31      17     40
..   ...     ...  ...     ...    ...
195  196  Female   35     120     79
196  197  Female   45     126     28
197  198    Male   32     126     74
198  199    Male   32     137     18
199  200    Male   30     137     83

[200 rows x 5 columns]
<class 'pandas.core.frame.DataFrame'>


In [53]:
# Separate the continuous (numeric) columns from the categorical ones.
X_num = data[['Age', 'Income', 'Spend']]
X_cat = data[['ID', 'Gender']]

# One-hot encode the categorical columns: one indicator column per category.
X_dummies = pd.get_dummies(X_cat)
print(X_dummies)

# drop_first=True: the first category gets no indicator column of its own, so a
# row whose indicators are all 0 belongs to that first category.
# Original author's open question: logistic regression seemed to train better
# with drop_first -- possibly a multicollinearity issue? (unverified)
# Stored under its own name so it no longer gets overwritten before being used.
X_dummies_dropped = pd.get_dummies(X_cat, drop_first=True)
print(X_dummies_dropped)

# Open question: where should a unique identifier such as ID be placed?
# When it is grouped with the categorical columns, one-hot encoding still
# leaves the ID column as it is.

      ID  Gender_Female  Gender_Male
0      1              0            1
1      2              0            1
2      3              1            0
3      4              1            0
4      5              1            0
..   ...            ...          ...
195  196              1            0
196  197              1            0
197  198              0            1
198  199              0            1
199  200              0            1

[200 rows x 3 columns]
      ID  Gender_Male
0      1            1
1      2            1
2      3            0
3      4            0
4      5            0
..   ...          ...
195  196            0
196  197            0
197  198            1
198  199            1
199  200            1

[200 rows x 2 columns]


In [51]:
# Scaling: min-max scale the numeric feature columns.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# fit_transform learns the per-column minimum/maximum and applies the rescaling
# in one call; the fitted scaler stays available under `scaler`.
X_scaled = scaler.fit_transform(X_num)
X_scaled

array([[0.01923077, 0.        , 0.3877551 ],
       [0.05769231, 0.        , 0.81632653],
       [0.03846154, 0.00819672, 0.05102041],
       [0.09615385, 0.00819672, 0.7755102 ],
       [0.25      , 0.01639344, 0.39795918],
       [0.07692308, 0.01639344, 0.76530612],
       [0.32692308, 0.02459016, 0.05102041],
       [0.09615385, 0.02459016, 0.94897959],
       [0.88461538, 0.03278689, 0.02040816],
       [0.23076923, 0.03278689, 0.7244898 ],
       [0.94230769, 0.03278689, 0.13265306],
       [0.32692308, 0.03278689, 1.        ],
       [0.76923077, 0.04098361, 0.14285714],
       [0.11538462, 0.04098361, 0.7755102 ],
       [0.36538462, 0.04098361, 0.12244898],
       [0.07692308, 0.04098361, 0.79591837],
       [0.32692308, 0.04918033, 0.34693878],
       [0.03846154, 0.04918033, 0.66326531],
       [0.65384615, 0.06557377, 0.28571429],
       [0.32692308, 0.06557377, 0.98979592],
       [0.32692308, 0.07377049, 0.34693878],
       [0.13461538, 0.07377049, 0.73469388],
       [0.

In [55]:
# Missing-value handling: load the example dataset that contains missing entries
# and display it.
data_missing = pd.read_csv('./extrafiles/EX_missing.csv')
data_missing

Unnamed: 0,salary,sales,roe,industry
0,1095.0,27595.0,14.1,1
1,,9958.0,10.9,1
2,,6125.899902,23.5,1
3,578.0,16246.0,5.9,1
4,1368.0,,13.8,1
5,1145.0,,20.0,2
6,1078.0,2266.699951,16.4,2
7,1094.0,2966.800049,16.299999,2
8,1237.0,4570.200195,10.5,2
9,833.0,2830.0,,2


In [61]:
# Summary statistics of the 'sales' column -- candidate values for filling in
# its missing entries.
sales = data_missing['sales']

print(sales.median())  # median
print(sales.mean())    # mean
print(sales.min())     # minimum
print(sales.max())     # maximum

5348.0500485
9069.825012125
2266.699951
27595.0


In [None]:
# % 로 구간 분리하기
