In [5]:
import pandas as pd
from io import StringIO

# In-memory CSV sample: column C is empty in the second data row and
# column D is empty in the third, so both cells are read as NaN.
csv_data = '''A,B,C,D
                        1.0,2.0,3.0,4.0
                        5.0,6.0,,8.0
                        10.0,11.0,12.0,'''

# Wrap the text in a file-like buffer before handing it to read_csv
with StringIO(csv_data) as csv_buffer:
    df = pd.read_csv(csv_buffer)
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [6]:
# Boolean mask of missing cells, then count the True values per column
missing_mask = df.isnull()
missing_mask.sum()

A    0
B    0
C    1
D    1
dtype: int64

In [8]:
# Tour of the DataFrame.dropna options. Every call returns a new frame and
# leaves `df` untouched; only the last expression is rendered as cell output.

# Drop every row that contains at least one NaN
df.dropna()
# Drop only the rows in which ALL values are NaN.
# `how` accepts 'any' or 'all'; any other string raises ValueError.
df.dropna(how='all')
# Drop every column that contains at least one NaN
df.dropna(axis=1)
# Keep only the rows that have at least 4 non-NaN values
df.dropna(thresh=4)
# Drop the rows whose value in column 'C' is NaN
df.dropna(subset=['C'])

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
2,10.0,11.0,12.0,


In [9]:
from sklearn.impute import SimpleImputer
import numpy as np

# Create an imputer that replaces each NaN with the mean of its column
imr = SimpleImputer(missing_values=np.nan, strategy='mean')

# Learn the column means from the data and fill the gaps in a single step
imputed_data = imr.fit_transform(df.values)
imputed_data

array([[ 1. ,  2. ,  3. ,  4. ],
       [ 5. ,  6. ,  7.5,  8. ],
       [10. , 11. , 12. ,  6. ]])

In [10]:
# Re-display the source frame: its NaN cells are still present, so the
# imputation in the previous cell produced a separate array instead of
# modifying df.
df

Unnamed: 0,A,B,C,D
0,1.0,2.0,3.0,4.0
1,5.0,6.0,,8.0
2,10.0,11.0,12.0,


In [40]:
import pandas as pd

# Toy data set with a nominal feature (color), an ordinal feature (size),
# a numeric feature (price) and a class label.
sample_rows = [
    ['green', 'M', 10.1, 'class2'],
    ['red', 'L', 13.5, 'class1'],
    ['blue', 'XL', 15.3, 'class2'],
]
df = pd.DataFrame(sample_rows,
                  columns=['color', 'size', 'price', 'classlabel'])
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [41]:
# Ordinal encoding of the size labels: a larger size gets a larger integer
size_mapping = dict(XL=3, L=2, M=1)

# Replace the string labels in the 'size' column with their integer codes
encoded_size = df['size'].map(size_mapping)
df['size'] = encoded_size
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [42]:
# Reverse lookup (integer code -> size label) built from size_mapping
inv_size_mapping = dict(zip(size_mapping.values(), size_mapping.keys()))

# Decode the integer column back to labels; df itself is not modified here
size_labels = df['size'].map(inv_size_mapping)
size_labels


0     M
1     L
2    XL
Name: size, dtype: object

In [43]:
import numpy as np
# Assign consecutive integers to the sorted unique class labels
unique_labels = np.unique(df['classlabel'])
class_mapping = dict(zip(unique_labels, range(len(unique_labels))))
class_mapping

{'class1': 0, 'class2': 1}

In [44]:
# Swap each class-label string for its integer code from class_mapping
encoded_labels = df['classlabel'].map(class_mapping)
df['classlabel'] = encoded_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,1
1,red,2,13.5,0
2,blue,3,15.3,1


In [45]:
# Reverse lookup (integer code -> class label) built from class_mapping
inv_class_mapping = dict(
    (code, label) for label, code in class_mapping.items()
)

# Restore the original string labels in the 'classlabel' column
restored_labels = df['classlabel'].map(inv_class_mapping)
df['classlabel'] = restored_labels
df

Unnamed: 0,color,size,price,classlabel
0,green,1,10.1,class2
1,red,2,13.5,class1
2,blue,3,15.3,class2


In [46]:
from sklearn.preprocessing import LabelEncoder

# Let scikit-learn derive the integer codes for the class labels
class_le = LabelEncoder()

label_values = df['classlabel'].values
y = class_le.fit_transform(label_values)
y

array([1, 0, 1])

In [48]:
# Map the integer codes in y back to the original class-label strings
decoded_labels = class_le.inverse_transform(y)
decoded_labels

array(['class2', 'class1', 'class2'], dtype=object)

In [60]:
# Feature matrix as a NumPy array: column 0 = color, 1 = size, 2 = price
feature_columns = ['color', 'size', 'price']
X = df[feature_columns].values

# Overwrite the color strings in column 0 with their integer label codes
color_le = LabelEncoder()
color_codes = color_le.fit_transform(X[:, 0])
X[:, 0] = color_codes
X

array([[1, 1, 10.1],
       [2, 2, 13.5],
       [0, 3, 15.3]], dtype=object)

In [73]:
from sklearn.preprocessing import OneHotEncoder

feature_columns = ['color', 'size', 'price']
X = df[feature_columns].values

# Create the one-hot encoder
color_ohe = OneHotEncoder()

# The encoder expects 2-D input, so select the color column as shape (n, 1),
# encode it, and convert the sparse result to a dense array for display
color_column = X[:, [0]]
color_onehot = color_ohe.fit_transform(color_column)
color_onehot.toarray()

array([[0., 1., 0.],
       [0., 0., 1.],
       [1., 0., 0.]])

In [75]:
from sklearn.compose import ColumnTransformer

feature_columns = ['color', 'size', 'price']
X = df[feature_columns].values

# One-hot encode column 0 (color); pass columns 1 and 2 through unchanged
color_onehot_steps = [
    ('onehot', OneHotEncoder(), [0]),
    ('nothing', 'passthrough', [1, 2]),
]
c_transf = ColumnTransformer(color_onehot_steps)

transformed = c_transf.fit_transform(X)
transformed.astype(float)

array([[ 0. ,  1. ,  0. ,  1. , 10.1],
       [ 0. ,  0. ,  1. ,  2. , 13.5],
       [ 1. ,  0. ,  0. ,  3. , 15.3]])

In [94]:
# Variant: one-hot encode column 1 (size); pass columns 0 and 2 through
size_onehot_steps = [
    ('onehot', OneHotEncoder(), [1]),
    ('nothing', 'passthrough', [0, 2]),
]
c_transf_2 = ColumnTransformer(size_onehot_steps)
c_transf_2.fit_transform(X)

array([[1.0, 0.0, 0.0, 'green', 10.1],
       [0.0, 1.0, 0.0, 'red', 13.5],
       [0.0, 0.0, 1.0, 'blue', 15.3]], dtype=object)

In [97]:
# One-hot encode the string columns of the selected features with pandas,
# keeping one dummy column per category
dummy_source = df[['price', 'color', 'size']]
pd.get_dummies(dummy_source)

Unnamed: 0,price,size,color_blue,color_green,color_red
0,10.1,1,0,1,0
1,13.5,2,0,0,1
2,15.3,3,1,0,0


In [98]:
# Same encoding, but drop the first dummy column of each encoded feature
# to remove the redundant (perfectly collinear) column
dummy_source = df[['price', 'color', 'size']]
pd.get_dummies(dummy_source, drop_first=True)

Unnamed: 0,price,size,color_green,color_red
0,10.1,1,1,0
1,13.5,2,0,1
2,15.3,3,0,0


In [99]:
# Create a one-hot encoder that drops the first category of each feature
color_ohe = OneHotEncoder(categories='auto', drop='first')

# Encode column 0 with it and pass columns 1 and 2 through unchanged
drop_first_steps = [
    ('onehot', color_ohe, [0]),
    ('nothing', 'passthrough', [1, 2]),
]
c_transf = ColumnTransformer(drop_first_steps)

transformed = c_transf.fit_transform(X)
transformed.astype(float)

array([[ 1. ,  0. ,  1. , 10.1],
       [ 0. ,  1. ,  2. , 13.5],
       [ 0. ,  0. ,  3. , 15.3]])

In [100]:
# Rebuild the toy data set from scratch so 'size' holds its string labels again
column_names = ['color', 'size', 'price', 'classlabel']
sample_rows = [
    ['green', 'M', 10.1, 'class2'],
    ['red', 'L', 13.5, 'class1'],
    ['blue', 'XL', 15.3, 'class2'],
]
df = pd.DataFrame(sample_rows, columns=column_names)
df

Unnamed: 0,color,size,price,classlabel
0,green,M,10.1,class2
1,red,L,13.5,class1
2,blue,XL,15.3,class2


In [101]:
# Encode the ordinal 'size' feature as two binary threshold indicators:
#   'x > M' is 1 for L and XL, 'x > L' is 1 for XL only.
larger_than_m = df['size'].isin(['L', 'XL'])
larger_than_l = df['size'] == 'XL'

df['x > M'] = larger_than_m.astype('int64')
df['x > L'] = larger_than_l.astype('int64')

# The original column is redundant once both indicators exist
del df['size']
df

Unnamed: 0,color,price,classlabel,x > M,x > L
0,green,10.1,class2,0,0
1,red,13.5,class1,1,0
2,blue,15.3,class2,1,1
