# one hot encoding & label encoding範例

In [55]:
# import 所需的package
import pandas as pd
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

In [56]:
# 建立dataframe
data = pd.DataFrame([['小明','男','電機',18],['阿花','女','企管', 22],['小智','男','資管', 23]],columns=['姓名','性別','科系','年齡'])
df = data.copy()
df

Unnamed: 0,姓名,性別,科系,年齡
0,小明,男,電機,18
1,阿花,女,企管,22
2,小智,男,資管,23


## LabelEncoding

In [15]:
# 初始化需要的函數
ohe = OneHotEncoder()
le = LabelEncoder()

In [16]:
for col in df[['姓名','性別','科系']]:
    df[col] = le.fit_transform(df[col])
df

Unnamed: 0,姓名,性別,科系,年齡
0,0,1,2,18
1,2,0,0,22
2,1,1,1,23


## OneHotEncoding

In [None]:
# 方法一
df_ohe = ohe.fit_transform(df).toarray()
pd.DataFrame(df_ohe)

In [6]:
# 重製dataframe
df = data.copy()
df

Unnamed: 0,姓名,性別,科系,年齡
0,小明,男,電機,18
1,阿花,女,企管,22
2,小智,男,資管,23


In [None]:
# 方法二
# 使用get_dummies
df_dum = pd.get_dummies(df)
df_dum

In [None]:
# 方法三
# 使用get_dummies但輸入列的名稱
df_dum = pd.get_dummies(df[['姓名','科系']])
df_dum

In [None]:
df_new = pd.concat([df_dum, df['年齡']],axis=1)
df_new

## 測試

In [35]:
# import 所需的package
import pandas as pd
import nltk
#nltk.download()
import numpy as np
import sys

np.set_printoptions(threshold = sys.maxsize)# 為了列出整個串列

In [36]:
# 讀取資料
dataset = pd.read_csv('Restaurant_Reviews.tsv', delimiter='\t', quoting=3)
corpus = dataset['Review'].values

In [37]:
# 建立一個空list 後續會將文字append近來
whole_words = []
for sentence in corpus:
    tokenized_sentence = nltk.word_tokenize(sentence) ## 使用 tokenize 進行斷詞
    for word in tokenized_sentence:
        whole_words.append(word)

In [38]:
whole_words = set(whole_words) #去除重複的單詞
print('共有{}個單字'.format(len(whole_words)))

共有2356個單字


In [40]:
#建立一個字典將單詞放進去
word_index = {}
index_word = {}
n = 0
for word in whole_words:
    word_index[word] = n 
    index_word[n] = word
    n+=1
    

## 轉換句子為詞袋模型(Bag-of-words)的形式

In [41]:
def get_bag_of_words_vector(sent, word_index_dic, whole_words):
    sentence = sent
    vector = np.zeros(len(whole_words)) #初始化 0 向量
    for word in nltk.word_tokenize(sentence):
        if word in whole_words:
            vector[word_index[word]]+=1
    return vector

In [43]:
ans=get_bag_of_words_vector('I love eating apples.', word_index, whole_words)

In [50]:
result = np.where(ans == 1)
result

(array([ 130,  465,  918, 1690], dtype=int64),)