In [32]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import LabelBinarizer, MultiLabelBinarizer
from sklearn.feature_extraction import DictVectorizer

In [11]:
feature = np.array([['Texas'],
                   ['California'],
                   ['Texas'],
                   ['Pipli'],
                   ['Texas'],
                   ['Maysor']])
feature

array([['Texas'],
       ['California'],
       ['Texas'],
       ['Pipli'],
       ['Texas'],
       ['Maysor']],
      dtype='<U10')

In [12]:
one_hot = LabelBinarizer()
one_hot.fit_transform(feature)

array([[0, 0, 0, 1],
       [1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0],
       [0, 0, 0, 1],
       [0, 1, 0, 0]])

In [6]:
feature = np.array([['Texas'],
                   ['California'],
                   ['Texas'],
                   ['Pipli'],
                   ['Texas']])
feature

array([['Texas'],
       ['California'],
       ['Texas'],
       ['Pipli'],
       ['Texas']],
      dtype='<U10')

In [7]:
one_hot = LabelBinarizer()
one_hot.fit_transform(feature)

array([[0, 0, 1],
       [1, 0, 0],
       [0, 0, 1],
       [0, 1, 0],
       [0, 0, 1]])

In [13]:
one_hot.classes_

array(['California', 'Maysor', 'Pipli', 'Texas'],
      dtype='<U10')

In [15]:
one_hot.inverse_transform(one_hot.transform(feature))

array(['Texas', 'California', 'Texas', 'Pipli', 'Texas', 'Maysor'],
      dtype='<U10')

In [20]:
pd.get_dummies(feature[:,0])

Unnamed: 0,California,Maysor,Pipli,Texas
0,0,0,0,1
1,1,0,0,0
2,0,0,0,1
3,0,0,1,0
4,0,0,0,1
5,0,1,0,0


In [21]:
multi = [('Texas', 'Florida'),
        ('California', 'Alabama'),
        ('Texas', 'Florida'),
        ('Delware', 'Florida'),
        ('Texas', 'Alabama')]
multi

[('Texas', 'Florida'),
 ('California', 'Alabama'),
 ('Texas', 'Florida'),
 ('Delware', 'Florida'),
 ('Texas', 'Alabama')]

### One_hot encode data

In [22]:
one_hot = MultiLabelBinarizer()
one_hot.fit_transform(multi)

array([[0, 0, 0, 1, 1],
       [1, 1, 0, 0, 0],
       [0, 0, 0, 1, 1],
       [0, 0, 1, 1, 0],
       [1, 0, 0, 0, 1]])

In [24]:
one_hot.classes_

array(['Alabama', 'California', 'Delware', 'Florida', 'Texas'], dtype=object)

In [27]:
df = pd.DataFrame({'Score':['Low','Low','Medium','Medium','High']})
df

Unnamed: 0,Score
0,Low
1,Low
2,Medium
3,Medium
4,High


### Create mapper

In [None]:
scale_mapper = {'Low':1,
                'Medium':2,
                'High':3}
scale_mapper

### Replace feature values with scale

In [28]:
df['Score'].replace(scale_mapper)

0    1
1    1
2    2
3    2
4    3
Name: Score, dtype: int64

In [29]:
df = pd.DataFrame({'Score':['Low',
                            'Low',
                            'Medium',
                            'Medium',
                            'High',
                            'Barely more than Medium']})
df

Unnamed: 0,Score
0,Low
1,Low
2,Medium
3,Medium
4,High
5,Barely more than Medium


In [30]:
scale_mapper = {'Low':1,
                'Medium':2,
                'Barely more than Medium':2.1,
                'High':3}
scale_mapper

{'Barely more than Medium': 2.1, 'High': 3, 'Low': 1, 'Medium': 2}

In [31]:
df['Score'].replace(scale_mapper)

0    1.0
1    1.0
2    2.0
3    2.0
4    3.0
5    2.1
Name: Score, dtype: float64

In [33]:
data_dict = [{'Red':2, 'Blue': 4},
            {'Red':4, 'Blue': 3},
            {'Red':1, 'Yellow': 2},
            {'Red':2, 'Yellow': 2}]
data_dict

[{'Blue': 4, 'Red': 2},
 {'Blue': 3, 'Red': 4},
 {'Red': 1, 'Yellow': 2},
 {'Red': 2, 'Yellow': 2}]

### Create dictionary vectorize

In [37]:
dict_vectorizer = DictVectorizer(sparse=False)
features = dict_vectorizer.fit_transform(data_dict)
features

array([[ 4.,  2.,  0.],
       [ 3.,  4.,  0.],
       [ 0.,  1.,  2.],
       [ 0.,  2.,  2.]])

### Convert dictionary to feature matrix

In [40]:
feature_names = ['Red', 'Blue', 'Yellow']

### Create new dataframe and give column name to it

In [41]:
pd.DataFrame(features, columns=feature_names)

Unnamed: 0,Red,Blue,Yellow
0,4.0,2.0,0.0
1,3.0,4.0,0.0
2,0.0,1.0,2.0
3,0.0,2.0,2.0


### Create word counts dictionaries for four documents

In [45]:
doc_1_word_count = {'Red':2, 'Blue':4}
doc_2_word_count = {'Red':4, 'Blue':3}
doc_3_word_count = {'Red':1, 'Yellow':2}
doc_4_word_count = {'Red':2, 'Yellow':2}

### Create list

In [47]:
doc_word_count = [doc_1_word_count, doc_2_word_count, doc_3_word_count, doc_4_word_count]
doc_word_count

[{'Blue': 4, 'Red': 2},
 {'Blue': 3, 'Red': 4},
 {'Red': 1, 'Yellow': 2},
 {'Red': 2, 'Yellow': 2}]

In [48]:
dict_vectorizer.fit_transform(doc_word_count)

array([[ 4.,  2.,  0.],
       [ 3.,  4.,  0.],
       [ 0.,  1.,  2.],
       [ 0.,  2.,  2.]])