### Loading the Data

In [1]:
# Loading features from dicts (DictVectorizer)
from sklearn.feature_extraction import DictVectorizer

# Each sample is a dict with one categorical field ('city') and one numeric
# field ('temperature').
# NOTE(review): 'Londo' looks like a typo for 'London'; it is kept as-is because
# the recorded feature names further down show 'city=Londo'.
cities = ['Dubai', 'Londo', 'San Francisco']
temperatures = [33., 12., 18.]
measurements = [
    {'city': city, 'temperature': temperature}
    for city, temperature in zip(cities, temperatures)
]

# Fit the vectorizer on the samples and encode them in one call. The recorded
# output has one indicator column per city value plus the temperature column.
vec = DictVectorizer()
encoded = vec.fit_transform(measurements)

# Convert to a dense array so the full matrix is displayed.
encoded.toarray()

array([[ 1.,  0.,  0., 33.],
       [ 0.,  1.,  0., 12.],
       [ 0.,  0.,  1., 18.]])

In [2]:
# Display the names of the columns produced by the fitted DictVectorizer.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` is its replacement. It returns a NumPy array, so
# `.tolist()` keeps the displayed value a plain list of strings.
vec.get_feature_names_out().tolist()

['city=Dubai', 'city=Londo', 'city=San Francisco', 'temperature']

### Feature hashing (FeatureHasher)

In [3]:
from sklearn.feature_extraction import FeatureHasher

# Two samples, each a {feature name: value} dict
first_sample = {'dog': -1, 'cat': 2, 'elephant': 4}
second_sample = {'dog': 2, 'run': 5, 'cat': -7}
data = [first_sample, second_sample]

# Hash the feature names into 4 output columns. transform() is called directly,
# without a prior fit().
h = FeatureHasher(n_features=4)
hashed = h.transform(data)

# Convert to a dense array so the full matrix is displayed.
hashed.toarray()

array([[ 0.,  1., -4.,  2.],
       [-5., -2.,  0., -7.]])

### Text feature extraction

Scikit-learn's text utilities cover the three most common steps for extracting numerical features from text:

<ul>
    <li><b>tokenizing</b> strings and giving an integer id to each possible token.</li>
    <li><b>counting</b> the occurrences of tokens in each document.</li>
    <li><b>normalizing</b> and weighting the counts, so that tokens occurring in the majority of samples / documents carry diminishing importance.</li>
</ul>

In [4]:
from sklearn.feature_extraction.text import CountVectorizer

# Three short documents that share most of their words.
# NOTE(review): `data` and `vec` rebind names used by earlier cells (the
# FeatureHasher samples and the DictVectorizer); later cells rely on these
# new bindings.
data = [
    'Test sentence one of three.',
    'Second test sentence of three.',
    'Last sentence of three.',
]

# Learn the vocabulary from the documents and count token occurrences.
vec = CountVectorizer()
token_counts = vec.fit_transform(data)

# One row per document, one column per vocabulary token.
token_counts.toarray()

array([[0, 1, 1, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 1, 1],
       [1, 1, 0, 0, 1, 0, 1]], dtype=int64)

In [5]:
# Encode an unseen document with the already-fitted CountVectorizer. In the
# recorded output only one column is non-zero: 'sentence' is in the fitted
# vocabulary, while 'New' is not and contributes nothing.
new_documents = ['New sentence']
vec.transform(new_documents).toarray()

array([[0, 0, 0, 0, 1, 0, 0]])

In [6]:
from sklearn.feature_extraction.text import TfidfTransformer

# Re-encode the three documents with the fitted CountVectorizer and keep the
# counts as a dense array (`data_vec`) for use in the next cell.
data_counts = vec.transform(data)
data_vec = data_counts.toarray()

# Display the raw count matrix.
data_vec

array([[0, 1, 1, 0, 1, 1, 1],
       [0, 1, 0, 1, 1, 1, 1],
       [1, 1, 0, 0, 1, 0, 1]])

In [7]:
# Re-weight the raw token counts with TF-IDF. In the recorded output, tokens
# present in every document get a smaller weight than tokens that appear in
# only one document.
tfidf = TfidfTransformer()
data_vec_weighted = tfidf.fit_transform(data_vec)

# Convert to a dense array so the weighted matrix is displayed.
weighted_dense = data_vec_weighted.toarray()
weighted_dense

array([[0.        , 0.3645444 , 0.61722732, 0.        , 0.3645444 ,
        0.46941728, 0.3645444 ],
       [0.        , 0.3645444 , 0.        , 0.61722732, 0.3645444 ,
        0.46941728, 0.3645444 ],
       [0.69903033, 0.41285857, 0.        , 0.        , 0.41285857,
        0.        , 0.41285857]])

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TfidfVectorizer goes from raw text to TF-IDF weights in a single step. The
# recorded output is identical to the CountVectorizer + TfidfTransformer
# result shown for the previous cell.
tfidf_vec = TfidfVectorizer()
tfidf_matrix = tfidf_vec.fit_transform(data)

# Convert to a dense array so the weighted matrix is displayed.
tfidf_matrix.toarray()

array([[0.        , 0.3645444 , 0.61722732, 0.        , 0.3645444 ,
        0.46941728, 0.3645444 ],
       [0.        , 0.3645444 , 0.        , 0.61722732, 0.3645444 ,
        0.46941728, 0.3645444 ],
       [0.69903033, 0.41285857, 0.        , 0.        , 0.41285857,
        0.        , 0.41285857]])