## Scikit-Learn - a one stop solution 

### Built-in Datasets

In [1]:
import sklearn.datasets as datasets
iris = datasets.load_iris()
print(iris.data.shape)
print(iris.feature_names)
print(iris.target_names)

(150, 4)
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
['setosa' 'versicolor' 'virginica']


In [2]:
import sklearn.datasets as datasets
house_dtls = datasets.load_boston()
print(house_dtls.data.shape)
print(house_dtls.feature_names)

(506, 13)
['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


### Real World Datasets

In [3]:
from sklearn.datasets import fetch_20newsgroups
newsgroups_train = fetch_20newsgroups(subset='train')
print(newsgroups_train.filenames.shape)
newsgroups_train.data[:1]

(11314,)


["From: lerxst@wam.umd.edu (where's my thing)\nSubject: WHAT car is this!?\nNntp-Posting-Host: rac3.wam.umd.edu\nOrganization: University of Maryland, College Park\nLines: 15\n\n I was wondering if anyone out there could enlighten me on this car I saw\nthe other day. It was a 2-door sports car, looked to be from the late 60s/\nearly 70s. It was called a Bricklin. The doors were really small. In addition,\nthe front bumper was separate from the rest of the body. This is \nall I know. If anyone can tellme a model name, engine specs, years\nof production, where this car is made, history, or whatever info you\nhave on this funky looking car, please e-mail.\n\nThanks,\n- IL\n   ---- brought to you by your neighborhood Lerxst ----\n\n\n\n\n"]

### Generate Datasets

In [4]:
X,y = datasets.make_classification(n_features=20,
                                   n_samples=100,
                                   n_redundant=0, 
                                   n_informative=5,
                                   n_clusters_per_class=1
                                  )
print("The data X shape is {}".format(X.shape))
print("The data y shape is {}".format(y.shape))

The data X shape is (100, 20)
The data y shape is (100,)


In [5]:
y

array([0, 0, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 1, 1, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1,
       1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0])

In [6]:
A, b = datasets.make_multilabel_classification(n_classes=3, 
                                               allow_unlabeled=True,
                                               random_state=1)
print("The data X shape is {}".format(A.shape))
print("The data y shape is {}".format(b.shape))

The data X shape is (100, 20)
The data y shape is (100, 3)


In [7]:
b[:5]

array([[0, 1, 0],
       [0, 0, 0],
       [1, 1, 0],
       [0, 0, 0],
       [1, 1, 0]])

In [8]:
X, y = datasets.make_regression(n_features=1, n_informative=1)
print("The regression data X shape is {}".format(X.shape))
print("The regression data y shape is {}".format(y.shape))

The regression data X shape is (100, 1)
The regression data y shape is (100,)


### Data Preprocessing

### Numerical Features

#### MinMaxScaler

In [9]:
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 72], [-0.5, -6], [90, 10], [10, 188]]
scaler = MinMaxScaler() 
print("Fit: ", scaler.fit(data))
print("Max: ", scaler.data_max_)
print("Transform: " , scaler.transform(data))

Fit:  MinMaxScaler()
Max:  [ 90. 188.]
Transform:  [[0.         0.40206186]
 [0.00549451 0.        ]
 [1.         0.08247423]
 [0.12087912 1.        ]]


#### StandardScaler

In [10]:
from sklearn.preprocessing import StandardScaler
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
scaler = StandardScaler()
print("Fit: ", scaler.fit(data))
print("Mean: ", scaler.mean_)
print("Transform: " , scaler.transform(data))

Fit:  StandardScaler()
Mean:  [0.5 0.5]
Transform:  [[-1. -1.]
 [-1. -1.]
 [ 1.  1.]
 [ 1.  1.]]


#### Normalizer

In [11]:
from sklearn.preprocessing import Normalizer
X = [[4, 1, 2, 2],
      [1, 3, 9, 3],
      [5, 7, 5, 1]]
transformer = Normalizer().fit(X)  
transformer.transform(X)

array([[0.8, 0.2, 0.4, 0.4],
       [0.1, 0.3, 0.9, 0.3],
       [0.5, 0.7, 0.5, 0.1]])

### Categorical Features

#### OrdinalEncoder

In [12]:
import sklearn.preprocessing as preprocessing
enc = preprocessing.OrdinalEncoder()
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
print("The original data")
print(X)
print("The transform data using OrdinalEncoder")
print(enc.transform([['female', 'from US', 'uses Safari']]))


The original data
[['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
The transform data using OrdinalEncoder
[[0. 1. 1.]]


#### OneHotEncoder

In [13]:
import sklearn.preprocessing as preprocessing
enc = preprocessing.OneHotEncoder()
X = [['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
enc.fit(X)
print("The original data")
print(X)
print("The transform data using OneHotEncoder")
print(enc.transform([['female', 'from US', 'uses Safari'],
                ['male', 'from Europe', 'uses Safari']]).toarray())


The original data
[['male', 'from US', 'uses Safari'], ['female', 'from Europe', 'uses Firefox']]
The transform data using OneHotEncoder
[[1. 0. 0. 1. 0. 1.]
 [0. 1. 1. 0. 0. 1.]]


#### LabelEncoder

In [14]:
import sklearn.preprocessing as preprocessing
import numpy as np

targets = np.array(["Sun", "Sun", "Moon", "Earth", "Monn", "Venus"])
labelenc = preprocessing.LabelEncoder()
labelenc.fit(targets)
targets_trans = labelenc.transform(targets)
print("The original data")
print(targets)
print("The transform data using LabelEncoder")
print(targets_trans)

The original data
['Sun' 'Sun' 'Moon' 'Earth' 'Monn' 'Venus']
The transform data using LabelEncoder
[3 3 2 0 1 4]


### Feature Selection

#### SelectKBest

In [15]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2

X, y = load_digits(return_X_y=True)
print("Orginal Features Count: ", X.shape[1])

X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
print("Features count after using SelectKBest: ", X_new.shape[1])

Orginal Features Count:  64
Features count after using SelectKBest:  20


#### VarianceThreshold

In [16]:
import sklearn.feature_selection as fs
import numpy as np 

X = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 1, 1], [0, 1, 0], [0, 1,
                                                                      1]])
var = fs.VarianceThreshold(threshold=0.2)
var.fit(X)
X_trans = var.transform(X)
print("The original data")
print(X)
print("The processed data by variance threshold")
print(X_trans)

The original data
[[0 0 1]
 [0 1 0]
 [1 0 0]
 [0 1 1]
 [0 1 0]
 [0 1 1]]
The processed data by variance threshold
[[0 1]
 [1 0]
 [0 0]
 [1 1]
 [1 0]
 [1 1]]


### Feature Extraction

#### Text Analysis

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

counterVec = CountVectorizer()
# corpus is a list of string in this example, such as:
corpus = [
    "I have an apple.",
    "The apple is red",
    "I like the apple",
    "Apple is nutritous"
    ]
counterVec.fit(corpus)
# corpus_data is a matrix with 0/1.
corpus_data = counterVec.transform(corpus)
print("Get all the feature names of this corpus")
print(counterVec.get_feature_names())
print("The number of feature is {}".format(len(counterVec.get_feature_names())))
corpus_data = counterVec.transform(corpus)
print("The transform data's shape is {}".format(corpus_data.toarray().shape))
print(corpus_data.toarray())

Get all the feature names of this corpus
['an', 'apple', 'have', 'is', 'like', 'nutritous', 'red', 'the']
The number of feature is 8
The transform data's shape is (4, 8)
[[1 1 1 0 0 0 0 0]
 [0 1 0 1 0 0 1 1]
 [0 1 0 0 1 0 0 1]
 [0 1 0 1 0 1 0 0]]


#### Image Analysis

In [18]:
import numpy as np
from sklearn.feature_extraction import image

one_image = np.arange(4 * 4 * 3).reshape((4, 4, 3))
one_image[:, :, 0]  # R channel of a fake RGB picture
patches = image.extract_patches_2d(one_image, (2, 2), max_patches=2,random_state=0)
print(patches.shape)
print(patches[:, :, :, 0])
patches = image.extract_patches_2d(one_image, (2, 2))
print(patches.shape)
print(patches[4, :, :, 0])
reconstructed = image.reconstruct_from_patches_2d(patches, (4, 4, 3))
np.testing.assert_array_equal(one_image, reconstructed)


(2, 2, 2, 3)
[[[ 0  3]
  [12 15]]

 [[15 18]
  [27 30]]]
(9, 2, 2, 3)
[[15 18]
 [27 30]]
