In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
from numpy import array
from keras.datasets import mnist

<a id="data-science-coding-skills"></a>

<div style="background-color: #e3f2fd; font-size:150%; text-align:left; border: 7px solid #0288d1; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3); font-family: Calibri; border-radius: 20px; padding: 10px; width:95%">
<h1 align="center"><font color=#0277bd><strong>ESSENTIAL CODING SKILLS FOR DATA SCIENCE, MACHINE LEARNING, & KAGGLE</strong></font></h1>
<h2 align="center"><font color=#0277bd><strong>Your Guide to Mastering Key Libraries</strong></font></h2>
<p style="font-family: Calibri; color: #0277bd; text-shadow: 2px 2px 4px rgba(0, 0, 0, 0.3); text-align: left; font-size: 110%">
    Whether you're just starting out in data science or have some experience, this notebook is designed to help you. For beginners, it serves as an excellent reference, introducing the <strong>key libraries</strong> needed for data analysis, machine learning, and Kaggle competitions. Even if you're not a beginner, the comprehensive overview of libraries such as:
    <br><br>
    1. <strong>NumPy</strong> and <strong>Pandas</strong>: For numerical computations and data manipulation.
    <br>
    2. <strong>Seaborn (sns)</strong>: For data visualization to enhance analysis.
    <br>
    3. <strong>Scikit-learn (sklearn)</strong>: A comprehensive machine learning toolkit covering preprocessing, feature extraction, selection, model training (e.g., SVM, tree-based models), clustering, and model evaluation (metrics).
    <br>
    4. <strong>SciPy</strong>: For advanced statistical analysis and sparse matrices, crucial in data analysis workflows.
    <br>
    5. <strong>Imbalanced-learn</strong>: For handling imbalanced datasets through under-sampling and over-sampling techniques.
    <br><br>
    makes this notebook a useful resource for more experienced practitioners as well. Together, these tools form the foundation for effective data processing, visualization, model building, and evaluation.
    <br><br>
    If you found this notebook helpful and appreciate the effort put into it, please consider <strong>upvoting</strong>! Your support motivates me to create more content.
</p>
</div>


## numpy

### [array](https://numpy.org/doc/1.23/reference/generated/numpy.array.html)

*Create an array.*

#### Examples

In [None]:
np.array([1, 2, 3])

Upcasting

In [None]:
np.array([1, 2, 3.0])

More than one dimension:

In [None]:
np.array([[1, 2], [3, 4]])

Minimum dimensions 2:

In [None]:
np.array([1, 2, 3], ndmin=2)

Type provided:

In [None]:
np.array([1, 2, 3], dtype='float64')

#### Methods

##### [shape](https://numpy.org/doc/1.23/reference/generated/numpy.ndarray.shape.html)

*Tuple of array dimensions.*

In [None]:
x = np.array([1, 2, 3, 4])
x.shape

In [None]:
y = np.zeros((2, 3, 4))
y.shape

In [None]:
# Assigning to .shape changes the array's shape in place; the NumPy docs
# discourage this and recommend ndarray.reshape instead.
y.shape = (3, 8) # similar to reshape
y

In [None]:
try:
  y.shape = (3, 6) # cannot reshape since there're 24 elements.
except ValueError:
  print("Exception!")

##### [reshape()](https://numpy.org/doc/1.23/reference/generated/numpy.ndarray.reshape.html)

*Returns an array containing the same data with a new shape.*


In [None]:
y = np.zeros((2, 3, 4))
y.reshape(3, 8)

In [None]:
y.reshape((3, 8)) #also works

##### [transpose()](https://numpy.org/doc/1.23/reference/generated/numpy.ndarray.transpose.html)

*Returns a view of the array with axes transposed*

In [None]:
a = np.array([[1, 2], [3, 4]])
a

In [None]:
a.transpose()

In [None]:
a.transpose((1, 0)) # interchanges axes 1 and 0.

In [None]:
a.transpose(1, 0) # also works

In [None]:
a = np.arange(12).reshape(2,3,2)
a.transpose(0,2,1)

The property T is an accessor to this method

##### [mean()](https://numpy.org/doc/1.23/reference/generated/numpy.ndarray.mean.html), [var()](https://numpy.org/doc/1.23/reference/generated/numpy.ndarray.var.html), [std()](https://numpy.org/doc/1.23/reference/generated/numpy.ndarray.std.html)

*Returns the average, variance and std. deviation respectively of the array elements along given axis.*

##### [ravel()](https://numpy.org/doc/1.23/reference/generated/numpy.ndarray.ravel.html)

*Return a flattened array.*

In [None]:
x = np.array([[1, 2, 3], [4, 5, 6]])
np.ravel(x)

In [None]:
np.arange(12).reshape(2,3,2).transpose(0,2,1).ravel()

### [arange](https://numpy.org/doc/1.23/reference/generated/numpy.arange.html)

*Return evenly spaced values within a given interval.*

In [None]:
np.arange(3)

In [None]:
np.arange(3.0)

In [None]:
np.arange(3,7)

In [None]:
np.arange(3,7,2)

In [None]:
np.arange(-3, 3, 0.5)

### [linspace](https://numpy.org/doc/1.23/reference/generated/numpy.linspace.html)

*Returns num evenly spaced samples, calculated over the interval [start, stop].*


In [None]:
np.linspace(2.0, 3.0, num=5)

In [None]:
np.linspace(2.0, 3.0, num=5, endpoint=False)

In [None]:
np.linspace(2.0, 3.0, num=5, retstep=True)

Graphical illustration

In [None]:
import matplotlib.pyplot as plt

# Compare the sample positions produced with and without the endpoint.
N = 8
y = np.zeros(N)
x1 = np.linspace(0, 10, N, endpoint=True)
x2 = np.linspace(0, 10, N, endpoint=False)

fig, ax = plt.subplots()
ax.plot(x1, y, 'o')        # lower row: stop value included
ax.plot(x2, y + 0.5, 'o')  # upper row: stop value excluded
ax.set_ylim([-0.5, 1])
plt.show()

### [logspace](https://numpy.org/doc/1.23/reference/generated/numpy.logspace.html)

*Return numbers spaced evenly on a log scale.*

In [None]:
np.logspace(2.0, 3.0, num=4)

In [None]:
np.logspace(2.0, 3.0, num=4, endpoint=False)

In [None]:
np.logspace(2.0, 3.0, num=4, base=2.0)

Graphical illustration

In [None]:
import matplotlib.pyplot as plt

# Compare log-spaced samples produced with and without the endpoint.
N = 10
x1 = np.logspace(0.1, 1, N, endpoint=True)
x2 = np.logspace(0.1, 1, N, endpoint=False)
y = np.zeros(N)

fig, ax = plt.subplots()
ax.plot(x1, y, 'o')        # lower row: stop value included
ax.plot(x2, y + 0.5, 'o')  # upper row: stop value excluded
ax.set_ylim([-0.5, 1])
plt.show()

### [unique](https://numpy.org/doc/1.23/reference/generated/numpy.unique.html)

*Returns the sorted unique elements of an array. There are three optional outputs in addition to the unique elements:*

*-the indices of the input array that give the unique values*

*-the indices of the unique array that reconstruct the input array*

*-the number of times each unique value comes up in the input array*

In [None]:
np.unique([1, 1, 2, 2, 3, 3])

In [None]:
a = np.array([[1, 1], [2, 3]])
np.unique(a)

Return the unique rows of a 2D array

In [None]:
a = np.array([[1, 0, 0], [1, 0, 0], [2, 3, 4]])
np.unique(a, axis=0)

Return the indices of the original array that give the unique values:

In [None]:
a = np.array(['a', 'b', 'b', 'c', 'a'])
u, indices = np.unique(a, return_index=True)
print(u)
print(indices)

In [None]:
a[indices]

Reconstruct the input values from the unique values and counts:

In [None]:
a = np.array([1, 2, 6, 4, 2, 3, 2])
values, counts = np.unique(a, return_counts=True)
print(values)
print(counts)

In [None]:
np.repeat(values, counts)  # original order not preserved

### [zeros](https://numpy.org/doc/1.23/reference/generated/numpy.zeros.html), [ones](https://numpy.org/doc/1.23/reference/generated/numpy.ones.html)

*Return a new array of given shape and type, filled with zeros and ones respectively*


In [None]:
np.zeros((5,), dtype=int)

In [None]:
np.ones((5,), dtype=int)

In [None]:
np.zeros((2,2))

In [None]:
np.ones((2,2))

In [None]:
np.zeros((2,), dtype=[('x', 'i4'), ('y', 'i4')]) # custom dtype

### [where](https://numpy.org/doc/1.23/reference/generated/numpy.where.html)

*Return elements chosen from x or y depending on condition.*

*When only condition is provided, this function returns indices that match the condition.  In other words, it works as a shorthand for np.asarray(condition).nonzero()*

In [None]:
a = np.arange(10)

np.where(a < 5, a, 10*a)

This can be used on multidimensional arrays too:

In [None]:
np.where([[True, False], [True, True]],
         [[1, 2], [3, 4]],
         [[9, 8], [7, 6]])

The shapes of x, y, and the condition are broadcast together:

In [None]:
x, y = np.ogrid[:3, :4]

In [None]:
np.where(x < y, x, 10 + y)  # both x and 10+y are broadcast

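A simple *where* clause (condition only) yields a tuple of index arrays, one per dimension; pairing the entries at the same position in each array gives the coordinates of the matching elements.  In this case, the following coordinates can be created: (0, 0), (0, 1), (0, 2), ..., (2, 1)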

In [None]:
a = np.array([[0, 1, 2],
              [0, 2, 4],
              [0, 3, 6]])

np.where(a < 4)

In [None]:
np.where(a < 4, a, -1)  # -1 is broadcast

When only condition is provided

In [None]:
a = np.array([[0, 1, 2],
              [0, 2, 4],
              [0, 3, 6]])
np.where(a < 4)

### [argmax](https://numpy.org/doc/1.23/reference/generated/numpy.argmax.html)

*Returns the indices of the maximum values along an axis.*

In [None]:
a = np.arange(6).reshape(2,3) + 10

In [None]:
np.argmax(a)


In [None]:
np.argmax(a, axis=0)


In [None]:
np.argmax(a, axis=1)

In [None]:
# Indexes of the maximal elements of a N-dimensional array
ind = np.unravel_index(np.argmax(a, axis=None), a.shape)
ind

In [None]:
a[ind]

In [None]:
b = np.arange(6)
b[1] = 5
np.argmax(b)  # Only the first occurrence is returned.

In [None]:
x = np.array([[4,2,3], [1,0,3]])
index_array = np.argmax(x, axis=-1)
# Same as np.amax(x, axis=-1, keepdims=True)
np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1)

In [None]:
np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1).squeeze(axis=-1)

In [None]:
# Setting keepdims to True keeps the reduced axis as a length-1 dimension,
# so the result has shape (2, 1, 4) instead of (2, 4).
x = np.arange(24).reshape((2, 3, 4))
res = np.argmax(x, axis=1, keepdims=True)
res.shape

### [argmin](https://numpy.org/doc/1.23/reference/generated/numpy.argmin.html)

*Returns the indices of the minimum values along an axis.*

In [None]:
a = np.arange(6).reshape(2,3) + 10

In [None]:
np.argmin(a)

In [None]:
np.argmin(a, axis=0)

In [None]:
np.argmin(a, axis=1)

In [None]:
ind = np.unravel_index(np.argmin(a, axis=None), a.shape)
ind

In [None]:
a[ind]

In [None]:
b = np.arange(6) + 10
b[4] = 10
np.argmin(b)  # Only the first occurrence is returned.

In [None]:
x = np.array([[4,2,3], [1,0,3]])
index_array = np.argmin(x, axis=-1)
# Same as np.amin(x, axis=-1, keepdims=True)
np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1)

In [None]:
# Same as np.amin(x, axis=-1)
np.take_along_axis(x, np.expand_dims(index_array, axis=-1), axis=-1).squeeze(axis=-1)

### [argsort](https://numpy.org/doc/1.23/reference/generated/numpy.argsort.html)

*Returns the indices that would sort an array.*

*Perform an indirect sort along the given axis using the algorithm specified by the kind keyword. It returns an array of indices of the same shape as a that index data along the given axis in sorted order.*

One dimensional array

In [None]:
x = np.array([3, 1, 2])
np.argsort(x)

Two dimensional array

In [None]:
x = np.array([[0, 3], [2, 2]])
ind = np.argsort(x, axis=0)  # sorts along first axis (down)
ind

In [None]:
np.take_along_axis(x, ind, axis=0)  # same as np.sort(x, axis=0)

In [None]:
ind = np.argsort(x, axis=1)  # sorts along last axis (across)
ind

In [None]:
np.take_along_axis(x, ind, axis=1)  # same as np.sort(x, axis=1)

Indices of the sorted elements of a N-dimensional array:

In [None]:
ind = np.unravel_index(np.argsort(x, axis=None), x.shape)
ind

In [None]:
x[ind]  # same as np.sort(x, axis=None)

Sorting with keys:

In [None]:
x = np.array([(1, 0), (0, 1)], dtype=[('x', '<i4'), ('y', '<i4')])
x

In [None]:
np.argsort(x, order=('x','y'))

In [None]:
np.argsort(x, order=('y','x'))

### [random.seed](https://numpy.org/doc/1.23/reference/random/generated/numpy.random.seed.html)

*Reseed a legacy MT19937 BitGenerator*

Legacy function

In [None]:
from numpy.random import MT19937
from numpy.random import RandomState, SeedSequence
rs = RandomState(MT19937(SeedSequence(123456789)))

In [None]:
# Later, you want to restart the stream
rs = RandomState(MT19937(SeedSequence(987654321)))

In [None]:
rs.choice(100)

### [random.permutation](https://numpy.org/doc/1.23/reference/random/generated/numpy.random.permutation.html)

*Randomly permute a sequence, or return a permuted range.*

*If x is a multi-dimensional array, it is only shuffled along its first index.*


In [None]:
np.random.permutation(10)

In [None]:
np.random.permutation([1, 4, 9, 12, 15])

In [None]:
arr = np.arange(9).reshape((3, 3))
np.random.permutation(arr)

### [count_nonzero](https://numpy.org/doc/1.23/reference/generated/numpy.count_nonzero.html)

*Counts the number of non-zero values in the array a.*

In [None]:
np.count_nonzero(np.eye(4))


In [None]:
a = np.array([[0, 1, 7, 0],
              [3, 0, 2, 19]])
np.count_nonzero(a)

In [None]:
np.count_nonzero(a, axis=0)

In [None]:
np.count_nonzero(a, axis=1)

In [None]:
np.count_nonzero(a, axis=1, keepdims=True)

### [var](https://numpy.org/doc/1.23/reference/generated/numpy.var.html), [std](https://numpy.org/doc/1.23/reference/generated/numpy.std.html)

*Compute the variance and std. deviation (respectively) on a flattened array (by default) or along the specified axis.*

In [None]:
a = np.array([[1, 2], [3, 4]])
np.var(a)

In [None]:
np.var(a, axis=0)

In [None]:
np.var(a, axis=1)

In [None]:
a = np.zeros((2, 512*512), dtype=np.float32)
a[0, :] = 1.0
a[1, :] = 0.1
np.var(a)

In [None]:
np.var(a, dtype=np.float64) # computing in float64 is more accurate than in the array's float32

Specifying a where argument:

In [None]:
a = np.array([[14, 8, 11, 10], [7, 9, 10, 11], [10, 15, 5, 10]])
np.var(a)

In [None]:
np.var(a, where=[[True], [True], [False]]) # only first two rows

In [None]:
np.var(a, where=[True, True, False, False]) # only first two columns

### [row_stack](https://numpy.org/doc/1.23/reference/generated/numpy.row_stack.html) (vstack)

*Stack arrays in sequence vertically (row wise).*  

*The functions concatenate, stack and block provide more general stacking and concatenation operations.*

In [None]:
a = np.array([1, 2, 3])
b = np.array([4, 5, 6])
np.vstack((a,b))

In [None]:
a = np.array([[1], [2], [3]])
b = np.array([[4], [5], [6]])
np.vstack((a,b))

### [column_stack](https://numpy.org/doc/1.23/reference/generated/numpy.column_stack.html) (hstack)

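*Stack 1-D arrays as columns into a 2-D array. (By contrast, `hstack` stacks arrays in sequence horizontally (column wise); for 1-D inputs it returns a single 1-D array.)*  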

*The functions concatenate, stack and block provide more general stacking and concatenation operations.*

In [None]:
a = np.array([1,2,3])
b = np.array([2,3,4])
np.column_stack((a,b))

## pandas

#### Functions

##### [cut()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.cut.html)

*Bin values into discrete intervals.*

In [None]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3) # which bin does each data belong to?

In [None]:
pd.cut(np.array([1, 7, 5, 4, 6, 3]), 3, retbins=True) # also returns the bins

In [None]:
# assigns data specific labels, and ordered.
pd.cut(np.array([1, 7, 5, 4, 6, 3]),
       3, labels=["bad", "medium", "good"])

In [None]:
# Passing a Series as an input returns a Series with categorical dtype:
s = pd.Series(np.array([2, 4, 6, 8, 10]),
              index=['a', 'b', 'c', 'd', 'e'])
pd.cut(s, 3)

### [DataFrame](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.html)

*Two-dimensional, size-mutable, potentially heterogeneous tabular data.*

*Data structure also contains labeled axes (rows and columns). Arithmetic operations align on both row and column labels. Can be thought of as a dict-like container for Series objects. The primary pandas data structure.*

#### Examples

In [None]:
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df

In [None]:
# Notice that the inferred dtype is int64.
df.dtypes

In [None]:
# To enforce a single dtype:
df = pd.DataFrame(data=d, dtype=np.int8)
df.dtypes

In [None]:
# Constructing DataFrame from a dictionary including Series:
d = {'col1': [0, 1, 2, 3], 'col2': pd.Series([2, 3], index=[2, 3])}
pd.DataFrame(data=d, index=[0, 1, 2, 3])

In [None]:
# Constructing DataFrame from numpy ndarray:
df2 = pd.DataFrame(np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]]),
                   columns=['a', 'b', 'c'])
df2

In [None]:
# Constructing DataFrame from a numpy ndarray that has labeled columns:
data = np.array([(1, 2, 3), (4, 5, 6), (7, 8, 9)],
                dtype=[("a", "i4"), ("b", "i4"), ("c", "i4")])
df3 = pd.DataFrame(data, columns=['c', 'a'])
df3

#### Methods

##### [astype()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.astype.html)

*Cast a pandas object to a specified dtype.*

In [None]:
d = {'col1': [1, 2], 'col2': [3, 4]}
df = pd.DataFrame(data=d)
df.dtypes

In [None]:
# Cast all columns to int32
df.astype('int32').dtypes

In [None]:
# Cast col1 to int32 using a dictionary
df.astype({'col1': 'int32'}).dtypes

In [None]:
ser = pd.Series([1, 2], dtype='int32')
ser.astype('int64')

In [None]:
# int32 to category dtype
ser.astype('category')

##### [dtypes](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.dtypes.html)

*Return the dtypes in the DataFrame, as a Series with the data type of each column.*

In [None]:
df = pd.DataFrame({'float': [1.0],
                   'int': [1],
                   'datetime': [pd.Timestamp('20180310')],
                   'string': ['foo']})

df.dtypes

##### [select_dtypes()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.select_dtypes.html)

*Return a subset of the DataFrame’s columns based on the column dtypes.*

*- To select all numeric types, use np.number or 'number'*

*- To select strings you must use the object dtype, but note that this will return all object dtype columns*

*- To select datetimes, use np.datetime64, 'datetime' or 'datetime64'*

*- To select Pandas categorical dtypes, use 'category'*

In [None]:
df = pd.DataFrame({'a': [1, 2] * 3,
                   'b': [True, False] * 3,
                   'c': [1.0, 2.0] * 3})
df

In [None]:
df.select_dtypes(include='bool')

In [None]:
df.select_dtypes(include=['float64'])

In [None]:
df.select_dtypes(exclude=['int64'])

##### [to_numpy()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.to_numpy.html)

*Convert the DataFrame to a NumPy array.*

*By default, the dtype of the returned array will be the common NumPy dtype of all types in the DataFrame.*

*Alternatively, use **values** attribute*

In [None]:
pd.DataFrame({"A": [1, 2], "B": [3, 4]}).to_numpy()

In [None]:
pd.DataFrame({"A": [1, 2], "B": [3, 4]}).values # .to_numpy() is equivalent to .values

In [None]:
df = pd.DataFrame({"A": [1, 2], "B": [3.0, 4.5]})
df.to_numpy()

##### [head()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.head.html)

*Return the first n rows.*

In [None]:
df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
                   'monkey', 'parrot', 'shark', 'whale', 'zebra']})
df.head()

In [None]:
df.head(3) # Viewing first 3 lines

In [None]:
df.head(-3) # Viewing all, but last 3.

##### [tail()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.tail.html)

*Return the last n rows.*

In [None]:
df = pd.DataFrame({'animal': ['alligator', 'bee', 'falcon', 'lion',
                   'monkey', 'parrot', 'shark', 'whale', 'zebra']})

df.tail()

In [None]:
df.tail(3) # Viewing last 3 lines

In [None]:
df.tail(-3) # Viewing all but first 3 lines

##### [columns](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.columns.html)

*The column labels of the DataFrame.*

In [None]:
df = pd.DataFrame({'float': [1.0],
                   'int': [1],
                   'datetime': [pd.Timestamp('20180310')],
                   'string': ['foo']})

df.columns

##### [any()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.any.html), [all()](https://pandas.pydata.org/pandas-docs/stable/reference/api/pandas.DataFrame.all.html)

*Return whether any(all) element is True, potentially over an axis.*

*Returns False(True) unless there is at least one element within a series or along a Dataframe axis that is True(False) or equivalent (e.g. non-zero or non-empty)*

In [None]:
pd.Series([False, False]).any()

In [None]:
df = pd.DataFrame({"A": [1, 2], "B": [0, 2], "C": [0, 0]})
df.any()

In [None]:
df = pd.DataFrame({"A": [True, False], "B": [1, 0]})
df.any(axis=1) # across rows

In [None]:
df = pd.DataFrame({'col1': [True, True], 'col2': [True, False]})
df.all()

##### [index](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.index.html)

*The index (row labels) of the DataFrame.*

In [None]:
df = pd.DataFrame([('bird', 389.0),
                   ('bird', 24.0),
                   ('mammal', 80.5),
                   ('mammal', np.nan)],
                  index=['falcon', 'parrot', 'lion', 'monkey'],
                  columns=('class', 'max_speed'))

df.index

##### [loc](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.loc.html)

*Access a group of rows and columns by label(s) or a boolean array.*

In [None]:
df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
     index=['cobra', 'viper', 'sidewinder'],
     columns=['max_speed', 'shield'])

df.loc['viper']

In [None]:
# List of labels. Note using [[]] returns a DataFrame.
df.loc[['viper', 'sidewinder']]

In [None]:
# Single label for row and column
df.loc['cobra', 'shield']

In [None]:
# Conditional that returns a boolean Series
df.loc[df['shield'] > 6]

In [None]:
# Conditional that returns a boolean Series with column labels specified
df.loc[df['shield'] > 6, ['max_speed']]

In [None]:
# Callable that returns a boolean Series
df.loc[lambda df: df['shield'] == 8]

In [None]:
# Set value for all items matching the list of labels
df.loc[['viper', 'sidewinder'], ['shield']] = 50
df

In [None]:
# Set value for an entire row
df.loc['cobra'] = 10
df

In [None]:
# Set value for an entire column
df.loc[:, 'max_speed'] = 30
df

In [None]:
# Set value for rows matching a boolean condition
df.loc[df['shield'] > 35] = 0
df

In [None]:
df = pd.DataFrame([[1, 2], [4, 5], [7, 8]],
     index=[7, 8, 9], columns=['max_speed', 'shield'])
df.loc[7:9] # loc slices by label (these integers are index labels), so both the start and stop labels are included.

##### [iloc](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.iloc.html)

*Purely integer-location based indexing for selection by position.*

In [None]:
mydict = [{'a': 1, 'b': 2, 'c': 3, 'd': 4},
          {'a': 100, 'b': 200, 'c': 300, 'd': 400},
          {'a': 1000, 'b': 2000, 'c': 3000, 'd': 4000 }]
df = pd.DataFrame(mydict)
df

In [None]:
type(df.iloc[0])

In [None]:
# Indexing with scalar integer
df.iloc[0]

In [None]:
# Indexing with a list of integers.
df.iloc[[0, 1]]

In [None]:
# Indexing with a slice object
df.iloc[:3]

In [None]:
# Indexing with a boolean mask
df.iloc[[True, False, True]]

Indexing both axes

In [None]:
# With scalar integers
df.iloc[0, 1]

In [None]:
# With lists of integers.
df.iloc[[0, 2], [1, 3]]

In [None]:
# With a boolean array whose length matches the columns.
df.iloc[:, [True, False, True, False]]

##### [info()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.info.html)

*Prints information about a DataFrame including the index dtype and columns, non-null values and memory usage.*

In [None]:
int_values = [1, 2, 3, 4, 5]
text_values = ['alpha', 'beta', 'gamma', 'delta', 'epsilon']
float_values = [0.0, 0.25, 0.5, 0.75, 1.0]
df = pd.DataFrame({"int_col": int_values, "text_col": text_values,
                  "float_col": float_values})
df

In [None]:
df.info()

##### [describe()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.describe.html)

*Generate descriptive statistics including those that summarize the central tendency, dispersion and shape of a dataset’s distribution, excluding NaN values.*

In [None]:
# Describing a numeric Series.
s = pd.Series([1, 2, 3])
s.describe()

In [None]:
# Describing a categorical Series.
s = pd.Series(['a', 'a', 'b', 'c'])
s.describe()

In [None]:
# Describing a DataFrame. By default only numeric fields are returned.
df = pd.DataFrame({'categorical': pd.Categorical(['d','e','f']),
                   'numeric': [1, 2, 3],
                   'object': ['a', 'b', 'c']
                  })
df.describe()

In [None]:
# Describing all columns of a DataFrame regardless of data type.
df.describe(include='all')

In [None]:
# Describing a column from a DataFrame
df['numeric'].describe()

##### [insert()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.insert.html)

*Insert column into DataFrame at specified location.*

In [None]:
df = pd.DataFrame({'col1': [1, 2], 'col2': [3, 4]})
df

In [None]:
df.insert(1, "newcol", [99, 99])
df

In [None]:
df.insert(0, "col1", [100, 100], allow_duplicates=True)
df

##### [pop()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.pop.html)

*Return item and drop from frame. Raise KeyError if not found.*

In [None]:
df = pd.DataFrame([('falcon', 'bird', 389.0),
                   ('parrot', 'bird', 24.0),
                   ('lion', 'mammal', 80.5),
                   ('monkey', 'mammal', np.nan)],
                  columns=('name', 'class', 'max_speed'))
df

In [None]:
df.pop('class')
df

##### [copy()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.copy.html)

*Make a copy of this object’s indices and data.*


In [None]:
s = pd.Series([1, 2], index=["a", "b"])
deep = s.copy()
shallow = s.copy(deep=False)
# Use .iloc for positional assignment: s[0] on a string-labelled index relies
# on the deprecated "integer key means position" fallback.
s.iloc[0] = 2 # will affect shallow (unless Copy-on-Write is enabled)
shallow

In [None]:
deep # but will not affect deep

##### [plot()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.plot.html)

*Make plots of Series or DataFrame, using matplotlib (by default)*

##### [boxplot()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.boxplot.html)

*Make a box-and-whisker plot from DataFrame columns, optionally grouped by some other columns.*

In [None]:
np.random.seed(1234)
df = pd.DataFrame(np.random.randn(10, 4),
                  columns=['Col1', 'Col2', 'Col3', 'Col4'])
boxplot = df.boxplot(column=['Col1', 'Col2', 'Col3'])

##### [corr()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.corr.html)

*Compute pairwise correlation of columns, excluding NA/null values.*

In [None]:
df = pd.DataFrame([(.2, .3), (.0, .6), (.6, .0), (.2, .1)],
                  columns=['dogs', 'cats'])
df.corr()

In [None]:
df

##### [drop()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.drop.html)

*Remove rows or columns by specifying label names and corresponding axis, or by specifying directly index or column names.*

In [None]:
df = pd.DataFrame(np.arange(12).reshape(3, 4),
                  columns=['A', 'B', 'C', 'D'])
df

In [None]:
df.drop(['B', 'C'], axis=1)

In [None]:
df.drop(columns=['B', 'C'])

In [None]:
# drop a row by index
df.drop([0, 1])

##### [dropna()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.dropna.html)

*Remove missing values.*

In [None]:
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                   "toy": [np.nan, 'Batmobile', 'Bullwhip'],
                   "born": [pd.NaT, pd.Timestamp("1940-04-25"),
                            pd.NaT]}) # pd.NaT is Not a Time, np.nan is Not a Number.
df

In [None]:
# Drop the rows where at least one element is missing.
df.dropna()

In [None]:
# Drop the columns where at least one element is missing.
df.dropna(axis='columns') # same as df.dropna(axis=1)

In [None]:
# Drop the rows where all elements are missing.
df.dropna(how='all')

In [None]:
# Keep rows with at least 2 non-NAs.
df.dropna(thresh=2)

In [None]:
# Define in which columns to look for missing values.
df.dropna(subset=['name', 'toy'])

In [None]:
# Keep the DataFrame with valid entries in the same variable.
df.dropna(inplace=True)
df

##### [isna()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isna.html), [isnull()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.isnull.html)

*Return a boolean same-sized object indicating if the values are NA.*


In [None]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame(dict(age=[5, 6, np.nan],
                   born=[pd.NaT, pd.Timestamp('1939-05-27'),
                         pd.Timestamp('1940-04-25')],
                   name=['Alfred', 'Batman', ''],
                   toy=[None, 'Batmobile', 'Joker']))
df

In [None]:
df.isna()

In [None]:
# Show which entries in a Series are NA.
df['toy'].isna()

isnull() is an alias for isna()

In [None]:
df.isnull() # same as df.isna()

##### [notna()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.notna.html), [notnull()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.notnull.html)

*Return a boolean same-sized object indicating if the values are not NA.*


In [None]:
# np.nan is the canonical spelling; the np.NaN alias was removed in NumPy 2.0.
df = pd.DataFrame(dict(age=[5, 6, np.nan],
                   born=[pd.NaT, pd.Timestamp('1939-05-27'),
                         pd.Timestamp('1940-04-25')],
                   name=['Alfred', 'Batman', ''],
                   toy=[None, 'Batmobile', 'Joker']))
df

In [None]:
df.notna()

In [None]:
# Show which entries in a Series are not NA.
df['name'].notna() # empty string is not considered as na

notnull() is an alias for notna()

In [None]:
df.notnull() # same as df.notna()

##### [reset_index()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.reset_index.html)

*Reset the index of the DataFrame, and use the default one instead.*

In [None]:
df = pd.DataFrame([('bird', 389.0),
                   ('bird', 24.0),
                   ('mammal', 80.5),
                   ('mammal', np.nan)],
                  index=['falcon', 'parrot', 'lion', 'monkey'],
                  columns=('class', 'max_speed'))
df

In [None]:
df.reset_index()

In [None]:
df.reset_index(drop=True)

##### [replace()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.replace.html)

*Replace values given in to_replace with other values dynamically*

In [None]:
s = pd.Series([1, 2, 3, 4, 5])
s.replace(1, 5)

First parameter is *to_replace* (None by default), and second parameter is *value* to replace with (None by default)

In [None]:
df = pd.DataFrame({'A': [0, 1, 2, 3, 4],
                   'B': [5, 6, 7, 8, 9],
                   'C': ['a', 'b', 'c', 'd', 'e']})
df.replace(0, 5)

Replace list of values

In [None]:
df.replace([0, 1, 2, 3], 4)

In [None]:
# Replace a list of values from another list
df.replace([0, 1, 2, 3], [4, 3, 2, 1])

Replace values from a dictionary

In [None]:
df.replace({0: 10, 1: 100})

In [None]:
df.replace({'A': 0, 'B': 5}, 100) # Columns are keys.

In [None]:
df.replace({'A': {0: 100, 4: 400}})

Replace regular expression

In [None]:
df = pd.DataFrame({'A': ['bat', 'foo', 'bait'],
                   'B': ['abc', 'bar', 'xyz']})
df.replace(to_replace=r'^ba.$', value='new', regex=True)

In [None]:
df.replace({'A': r'^ba.$'}, {'A': 'new'}, regex=True)

In [None]:
s = pd.Series([10, 'a', 'a', 'b', 'a'])
s.replace('a', 11)

In [None]:
# Forward fill: blank out every 'a', then propagate the previous value.
# Spelled out explicitly because calling replace() without `value` (the
# implicit pad behaviour) is deprecated in recent pandas releases.
s.mask(s == 'a').ffill()

##### [transpose()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.transpose.html)

*Reflect the DataFrame over its main diagonal by writing rows as columns and vice-versa. The property T is an accessor to the method transpose().*

In [None]:
d1 = {'col1': [1, 2], 'col2': [3, 4]}
df1 = pd.DataFrame(data=d1)
df1

In [None]:
df1_transposed = df1.transpose() # or df1.T
df1_transposed

The property T is an accessor to this method

##### [hist()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.hist.html)

*Make a histogram of the DataFrame’s columns, using matplotlib.pyplot.hist().*


In [None]:
df = pd.DataFrame({
    'length': [1.5, 0.5, 1.2, 0.9, 3],
    'width': [0.7, 0.2, 0.15, 0.2, 1.1]
    }, index=['pig', 'rabbit', 'duck', 'chicken', 'horse'])
hist = df.hist(bins=3)

##### [sum()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sum.html)

*Return the sum of the values over the requested axis.  Equivalent to the method numpy.sum.*

In [None]:
s = pd.Series([4, 2, 0, 8, np.nan], name='legs')
s.sum() # skipna is True by default, so the NaN is ignored

In [None]:
s.sum(skipna=False)

##### [mean()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.mean.html), [median()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.median.html)

*Return the mean and median respectively of the values over the requested axis.*

##### [var()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.var.html)

*Return unbiased variance over requested axis.*

*Normalized by N-1 by default. This can be changed using the ddof argument.*

In [None]:
# Age and height of four people; person_id is used as the row index.
df = pd.DataFrame(
    {'age': [21, 25, 62, 43],
     'height': [1.61, 1.87, 1.49, 2.01]},
    index=pd.Index([0, 1, 2, 3], name='person_id'),
)
df

In [None]:
df.var() # Sample variance

In [None]:
df.var(ddof=0) # Population variance

##### [std()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.std.html)

*Return sample standard deviation over requested axis.*

*Normalized by N-1 by default. This can be changed using the ddof argument.*

In [None]:
df = pd.DataFrame({'person_id': [0, 1, 2, 3],
                  'age': [21, 25, 62, 43],
                  'height': [1.61, 1.87, 1.49, 2.01]}
                 ).set_index('person_id')
df

In [None]:
df.std() # Sample std. deviation

In [None]:
df.std(ddof=0) # Population std. deviation

##### [groupby()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.groupby.html)

*Group DataFrame using a mapper or by a Series of columns.*

*A groupby operation involves some combination of splitting the object, applying a function, and combining the results.*

In [None]:
df = pd.DataFrame({'Animal': ['Falcon', 'Falcon',
                              'Parrot', 'Parrot'],
                   'Max Speed': [380., 370., 24., 26.]})
df

In [None]:
df.groupby(['Animal']).mean()

In [None]:
l = [["a", 12, 12], [None, 12.3, 33.], ["b", 12.3, 123], ["a", 1, 1]]
df = pd.DataFrame(l, columns=["a", "b", "c"])
df

In [None]:
df.groupby(by=["a"]).sum()

In [None]:
df.groupby(by="a", dropna=False).sum()

##### [where()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.where.html)

*Replace values where the condition is False.*

*For each element in the calling DataFrame, if cond is True the element is used; otherwise the corresponding element from the DataFrame other is used.*

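*Note that NumPy's `np.where()` behaves differently: it is a function that takes up to three arguments, `np.where(condition, x, y)`, and picks from `x` where the condition is True and from `y` elsewhere.*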

In [None]:
s = pd.Series(range(5))
s.where(s > 0)

In [None]:
s.where(s > 1, 10)

In [None]:
df = pd.DataFrame(np.arange(10).reshape(-1, 2), columns=['A', 'B'])
df

In [None]:
# Boolean mask: True where the element is divisible by 3.
m = df % 3 == 0
# Keep elements where the mask is True; elsewhere take the value from -df.
df.where(m, -df)

##### [sort_values()](https://pandas.pydata.org/docs/reference/api/pandas.DataFrame.sort_values.html)

*Sort by the values along either axis.*

In [None]:
df = pd.DataFrame({
    'col1': ['A', 'A', 'B', np.nan, 'D', 'C'],
    'col2': [2, 1, 9, 8, 7, 4],
    'col3': [0, 1, 9, 4, 2, 3],
    'col4': ['a', 'B', 'c', 'D', 'e', 'F']
})
df

In [None]:
# Sort by col1
df.sort_values(by=['col1'])

In [None]:
# sort by multiple columns
df.sort_values(by=['col1', 'col2'])

In [None]:
# Sort in descending order
df.sort_values(by='col1', ascending=False)

In [None]:
# Sorting with a key function
df.sort_values(by='col4', key=lambda col: col.str.lower())

### [Series](https://pandas.pydata.org/docs/reference/api/pandas.Series.html)

*One-dimensional ndarray with axis labels (including time series).*

*Labels need not be unique but must be a hashable type.*

#### Examples

In [None]:
d = {'a': 1, 'b': 2, 'c': 3}
ser = pd.Series(data=d, index=['a', 'b', 'c'])
ser

In [None]:
d = {'a': 1, 'b': 2, 'c': 3}
ser = pd.Series(data=d, index=['x', 'y', 'z'])
ser

In [None]:
r = np.array([1, 2])  # source ndarray; pd.Series also accepts a plain list [1, 2]
# copy=False asks pandas not to copy the input, so the Series may share memory with r.
# NOTE(review): whether the assignment below is also visible in r depends on the
# pandas version / Copy-on-Write setting — confirm by inspecting r after this cell.
ser = pd.Series(r, copy=False)
ser.iloc[0] = 999  # overwrite the first element by position
ser

#### Methods

##### [to_frame()](https://pandas.pydata.org/docs/reference/api/pandas.Series.to_frame.html)

*Convert Series to DataFrame.*

In [None]:
s = pd.Series(["a", "b", "c"],
              name="vals")
s.to_frame()

##### [value_counts()](https://pandas.pydata.org/docs/reference/api/pandas.Series.value_counts.html)

*Return a Series containing counts of unique values.*

*The resulting object will be in descending order so that the first element is the most frequently-occurring element. Excludes NA values by default.*

In [None]:
index = pd.Index([3, 1, 2, 3, 4, np.nan])
index.value_counts()

In [None]:
# With normalize set to True, returns the relative frequency by dividing all values by the sum of values.
s = pd.Series([3, 1, 2, 3, 4, np.nan])
s.value_counts(normalize=True)

In [None]:
# Counting in bins
s.value_counts(bins=3)

##### [unique()](https://pandas.pydata.org/docs/reference/api/pandas.Series.unique.html)

*Return unique values of Series object.*


In [None]:
pd.Series([2, 1, 3, 3], name='A').unique()

##### [count()](https://pandas.pydata.org/docs/reference/api/pandas.Series.count.html)

*Return number of non-NA/null observations in the Series.*

In [None]:
s = pd.Series([0.0, 1.0, np.nan])
s.count()

##### [replace()](https://pandas.pydata.org/docs/reference/api/pandas.Series.replace.html)

*Replace values given in to_replace with other values dynamically.*

In [None]:
s = pd.Series([1, 2, 3, 4, 5])
s.replace(1, 5)

In [None]:
s = pd.Series([10, 'a', 'a', 'b', 'a'])
# Calling replace('a') with no value relied on an implicit forward fill that pandas
# deprecated in 2.1. Replace the matches with NaN and forward-fill explicitly instead.
s.replace('a', np.nan).ffill()

##### [mean()](https://pandas.pydata.org/docs/reference/api/pandas.Series.mean.html), [median()](https://pandas.pydata.org/docs/reference/api/pandas.Series.median.html)

*Return the mean and median respectively of the values over the requested axis.*

##### [var()](https://pandas.pydata.org/docs/reference/api/pandas.Series.var.html), [std()](https://pandas.pydata.org/docs/reference/api/pandas.Series.std.html)

*Return unbiased variance and std. deviation respectively over requested axis.*

*Normalized by N-1 by default. This can be changed using the ddof argument.*

##### [hist()](https://pandas.pydata.org/docs/reference/api/pandas.Series.hist.html)

*Draw histogram of the input series using matplotlib.*

### [Bunch](https://scikit-learn.org/stable/modules/generated/sklearn.utils.Bunch.html)

*Container object exposing keys as attributes.*

Typically returned by the `load_*` and `fetch_*` functions of sklearn.datasets. A Bunch usually exposes keys such as frame, data, target, target_names and feature_names; the exact set depends on the dataset.

### [read_csv](https://pandas.pydata.org/docs/reference/api/pandas.read_csv.html)

*Read a comma-separated values (csv) file into DataFrame.*

*Also supports optionally iterating or breaking of the file into chunks.*

In [None]:
# Reads test.csv, which has no header row, and names its two columns Col1 and Col2.
# Kept commented out — presumably because test.csv is not bundled with the notebook.
# pd.read_csv('test.csv', header=None, names=['Col1', 'Col2'])

### [concat](https://pandas.pydata.org/docs/reference/api/pandas.concat.html)

*Concatenate pandas objects along a particular axis with optional set logic along the other axes.*

Combine two Series.

In [None]:
s1 = pd.Series(['a', 'b'])
s2 = pd.Series(['c', 'd'])
pd.concat([s1, s2])

Clear the existing index and reset it in the result by setting the ignore_index option to True.

In [None]:
pd.concat([s1, s2], ignore_index=True)

Add a hierarchical index at the outermost level of the data with the keys option.

In [None]:
pd.concat([s1, s2], keys=['s1', 's2'])

Label the index keys you create with the names option.

In [None]:
pd.concat([s1, s2], keys=['s1', 's2'],
          names=['Series name', 'Row ID'])

Combine two DataFrame objects with identical columns.

In [None]:
# Two frames with the same columns, built column-wise.
df1 = pd.DataFrame({'letter': ['a', 'b'], 'number': [1, 2]})
df2 = pd.DataFrame({'letter': ['c', 'd'], 'number': [3, 4]})
# Stack them vertically; ignore_index=True renumbers the rows 0..3.
pd.concat([df1, df2], ignore_index=True)

Combine DataFrame objects with overlapping columns and return everything. Columns outside the intersection will be filled with NaN values.

In [None]:
df3 = pd.DataFrame([['c', 3, 'cat'], ['d', 4, 'dog']],
                   columns=['letter', 'number', 'animal'])

pd.concat([df1, df3], sort=False, ignore_index=True)

Combine DataFrame objects with overlapping columns and return only those that are shared by passing inner to the join keyword argument.

In [None]:
pd.concat([df1, df3], join="inner", ignore_index=True)

Combine DataFrame objects horizontally along the x axis by passing in axis=1.

In [None]:
df4 = pd.DataFrame([['bird', 'polly'], ['monkey', 'george']],
                   columns=['animal', 'name'])
pd.concat([df1, df4], axis=1)

In [None]:
# Two one-row frames that share the index label 'a'.
df5 = pd.DataFrame([1], index=['a'])
df6 = pd.DataFrame([2], index=['a'])
try:
    # verify_integrity=True rejects a result whose index would contain duplicates
    # (here 'a' twice) by raising ValueError; ignore_index=True would renumber instead.
    pd.concat(objs=[df5, df6], verify_integrity=True)
except ValueError:
    print("Exception raised!")

### [get_dummies](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html)

*Convert categorical variable into dummy/indicator variables.*

In [None]:
s = pd.Series(list('abca'))
pd.get_dummies(s)

In [None]:
s1 = ['a', 'b', np.nan]
pd.get_dummies(s1)

In [None]:
pd.get_dummies(s1, dummy_na=True)

In [None]:
df = pd.DataFrame({'A': ['a', 'b', 'a'], 'B': ['b', 'a', 'c'],
                   'C': [1, 2, 3]})
# Two prefixes are supplied, one per string column (A -> 'col1', B -> 'col2');
# presumably the numeric column C is passed through unencoded — verify in the output.
pd.get_dummies(df, prefix=['col1', 'col2'])

In [None]:
pd.get_dummies(pd.Series(list('abcaa')))

In [None]:
pd.get_dummies(pd.Series(list('abcaa')), drop_first=True)

In [None]:
pd.get_dummies(pd.Series(list('abc')), dtype=float)

## pandas.plotting

### [scatter_matrix](https://pandas.pydata.org/docs/reference/api/pandas.plotting.scatter_matrix.html)

*Draw a matrix of scatter plots.*

In [None]:
# Draw the 1000x4 standard-normal sample from a seeded generator so the figure is
# identical on every Restart & Run All; a local generator leaves NumPy's global
# random state untouched.
df = pd.DataFrame(np.random.default_rng(0).standard_normal((1000, 4)),
                  columns=['A', 'B', 'C', 'D'])
pd.plotting.scatter_matrix(df, alpha=0.2)

## sns

### [histplot](https://seaborn.pydata.org/generated/seaborn.histplot.html)

*Plot univariate or bivariate histograms to show distributions of datasets.*

In [None]:
penguins = sns.load_dataset("penguins")
sns.histplot(data=penguins, x="flipper_length_mm")

Flip the plot by assigning the data variable to the y axis:

In [None]:
sns.histplot(data=penguins, y="flipper_length_mm")

Check how well the histogram represents the data by specifying a different bin width:

In [None]:
sns.histplot(data=penguins, x="flipper_length_mm", binwidth=3)

You can also define the total number of bins to use:

In [None]:
sns.histplot(data=penguins, x="flipper_length_mm", bins=30)

### [scatterplot](https://seaborn.pydata.org/generated/seaborn.scatterplot.html)

*Draw a scatter plot with possibility of several semantic groupings.*


In [None]:
tips = sns.load_dataset("tips")
sns.scatterplot(data=tips, x="total_bill", y="tip")

Assigning a variable to hue will map its levels to the color of the points:

In [None]:
sns.scatterplot(data=tips, x="total_bill", y="tip", hue="time")

Pass the name of a categorical palette or explicit colors (as a Python list or dictionary) to force categorical mapping of the hue variable:

In [None]:
sns.scatterplot(data=tips, x="total_bill", y="tip", hue="size", palette="deep")

### [heatmap](https://seaborn.pydata.org/generated/seaborn.heatmap.html)

*Plot rectangular data as a color-encoded matrix.*

In [None]:
# Fix NumPy's global seed so the random matrix (and the heatmap) is reproducible.
import numpy as np
np.random.seed(0)
# Switch on seaborn's default theme before plotting.
import seaborn as sns
sns.set_theme()
# 10x12 matrix of uniform random values in [0, 1), drawn as a colour-encoded grid.
uniform_data = np.random.rand(10, 12)
ax = sns.heatmap(uniform_data)

Plot a dataframe with meaningful row and column labels:

In [None]:
flights = sns.load_dataset("flights")
flights = flights.pivot(index="month", columns="year", values="passengers")
ax = sns.heatmap(flights)

Annotate each cell with the numeric value using integer formatting:

In [None]:
ax = sns.heatmap(flights, annot=True, fmt="d")

Use a different colormap:

In [None]:
ax = sns.heatmap(flights, cmap="YlGnBu")

### [pairplot](https://seaborn.pydata.org/generated/seaborn.pairplot.html)

*Plot pairwise relationships in a dataset.*

In [None]:
penguins = sns.load_dataset("penguins")
sns.pairplot(penguins)

Assigning a hue variable adds a semantic mapping and changes the default marginal plot to a layered kernel density estimate (KDE):

In [None]:
sns.pairplot(penguins, hue="species")

Setting kind="hist" uses histplot() to draw both the bivariate (off-diagonal) and univariate (diagonal) histograms:

In [None]:
sns.pairplot(penguins, kind="hist")

### [set_style](https://seaborn.pydata.org/generated/seaborn.set_style.html)

*Set the parameters that control the general style of the plots.*

*The style parameters control properties like the color of the background and whether a grid is enabled by default.*

In [None]:
sns.set_style("whitegrid")
sns.barplot(x=["A", "B", "C"], y=[1, 3, 2])

You can also selectively override seaborn’s default parameter values:

In [None]:
sns.set_style("darkgrid", {"grid.color": ".6", "grid.linestyle": ":"})
sns.lineplot(x=["A", "B", "C"], y=[1, 3, 2])

## sklearn.feature_extraction

### [DictVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.DictVectorizer.html?highlight=dictvectorizer)
*Transforms lists of feature-value mappings to vectors.*

*This transformer turns lists of mappings (dict-like objects) of feature names to feature values into Numpy arrays or scipy.sparse matrices for use with scikit-learn estimators.*

#### Examples

In [None]:
from sklearn.feature_extraction import DictVectorizer
v = DictVectorizer(sparse=False)
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
X = v.fit_transform(D)
X

In [None]:
# Round trip: inverse_transform should give back the original mappings (values as floats).
# The result is printed because only a cell's last expression is displayed automatically.
print(v.inverse_transform(X) == [{'bar': 2.0, 'foo': 1.0},
                                 {'baz': 1.0, 'foo': 3.0}])
# Features that were not seen during fit ('unseen_feature') are silently ignored.
v.transform({'foo': 4, 'unseen_feature': 3})

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_selection import SelectKBest, chi2
v = DictVectorizer()
D = [{'foo': 1, 'bar': 2}, {'foo': 3, 'baz': 1}]
X = v.fit_transform(D)
# Fit a chi2-scored selector that keeps the 2 best features; [0, 1] are the
# target labels for the two samples in D.
support = SelectKBest(chi2, k=2).fit(X, [0, 1])
# Feature names known to the vectorizer before it is restricted in the next cell.
v.get_feature_names_out()

In [None]:
v.restrict(support.get_support())

In [None]:
v.get_feature_names_out()

#### Methods

##### fit -- <sub><sup>*Learn a list of feature-name-to-index mappings.*</sup></sub>

##### transform -- <sub><sup>*Transform feature-value dicts to an array or sparse matrix.*</sup></sub>

##### fit_transform -- <sub><sup>*Learn a list of feature-name-to-index mappings and transform X.*</sup></sub>

### [FeatureHasher](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.FeatureHasher.html)

*Implements feature hashing, aka the hashing trick.*

*This class turns sequences of symbolic feature names (strings) into scipy.sparse matrices, using a hash function to compute the matrix column corresponding to a name. The hash function employed is the signed 32-bit version of Murmurhash3.*

In [None]:
from sklearn.feature_extraction import FeatureHasher
h = FeatureHasher(n_features=10)
D = [{'dog': 1, 'cat':2, 'elephant':4},{'dog': 2, 'run': 5}]
f = h.transform(D)
f.toarray()

## sklearn.feature_extraction.text

### [TfidfVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html?highlight=tfidfvectorizer)

*Convert a collection of raw documents to a matrix of TF-IDF features.*

*Equivalent to CountVectorizer followed by TfidfTransformer.*


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

In [None]:
print(X.shape)

### [CountVectorizer](https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.CountVectorizer.html)

*Convert a collection of text documents to a matrix of token counts.*

*This implementation produces a sparse representation of the counts using scipy.sparse.csr_matrix.*


#### Examples

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)
vectorizer.get_feature_names_out()

In [None]:
print(X.toarray())

#### Methods

##### fit -- <sub><sup>*Learn a vocabulary dictionary of all tokens in the raw documents.*</sup></sub>


##### transform -- <sub><sup>*Transform documents to document-term matrix.*</sup></sub>


##### fit_transform -- <sub><sup>*Learn the vocabulary dictionary and return document-term matrix.*</sup></sub>


#### Attributes

##### vocabulary_ -- <sub><sup>*A mapping of terms to feature indices.*</sup></sub>


## sklearn.feature_selection

### [VarianceThreshold](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.VarianceThreshold.html?highlight=variancethreshold)

*Feature selector that removes all low-variance features.*

*This feature selection algorithm looks only at the features (X), not the desired outputs (y), and can thus be used for unsupervised learning.*

#### Examples

In [None]:
from sklearn.feature_selection import VarianceThreshold
X = [[0, 2, 0, 3], [0, 1, 4, 3], [0, 1, 1, 3]]
selector = VarianceThreshold()
selector.fit_transform(X)

#### Methods

##### fit -- <sub><sup>*Learn empirical variances from X.*</sup></sub>


##### transform -- <sub><sup>*Reduce X to the selected features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>

### [SelectKBest](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectKBest.html?highlight=selectkbest)

*Select features according to the k highest scores.*


#### Examples

In [None]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectKBest, chi2
X, y = load_digits(return_X_y=True)
X.shape

In [None]:
X_new = SelectKBest(chi2, k=20).fit_transform(X, y)
X_new.shape

#### Methods

##### fit -- <sub><sup>*Run score function on (X, y) and get the appropriate features.*</sup></sub>


##### transform -- <sub><sup>*Reduce X to the selected features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


##### get_feature_names_out -- <sub><sup>*Mask feature names according to selected features.*</sup></sub>

### [SelectPercentile](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectPercentile.html?highlight=selectpercentile)

*Select features according to a percentile of the highest scores.*


#### Examples

In [None]:
from sklearn.datasets import load_digits
from sklearn.feature_selection import SelectPercentile, chi2
X, y = load_digits(return_X_y=True)
X.shape

In [None]:
X_new = SelectPercentile(chi2, percentile=10).fit_transform(X, y)
X_new.shape

#### Methods

##### fit -- <sub><sup>*Run score function on (X, y) and get the appropriate features.*</sup></sub>

##### transform -- <sub><sup>*Reduce X to the selected features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


##### get_feature_names_out -- <sub><sup>*Mask feature names according to selected features.*</sup></sub>

### [GenericUnivariateSelect](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.GenericUnivariateSelect.html?highlight=genericunivariateselect)

*Univariate feature selector with configurable strategy.*


#### Examples

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.feature_selection import GenericUnivariateSelect, chi2
X, y = load_breast_cancer(return_X_y=True)
X.shape

In [None]:
transformer = GenericUnivariateSelect(chi2, mode='k_best', param=20)
X_new = transformer.fit_transform(X, y)
X_new.shape

#### Methods

##### fit -- <sub><sup>*Run score function on (X, y) and get the appropriate features.*</sup></sub>

##### transform -- <sub><sup>*Reduce X to the selected features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


##### get_feature_names_out -- <sub><sup>*Mask feature names according to selected features.*</sup></sub>

### [mutual_info_regression](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_regression.html?highlight=mutual_info_regression)

*Estimate mutual information for a continuous target variable.*

*Mutual information (MI) [1] between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.*

### [mutual_info_classif](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.mutual_info_classif.html?highlight=mutual_info_classif)

*Estimate mutual information for a discrete target variable.*

*Mutual information (MI) [1] between two random variables is a non-negative value, which measures the dependency between the variables. It is equal to zero if and only if two random variables are independent, and higher values mean higher dependency.*

### [RFE](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFE.html)

*Feature ranking with recursive feature elimination.*

#### Examples

In [None]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = SVR(kernel="linear")
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y)
selector.support_

In [None]:
selector.ranking_

#### Methods

##### fit -- <sub><sup>*Fit the RFE model and then the underlying estimator on the selected features.*</sup></sub>


##### transform -- <sub><sup>*Reduce X to the selected features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


##### get_feature_names_out -- <sub><sup>*Mask feature names according to selected features.*</sup></sub>

### [RFECV](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.RFECV.html)

*Recursive feature elimination with cross-validation to select the number of features.*

#### Examples

In [None]:
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFECV
from sklearn.svm import SVR
X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = SVR(kernel="linear")
selector = RFECV(estimator, step=1, cv=5)
selector = selector.fit(X, y)
selector.support_

In [None]:
selector.ranking_

#### Methods

##### fit -- <sub><sup>*Fit the RFE model and automatically tune the number of selected features.*</sup></sub>


##### transform -- <sub><sup>*Reduce X to the selected features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


##### get_feature_names_out -- <sub><sup>*Mask feature names according to selected features.*</sup></sub>

### [SelectFromModel](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SelectFromModel.html)

*Meta-transformer for selecting features based on importance weights.*

#### Examples

In [None]:
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
X = [[ 0.87, -1.34,  0.31 ],
     [-2.79, -0.02, -0.85 ],
     [-1.34, -0.48, -2.55 ],
     [ 1.92,  1.48,  0.65 ]]
y = [0, 1, 0, 1]
selector = SelectFromModel(estimator=LogisticRegression()).fit(X, y)
selector.estimator_.coef_

In [None]:
selector.threshold_

In [None]:
selector.get_support()

In [None]:
selector.transform(X)

#### Methods

##### fit -- <sub><sup>*Fit the SelectFromModel meta-transformer.*</sup></sub>


##### transform -- <sub><sup>*Reduce X to the selected features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


##### get_feature_names_out -- <sub><sup>*Mask feature names according to selected features.*</sup></sub>

### [SequentialFeatureSelector](https://scikit-learn.org/stable/modules/generated/sklearn.feature_selection.SequentialFeatureSelector.html)

*Transformer that performs Sequential Feature Selection.*

*This Sequential Feature Selector adds (forward selection) or removes (backward selection) features to form a feature subset in a greedy fashion. At each stage, this estimator chooses the best feature to add or remove based on the cross-validation score of an estimator. In the case of unsupervised learning, this Sequential Feature Selector looks only at the features (X), not the desired outputs (y).*

#### Examples

In [None]:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.neighbors import KNeighborsClassifier
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y=True)
knn = KNeighborsClassifier(n_neighbors=3)
sfs = SequentialFeatureSelector(knn, n_features_to_select=3)
sfs.fit(X, y)

In [None]:
sfs.get_support()

In [None]:
sfs.transform(X).shape

#### Methods

##### fit -- <sub><sup>*Learn the features to select from X.*</sup></sub>


##### transform -- <sub><sup>*Reduce X to the selected features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


##### get_feature_names_out -- <sub><sup>*Mask feature names according to selected features.*</sup></sub>

## sklearn.impute

### [SimpleImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.SimpleImputer.html)

*Imputation transformer for completing missing values.*


#### Examples

In [None]:
import numpy as np
from sklearn.impute import SimpleImputer
# strategy='mean': each NaN is replaced by the mean of its column.
imp_mean = SimpleImputer(missing_values=np.nan, strategy='mean')
# The per-column means are learned here from the non-missing values: 7.0, 3.5 and 6.0.
imp_mean.fit([[7, 2, 3], [4, np.nan, 6], [10, 5, 9]])

# transform() fills the gaps in new data with the means learned during fit,
# not with the means of X itself.
X = [[np.nan, 2, 3], [4, np.nan, 6], [10, np.nan, 9]]
print(imp_mean.transform(X))

#### Methods

##### fit -- <sub><sup>*Fit the imputer on X.*</sup></sub>

##### transform -- <sub><sup>*Impute all missing values in X.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


#### Attributes

##### statistics_ -- <sub><sup>*The imputation fill value for each feature.*</sup></sub>


### [KNNImputer](https://scikit-learn.org/stable/modules/generated/sklearn.impute.KNNImputer.html)

*Imputation for completing missing values using k-Nearest Neighbors.*

*Each sample’s missing values are imputed using the mean value from n_neighbors nearest neighbors found in the training set. Two samples are close if the features that neither is missing are close.*


#### Examples

In [None]:
import numpy as np
from sklearn.impute import KNNImputer
X = [[1, 2, np.nan], [3, 4, 3], [np.nan, 6, 5], [8, 8, 7]]
imputer = KNNImputer(n_neighbors=2)
imputer.fit_transform(X)

#### Methods

##### fit -- <sub><sup>*Fit the imputer on X.*</sup></sub>

##### transform -- <sub><sup>*Impute all missing values in X.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


### [MissingIndicator](https://scikit-learn.org/stable/modules/generated/sklearn.impute.MissingIndicator.html)

*Binary indicators for missing values.*

*Note that this component typically should not be used in a vanilla Pipeline consisting of transformers and a classifier, but rather could be added using a FeatureUnion or ColumnTransformer.*

#### Examples

In [None]:
import numpy as np
from sklearn.impute import MissingIndicator
X1 = np.array([[np.nan, 1, 3],
               [4, 0, np.nan],
               [8, 1, 0]])
X2 = np.array([[5, 1, np.nan],
               [np.nan, 2, 3],
               [2, 4, 0]])
indicator = MissingIndicator()
indicator.fit(X1)

X2_tr = indicator.transform(X2)
X2_tr

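#### Methods

##### fit -- <sub><sup>*Fit the transformer on X.*</sup></sub>

##### transform -- <sub><sup>*Generate missing values indicator for X.*</sup></sub>

##### fit_transform -- <sub><sup>*Generate missing values indicator for X.*</sup></sub>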

## sklearn.preprocessing

### [StandardScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html)

*Standardize features by removing the mean and scaling to unit variance.*

*The standard score of a sample x is calculated as: z = (x - u) / s*

*where u is the mean of the training samples or zero if with_mean=False, and s is the standard deviation of the training samples or one if with_std=False.*


#### Examples

In [None]:
from sklearn.preprocessing import StandardScaler
data = [[0, 0], [0, 0], [1, 1], [1, 1]]
scaler = StandardScaler()
print(scaler.fit(data))

In [None]:
print(scaler.mean_)

In [None]:
print(scaler.transform(data))

In [None]:
print(scaler.transform([[2, 2]]))

#### Methods

##### fit -- <sub><sup>*Compute the mean and std to be used for later scaling.*</sup></sub>


##### partial_fit -- <sub><sup>*Online computation of mean and std on X for later scaling.*</sup></sub>

##### transform -- <sub><sup>*Perform standardization by centering and scaling.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>



### [MinMaxScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html)

*Transform features by scaling each feature to a given range.*

*This estimator scales and translates each feature individually such that it is in the given range on the training set, e.g. between zero and one.*





#### Examples

In [None]:
from sklearn.preprocessing import MinMaxScaler
data = [[-1, 2], [-0.5, 6], [0, 10], [1, 18]]
scaler = MinMaxScaler()
print(scaler.fit(data))

In [None]:
print(scaler.data_max_)

In [None]:
print(scaler.transform(data))

In [None]:
print(scaler.transform([[2, 2]]))

#### Methods

##### fit -- <sub><sup>*Compute the minimum and maximum to be used for later scaling.*</sup></sub>


##### partial_fit -- <sub><sup>*Online computation of min and max on X for later scaling.*</sup></sub>

##### transform -- <sub><sup>*Scale features of X according to feature_range.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>



### [MaxAbsScaler](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MaxAbsScaler.html)

*Scale each feature by its maximum absolute value.*

*This estimator scales each feature individually such that the maximal absolute value of each feature in the training set will be 1.0. It does not shift/center the data, and thus does not destroy any sparsity.*

#### Examples

In [None]:
from sklearn.preprocessing import MaxAbsScaler
X = [[ 1., -1.,  2.],
     [ 2.,  0.,  0.],
     [ 0.,  1., -1.]]
transformer = MaxAbsScaler().fit(X)
transformer

In [None]:
transformer.transform(X)

#### Methods

##### fit -- <sub><sup>*Compute the maximum absolute value to be used for later scaling.*</sup></sub>


##### partial_fit -- <sub><sup>*Online computation of max absolute value of X for later scaling.*</sup></sub>

##### transform -- <sub><sup>*Scale the data.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>



### [FunctionTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.FunctionTransformer.html)

*Constructs a transformer from an arbitrary callable.*

*A FunctionTransformer forwards its X (and optionally y) arguments to a user-defined function or function object and returns the result of this function. This is useful for stateless transformations such as taking the log of frequencies, doing custom scaling, etc.*

#### Examples

In [None]:
import numpy as np
from sklearn.preprocessing import FunctionTransformer
transformer = FunctionTransformer(np.log1p)
X = np.array([[0, 1], [2, 3]])
transformer.transform(X)

#### Methods

##### fit -- <sub><sup>*Fit transformer by checking X.*</sup></sub>

##### transform -- <sub><sup>*Transform X using the forward function.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>

### [PolynomialFeatures](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.PolynomialFeatures.html)

*Generate polynomial and interaction features.*

*Generate a new feature matrix consisting of all polynomial combinations of the features with degree less than or equal to the specified degree. For example, if an input sample is two dimensional and of the form [a, b], the degree-2 polynomial features are [1, a, b, a^2, ab, b^2].*

#### Examples

In [None]:
import numpy as np
from sklearn.preprocessing import PolynomialFeatures
X = np.arange(6).reshape(3, 2)
X

In [None]:
poly = PolynomialFeatures(2)
poly.fit_transform(X)


In [None]:
poly = PolynomialFeatures(interaction_only=True)
poly.fit_transform(X)

#### Methods

##### fit -- <sub><sup>*Compute number of output features.*</sup></sub>


##### transform -- <sub><sup>*Transform data to polynomial features.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>


##### get_feature_names_out -- <sub><sup>*Get output feature names for transformation.*</sup></sub>

### [KBinsDiscretizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.KBinsDiscretizer.html)

*Bin continuous data into intervals.*

#### Examples

In [None]:
from sklearn.preprocessing import KBinsDiscretizer
# Four samples with four continuous features each.
X = [[-2, 1, -4,   -1],
     [-1, 2, -3, -0.5],
     [ 0, 3, -2,  0.5],
     [ 1, 4, -1,    2]]
# Three bins per feature, reported as ordinal bin indices, using the
# 'uniform' binning strategy.
est = KBinsDiscretizer(n_bins=3, encode='ordinal', strategy='uniform')
est.fit(X)

In [None]:
# Discretize X with the fitted estimator; Xt is reused by the
# inverse_transform cell further down.
Xt = est.transform(X)
Xt

In [None]:
est.bin_edges_[0]

In [None]:
est.inverse_transform(Xt)

#### Methods

##### fit -- <sub><sup>*Fit the estimator.*</sup></sub>

##### transform -- <sub><sup>*Discretize the data.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>

### [OneHotEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html)

*Encode categorical features as a one-hot numeric array.*

*The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are encoded using a one-hot (aka ‘one-of-K’ or ‘dummy’) encoding scheme. This creates a binary column for each category and returns a sparse matrix or dense array (depending on the `sparse_output` parameter, which was named `sparse` before scikit-learn 1.2).*

#### Examples

In [None]:
from sklearn.preprocessing import OneHotEncoder
# handle_unknown='ignore': categories that were not seen during fit are
# tolerated at transform time instead of raising an error.
enc = OneHotEncoder(handle_unknown='ignore')
# Two categorical features per sample: a string and an integer.
X = [['Male', 1], ['Female', 3], ['Female', 2]]
enc.fit(X)

In [None]:
enc.categories_

In [None]:
enc.transform([['Female', 1], ['Male', 4]]).toarray()

In [None]:
enc.inverse_transform([[0, 1, 1, 0, 0], [0, 0, 0, 1, 0]])

In [None]:
enc.get_feature_names_out(['gender', 'group'])

One can always drop the first column for each feature.

In [None]:
# drop='first' removes the first category column of every feature.
drop_enc = OneHotEncoder(drop='first').fit(X)
drop_enc.categories_

In [None]:
drop_enc.transform([['Female', 1], ['Male', 2]]).toarray()

Or drop a column only for features that have exactly 2 categories.

In [None]:
# drop='if_binary' removes a column only for features with two categories.
drop_binary_enc = OneHotEncoder(drop='if_binary').fit(X)
drop_binary_enc.categories_

In [None]:
drop_binary_enc.transform([['Female', 1], ['Male', 2]]).toarray()

#### Methods

##### fit -- <sub><sup>*Fit OneHotEncoder to X.*</sup></sub>

##### transform -- <sub><sup>*Transform X using one-hot encoding.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit OneHotEncoder to X, then transform X.*</sup></sub>

#### Attributes

##### categories_ -- <sub><sup>*The categories of each feature determined during fitting (in order of the features in X and corresponding with the output of transform).*</sup></sub>

### [LabelEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html)

*Encode target labels with value between 0 and n_classes-1.*

*This transformer should be used to encode target values, i.e. y, and not the input X.*

#### Examples

In [None]:
from sklearn import preprocessing
# Fit on integer target labels (the value 2 is repeated on purpose).
le = preprocessing.LabelEncoder()
le.fit([1, 2, 2, 6])

In [None]:
le.classes_

In [None]:
le.transform([1, 1, 2, 6])

In [None]:
le.inverse_transform([0, 0, 1, 2])

It can also be used to transform non-numerical labels (as long as they are hashable and comparable) to numerical labels.

In [None]:
# A fresh encoder, this time fitted on string labels.
le = preprocessing.LabelEncoder()
le.fit(["paris", "paris", "tokyo", "amsterdam"])

In [None]:
list(le.classes_)

In [None]:
le.transform(["tokyo", "tokyo", "paris"])

In [None]:
list(le.inverse_transform([2, 2, 1]))

#### Methods

##### fit -- <sub><sup>*Fit label encoder.*</sup></sub>

##### transform -- <sub><sup>*Transform labels to normalized encoding.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit label encoder and return encoded labels.*</sup></sub>

#### Attributes

##### classes_ -- <sub><sup>*Holds the label for each class.*</sup></sub>

### [OrdinalEncoder](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OrdinalEncoder.html)

*Encode categorical features as an integer array.*

*The input to this transformer should be an array-like of integers or strings, denoting the values taken on by categorical (discrete) features. The features are converted to ordinal integers. This results in a single column of integers (0 to n_categories - 1) per feature.*


#### Examples

In [None]:
from sklearn.preprocessing import OrdinalEncoder
enc = OrdinalEncoder()
# Two categorical features per sample: a string and an integer.
X = [['Male', 1], ['Female', 3], ['Female', 2]]
enc.fit(X)

In [None]:
enc.categories_

In [None]:
enc.transform([['Female', 3], ['Male', 1]])

In [None]:
enc.inverse_transform([[1, 0], [0, 1]])

By default, OrdinalEncoder is lenient towards missing values by propagating them.

In [None]:
import numpy as np
# The last sample has a missing value (np.nan) in its second feature.
X = [['Male', 1], ['Female', 3], ['Female', np.nan]]
# Refit the encoder from the previous cells on the new data and encode it.
enc.fit_transform(X)

#### Methods

##### fit -- <sub><sup>*Fit ordinal encoder.*</sup></sub>

##### transform -- <sub><sup>*Transform X to ordinal codes.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit to data, then transform it.*</sup></sub>

#### Attributes

##### categories_ -- <sub><sup>*The categories of each feature determined during fit (in order of the features in X and corresponding with the output of transform).*</sup></sub>

### [LabelBinarizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelBinarizer.html)

*Binarize labels in a one-vs-all fashion.*

*Several regression and binary classification algorithms are available in scikit-learn. A simple way to extend these algorithms to the multi-class classification case is to use the so-called one-vs-all scheme.*

#### Examples

In [None]:
from sklearn import preprocessing
# Fit on multi-class integer labels (the value 2 appears twice).
lb = preprocessing.LabelBinarizer()
lb.fit([1, 2, 6, 4, 2])

In [None]:
lb.classes_

In [None]:
lb.transform([1, 6])

Binary targets transform to a column vector:

In [None]:
# A fresh binarizer fitted on a two-class string target.
lb = preprocessing.LabelBinarizer()
lb.fit_transform(['yes', 'no', 'no', 'yes'])

Passing a 2D matrix for multilabel classification:

In [None]:
import numpy as np
# Refit the same binarizer on a 2-D indicator matrix (2 samples x 3 labels).
lb.fit(np.array([[0, 1, 1], [1, 0, 0]]))

In [None]:
lb.classes_

In [None]:
lb.transform([0, 1, 2, 1])

#### Methods

##### fit -- <sub><sup>*Fit label binarizer.*</sup></sub>

##### transform -- <sub><sup>*Transform multi-class labels to binary labels.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit label binarizer/transform multi-class labels to binary labels.*</sup></sub>

#### Attributes

##### classes_ -- <sub><sup>*Holds the label for each class.*</sup></sub>

### [MultiLabelBinarizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MultiLabelBinarizer.html)

*Transform between iterable of iterables and a multilabel format.*

*Although a list of sets or tuples is a very intuitive format for multilabel data, it is unwieldy to process. This transformer converts between this intuitive format and the supported multilabel format: a (samples x classes) binary matrix indicating the presence of a class label.*

#### Examples

In [None]:
from sklearn.preprocessing import MultiLabelBinarizer
mlb = MultiLabelBinarizer()
# Each sample is an iterable of labels: the first has two, the second one.
mlb.fit_transform([(1, 2), (3,)])

In [None]:
mlb.classes_

In [None]:
mlb.fit_transform([{'sci-fi', 'thriller'}, {'comedy'}])

In [None]:
list(mlb.classes_)

A common mistake is to pass in a list, which leads to the following issue:

In [None]:
mlb = MultiLabelBinarizer()
# Deliberately incorrect: each string is itself an iterable, so every
# character ends up being treated as a separate label.
mlb.fit(['sci-fi', 'thriller', 'comedy'])  # this is wrong.

In [None]:
mlb.classes_

To correct this, the list of labels should be passed in as:

In [None]:
mlb = MultiLabelBinarizer()
# Correct form: one sample whose label set is the inner list.
mlb.fit([['sci-fi', 'thriller', 'comedy']])

In [None]:
mlb.classes_

#### Methods

##### fit -- <sub><sup>*Fit the label sets binarizer, storing classes_.*</sup></sub>

##### transform -- <sub><sup>*Transform the given label sets.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit the label sets binarizer and transform the given label sets.*</sup></sub>

#### Attributes

##### classes_ -- <sub><sup>*A copy of the classes parameter when provided. Otherwise it corresponds to the sorted set of classes found when fitting.*</sup></sub>

### [add_dummy_feature](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.add_dummy_feature.html)

*Augment dataset with an additional dummy feature.*

*This is useful for fitting an intercept term with implementations which cannot otherwise fit it directly.*

In [None]:
from sklearn.preprocessing import add_dummy_feature
# Augment a 2x2 input with one additional dummy feature column.
add_dummy_feature([[0, 1], [1, 0]])

### [Normalizer](https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.Normalizer.html)

*Normalize samples individually to unit norm.*

*Each sample (i.e. each row of the data matrix) with at least one non zero component is rescaled independently of other samples so that its norm (l1, l2 or inf) equals one.*

#### Examples

In [None]:
from sklearn.preprocessing import Normalizer
# Three samples (rows) with four features each.
X = [[4, 1, 2, 2],
     [1, 3, 9, 3],
     [5, 7, 5, 1]]
# Each row is rescaled independently of the others, so there is nothing to
# learn from X; fit is called only to follow the usual estimator API.
transformer = Normalizer().fit(X)  # fit does nothing.

In [None]:
transformer.transform(X)  # Normalize samples individually to unit norm.

## sklearn.compose

### [ColumnTransformer](https://scikit-learn.org/stable/modules/generated/sklearn.compose.ColumnTransformer.html)

*Applies transformers to columns of an array or pandas DataFrame.*

*This estimator allows different columns or column subsets of the input to be transformed separately and the features generated by each transformer will be concatenated to form a single feature space. This is useful for heterogeneous or columnar data, to combine several feature extraction mechanisms or transformations into a single transformer.*


#### Examples

In [None]:
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer

X = np.array([[0., 1., 2., 2.],
              [1., 1., 0., 1.]])

# Normalizer rescales every row to unit norm. Two separate L1 normalizers
# are used here: one for the first two columns and one for the last two,
# so each half of a row is scaled independently of the other half.
ct = ColumnTransformer(
    transformers=[
        ("norm1", Normalizer(norm='l1'), [0, 1]),
        ("norm2", Normalizer(norm='l1'), slice(2, 4)),
    ]
)
ct.fit_transform(X)

#### Methods

##### fit -- <sub><sup>*Fit all transformers using X.*</sup></sub>

##### transform -- <sub><sup>*Transform X separately by each transformer, concatenate results.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit all transformers, transform the data and concatenate results.*</sup></sub>

### [TransformedTargetRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.compose.TransformedTargetRegressor.html)

*Meta-estimator to regress on a transformed target.*

*Useful for applying a non-linear transformation to the target y in regression problems. This transformation can be given as a Transformer such as the QuantileTransformer or as a function and its inverse such as np.log and np.exp.*

#### Examples

In [None]:
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.linear_model import LinearRegression

# The target is exp(2 * x), so log(y) is exactly linear in the feature.
X = np.arange(4).reshape(-1, 1)
y = np.exp(2 * X).ravel()

# The linear model is fitted on log(y); exp is supplied as the inverse
# function for mapping predictions back.
tt = TransformedTargetRegressor(
    regressor=LinearRegression(),
    func=np.log,
    inverse_func=np.exp,
)
tt.fit(X, y)

In [None]:
tt.score(X, y)

In [None]:
tt.regressor_.coef_

#### Methods

##### fit -- <sub><sup>*Fit the model according to the given training data.*</sup></sub>


##### predict -- <sub><sup>*Predict using the base regressor, applying inverse.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


## sklearn.linear_model

### [LinearRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LinearRegression.html)

*Ordinary least squares Linear Regression.*

*LinearRegression fits a linear model with coefficients w = (w1, …, wp) to minimize the residual sum of squares between the observed targets in the dataset, and the targets predicted by the linear approximation.*

#### Examples

In [None]:
import numpy as np
from sklearn.linear_model import LinearRegression

X = np.array([[1, 1], [1, 2], [2, 2], [2, 3]])

# Noise-free targets generated from the known model
#     y = 1 * x_0 + 2 * x_1 + 3
y = X @ np.array([1, 2]) + 3

reg = LinearRegression().fit(X, y)

In [None]:
reg.score(X, y)

In [None]:
reg.coef_

In [None]:
reg.intercept_

In [None]:
reg.predict(np.array([[3, 5]]))

#### Methods

##### fit -- <sub><sup>*Fit linear model.*</sup></sub>


##### predict -- <sub><sup>*Predict using the linear model.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### coef_ -- <sub><sup>*Estimated coefficients for the linear regression problem.*</sup></sub>


##### intercept_ -- <sub><sup>*Independent term in the linear model. Set to 0.0 if fit_intercept = False.*</sup></sub>

### [SGDRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDRegressor.html)

*Linear model fitted by minimizing a regularized empirical loss with SGD.*

*SGD stands for Stochastic Gradient Descent: the gradient of the loss is estimated one sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate).*

#### Examples

In [None]:
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
n_samples, n_features = 10, 5
# Seeded generator: targets are drawn first, then the feature matrix.
rng = np.random.RandomState(0)
y = rng.randn(n_samples)
X = rng.randn(n_samples, n_features)
# Always scale the input. The most convenient way is to use a pipeline.
# random_state seeds the estimator's own randomness, so the score and
# prediction cells below give the same numbers on every run.
reg = make_pipeline(StandardScaler(),
                    SGDRegressor(max_iter=1000, tol=1e-3, random_state=0))
reg.fit(X, y)

In [None]:
reg.score(X, y)

In [None]:
reg.predict(np.array([[3, 5, 4, 2, 1]]))

#### Methods

##### fit -- <sub><sup>*Fit linear model with Stochastic Gradient Descent.*</sup></sub>


##### partial_fit -- <sub><sup>*Perform one epoch of stochastic gradient descent on given samples.*</sup></sub>

##### predict -- <sub><sup>*Predict using the linear model.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### n_iter_ -- <sub><sup>*The actual number of iterations before reaching the stopping criterion.*</sup></sub>

##### t_ -- <sub><sup>*Number of weight updates performed during training. Same as (n_iter_ * n_samples)*</sup></sub>

##### coef_ -- <sub><sup>*Weights assigned to the features.*</sup></sub>

##### intercept_ -- <sub><sup>*The intercept term.*</sup></sub>

### [Lasso](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Lasso.html)

*Linear Model trained with L1 prior as regularizer (aka the Lasso).*

#### Examples

In [None]:
from sklearn import linear_model

# L1-regularised fit on three points whose two features are identical.
clf = linear_model.Lasso(alpha=0.1).fit(
    [[0, 0], [1, 1], [2, 2]],
    [0, 1, 2],
)

# Coefficients learned for the two features.
clf.coef_

In [None]:
clf.intercept_

#### Methods

##### fit -- <sub><sup>*Fit model with coordinate descent.*</sup></sub>


##### predict -- <sub><sup>*Predict using the linear model.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### coef_ -- <sub><sup>*Parameter vector (w in the cost function formula).*</sup></sub>


##### intercept_ -- <sub><sup>*Independent term in decision function.*</sup></sub>

### [LassoCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LassoCV.html)

*Lasso linear model with iterative fitting along a regularization path.*

*The best model is selected by cross-validation.*

#### Examples

In [None]:
from sklearn.datasets import make_regression
from sklearn.linear_model import LassoCV

# Seeded synthetic regression problem with noise added to the targets.
X, y = make_regression(noise=4, random_state=0)

# The best model is selected by 5-fold cross-validation.
reg = LassoCV(cv=5, random_state=0)
reg.fit(X, y)

# Coefficient of determination on the training data.
reg.score(X, y)

In [None]:
reg.predict(X[:1,])

### [Ridge](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Ridge.html)

*Linear least squares with l2 regularization.*

#### Examples

In [None]:
import numpy as np
from sklearn.linear_model import Ridge

# Seeded random data. The targets are drawn before the feature matrix;
# keep that order, since it determines the generated values.
n_samples, n_features = 10, 5
rng = np.random.RandomState(0)
y = rng.randn(n_samples)
X = rng.randn(n_samples, n_features)

# alpha sets the strength of the l2 regularization.
clf = Ridge(alpha=1.0)
clf.fit(X, y)

In [None]:
clf.score(X, y)

In [None]:
clf.coef_

In [None]:
clf.intercept_

In [None]:
clf.predict(np.array([[3, 5, 2, 4, 1]]))

#### Methods

##### fit -- <sub><sup>*Fit Ridge regression model.*</sup></sub>


##### predict -- <sub><sup>*Predict using the linear model.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### coef_ -- <sub><sup>*Weight vector(s).*</sup></sub>


##### intercept_ -- <sub><sup>*Independent term in decision function. Set to 0.0 if fit_intercept = False.*</sup></sub>

### [RidgeCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeCV.html)

*Ridge regression with built-in cross-validation.*

*By default, it performs efficient Leave-One-Out Cross-Validation.*

#### Examples

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV

X, y = load_diabetes(return_X_y=True)

# Candidate regularization strengths; the built-in cross-validation picks
# one of them.
clf = RidgeCV(alphas=[1e-3, 1e-2, 1e-1, 1])
clf.fit(X, y)

# Coefficient of determination on the training data.
clf.score(X, y)

#### Methods

##### fit -- <sub><sup>*Fit Ridge regression model with cv.*</sup></sub>


##### predict -- <sub><sup>*Predict using the linear model.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>

#### Attributes

##### alpha_ -- <sub><sup>*Estimated regularization parameter, or, if alpha_per_target=True, the estimated regularization parameter for each target.*</sup></sub>

##### best_score_ -- <sub><sup>*Score of base estimator with best alpha, or, if alpha_per_target=True, a score for each target.*</sup></sub>

### [LogisticRegression](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html)

*Logistic Regression (aka logit, MaxEnt) classifier.*

*In the multiclass case, the training algorithm uses the one-vs-rest (OvR) scheme if the ‘multi_class’ option is set to ‘ovr’, and uses the cross-entropy loss if the ‘multi_class’ option is set to ‘multinomial’. (Currently the ‘multinomial’ option is supported only by the ‘lbfgs’, ‘sag’, ‘saga’ and ‘newton-cg’ solvers.)*

#### Examples

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression

X, y = load_iris(return_X_y=True)

clf = LogisticRegression(random_state=0)
clf.fit(X, y)

# Predicted class labels for the first two samples.
clf.predict(X[:2])

In [None]:
clf.predict_proba(X[:2, :])

In [None]:
clf.score(X, y)

#### Methods

##### fit -- <sub><sup>*Fit the model according to the given training data.*</sup></sub>


##### predict -- <sub><sup>*Predict class labels for samples in X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### classes_ -- <sub><sup>*A list of class labels known to the classifier.*</sup></sub>

##### coef_ -- <sub><sup>*Coefficient of the features in the decision function.*</sup></sub>

##### intercept_ -- <sub><sup>*Intercept (a.k.a. bias) added to the decision function.*</sup></sub>

### [LogisticRegressionCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html)

*Logistic Regression CV (aka logit, MaxEnt) classifier.*

*This class implements logistic regression using the liblinear, newton-cg, sag or lbfgs optimizers. The newton-cg, sag and lbfgs solvers support only L2 regularization with primal formulation. The liblinear solver supports both L1 and L2 regularization, with a dual formulation only for the L2 penalty. Elastic-Net penalty is only supported by the saga solver.*

#### Examples

In [None]:
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegressionCV

X, y = load_iris(return_X_y=True)

# Logistic regression with 5-fold cross-validation built in.
clf = LogisticRegressionCV(cv=5, random_state=0)
clf.fit(X, y)

# Predicted class labels for the first two samples.
clf.predict(X[:2])

In [None]:
clf.predict_proba(X[:2, :]).shape

In [None]:
clf.score(X, y)

### [SGDClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.SGDClassifier.html)

*Linear classifiers (SVM, logistic regression, etc.) with SGD training.*

*This estimator implements regularized linear models with stochastic gradient descent (SGD) learning: the gradient of the loss is estimated one sample at a time and the model is updated along the way with a decreasing strength schedule (aka learning rate). SGD allows minibatch (online/out-of-core) learning via the partial_fit method. For best results using the default learning rate schedule, the data should have zero mean and unit variance.*

#### Examples

In [None]:
import numpy as np
from sklearn.linear_model import SGDClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
# Four 2-D points: two labelled 1 and two labelled 2.
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
Y = np.array([1, 1, 2, 2])
# Always scale the input. The most convenient way is to use a pipeline.
# random_state seeds the estimator's own randomness, so the fitted model
# (and the prediction in the next cell) is the same on every run.
clf = make_pipeline(StandardScaler(),
                    SGDClassifier(max_iter=1000, tol=1e-3, random_state=0))
clf.fit(X, Y)

In [None]:
print(clf.predict([[-0.8, -1]]))

#### Methods

##### fit -- <sub><sup>*Fit linear model with Stochastic Gradient Descent.*</sup></sub>


##### partial_fit -- <sub><sup>*Perform one epoch of stochastic gradient descent on given samples.*</sup></sub>

##### predict -- <sub><sup>*Predict class labels for samples in X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### n_iter_ -- <sub><sup>*The actual number of iterations before reaching the stopping criterion. For multiclass fits, it is the maximum over every binary fit.*</sup></sub>

##### t_ -- <sub><sup>*Number of weight updates performed during training. Same as (n_iter_ * n_samples).*</sup></sub>

##### coef_ -- <sub><sup>*Weights assigned to the features.*</sup></sub>

##### intercept_ -- <sub><sup>*Constants in decision function.*</sup></sub>

### [RidgeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html)

*Classifier using Ridge regression.*

*This classifier first converts the target values into {-1, 1} and then treats the problem as a regression task (multi-output regression in the multiclass case).*


In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import RidgeClassifier

X, y = load_breast_cancer(return_X_y=True)

clf = RidgeClassifier()
clf.fit(X, y)

# Score of the fitted classifier on its own training data.
clf.score(X, y)

### [RidgeClassifierCV](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifierCV.html)

*Ridge classifier with built-in cross-validation.*

*By default, it performs Leave-One-Out Cross-Validation. Currently, only the n_features > n_samples case is handled efficiently.*

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import RidgeClassifierCV
X, y = load_breast_cancer(return_X_y=True)
# The alphas list holds the candidate regularization strengths that the
# built-in cross-validation chooses from.
clf = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X, y)
# Score of the fitted classifier on its own training data.
clf.score(X, y)

### [Perceptron](https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.Perceptron.html)

*Linear perceptron classifier.*

#### Examples

In [None]:
from sklearn.datasets import load_digits
from sklearn.linear_model import Perceptron

X, y = load_digits(return_X_y=True)

# Seeded so that repeated runs give the same fitted model.
clf = Perceptron(tol=1e-3, random_state=0).fit(X, y)

# Mean accuracy on the training data.
clf.score(X, y)

#### Methods

##### fit -- <sub><sup>*Fit linear model with Stochastic Gradient Descent.*</sup></sub>


##### partial_fit -- <sub><sup>*Perform one epoch of stochastic gradient descent on given samples.*</sup></sub>

##### predict -- <sub><sup>*Predict class labels for samples in X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### classes_ -- <sub><sup>*The unique classes labels.*</sup></sub>

##### coef_ -- <sub><sup>*Weights assigned to the features.*</sup></sub>

##### intercept_ -- <sub><sup>*Constants in decision function.*</sup></sub>

## sklearn.svm

### [SVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.SVC.html)

*C-Support Vector Classification.*

*The implementation is based on libsvm. The fit time scales at least quadratically with the number of samples and may be impractical beyond tens of thousands of samples.*

#### Examples

In [None]:
import numpy as np
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Four 2-D points: two labelled 1 and two labelled 2.
X = np.array([[-1, -1], [-2, -1], [1, 1], [2, 1]])
y = np.array([1, 1, 2, 2])

# Standardize the features, then fit the support vector classifier.
clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
clf.fit(X, y)

In [None]:
print(clf.predict([[-0.8, -1]]))

#### Methods

##### fit -- <sub><sup>*Fit the SVM model according to the given training data.*</sup></sub>


##### predict -- <sub><sup>*Perform classification on samples in X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### classes_ -- <sub><sup>*The class labels.*</sup></sub>

##### coef_ -- <sub><sup>*Weights assigned to the features when kernel="linear".*</sup></sub>

##### intercept_ -- <sub><sup>*Constants in decision function.*</sup></sub>

##### support_ -- <sub><sup>*Indices of support vectors.*</sup></sub>

##### support_vectors_ -- <sub><sup>*Support vectors.*</sup></sub>

##### n_support_ -- <sub><sup>*Number of support vectors for each class.*</sup></sub>

### [LinearSVC](https://scikit-learn.org/stable/modules/generated/sklearn.svm.LinearSVC.html)

*Linear Support Vector Classification.*

*Similar to SVC with parameter kernel=’linear’, but implemented in terms of liblinear rather than libsvm, so it has more flexibility in the choice of penalties and loss functions and should scale better to large numbers of samples.*

In [None]:
from sklearn.svm import LinearSVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.datasets import make_classification
# Seeded synthetic classification problem with four features.
X, y = make_classification(n_features=4, random_state=0)
# make_pipeline names the steps automatically; the following cells reach the
# fitted classifier through clf.named_steps['linearsvc'].
clf = make_pipeline(StandardScaler(),
                    LinearSVC(random_state=0, tol=1e-5))
clf.fit(X, y)

In [None]:
print(clf.named_steps['linearsvc'].coef_)

In [None]:
print(clf.named_steps['linearsvc'].intercept_)

In [None]:
print(clf.predict([[0, 0, 0, 0]]))

## sklearn.naive_bayes

### [MultinomialNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.MultinomialNB.html)

*Naive Bayes classifier for multinomial models.*

*The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.*

In [None]:
import numpy as np
from sklearn.naive_bayes import MultinomialNB

# Seeded random count data: 6 samples x 100 features with values in 0..4,
# and a distinct class label for every sample.
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])

clf = MultinomialNB()
clf.fit(X, y)

In [None]:
print(clf.predict(X[2:3]))

### [GaussianNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)

*Gaussian Naive Bayes (GaussianNB).*

*Can perform online updates to model parameters via partial_fit.*


In [None]:
import numpy as np
from sklearn.naive_bayes import GaussianNB

# Six 2-D points: three labelled 1 and three labelled 2.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])
Y = np.array([1, 1, 1, 2, 2, 2])

clf = GaussianNB()
clf.fit(X, Y)

In [None]:
print(clf.predict([[-0.8, -1]]))

In [None]:
# Same data as above, but fitted through the online-update API. The third
# argument passes the class labels, taken here from the unique values of Y.
clf_pf = GaussianNB()
clf_pf.partial_fit(X, Y, np.unique(Y))

In [None]:
print(clf_pf.predict([[-0.8, -1]]))

### [BernoulliNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.BernoulliNB.html)

*Naive Bayes classifier for multivariate Bernoulli models.*

*Like MultinomialNB, this classifier is suitable for discrete data. The difference is that while MultinomialNB works with occurrence counts, BernoulliNB is designed for binary/boolean features.*


In [None]:
import numpy as np
from sklearn.naive_bayes import BernoulliNB

# Seeded random integer data: 6 samples x 100 features with values in 0..4.
# Class 4 is assigned to two of the samples.
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
Y = np.array([1, 2, 3, 4, 4, 5])

clf = BernoulliNB()
clf.fit(X, Y)

# Predict the class of the third training sample.
print(clf.predict(X[2:3]))

### [ComplementNB](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.ComplementNB.html)

*The Complement Naive Bayes classifier described in Rennie et al. (2003).*

*The Complement Naive Bayes classifier was designed to correct the “severe assumptions” made by the standard Multinomial Naive Bayes classifier. It is particularly suited for imbalanced data sets.*

In [None]:
import numpy as np
from sklearn.naive_bayes import ComplementNB

# Seeded random count data: 6 samples x 100 features with values in 0..4,
# and a distinct class label for every sample.
rng = np.random.RandomState(1)
X = rng.randint(5, size=(6, 100))
y = np.array([1, 2, 3, 4, 5, 6])

clf = ComplementNB()
clf.fit(X, y)

# Predict the class of the third training sample.
print(clf.predict(X[2:3]))

## sklearn.tree

### [DecisionTreeClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html)

*A decision tree classifier.*

#### Examples

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier

iris = load_iris()

# Seeded tree, evaluated with 10-fold cross-validation (one score per fold).
clf = DecisionTreeClassifier(random_state=0)
cross_val_score(clf, iris.data, iris.target, cv=10)

#### Methods

##### fit -- <sub><sup>*Build a decision tree classifier from the training set (X, y).*</sup></sub>


##### predict -- <sub><sup>*Predict class or regression value for X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### feature_importances_ -- <sub><sup>*Return the feature importances.*</sup></sub>


### [DecisionTreeRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeRegressor.html)

*A decision tree regressor.*

#### Examples

In [None]:
from sklearn.datasets import load_diabetes
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeRegressor
X, y = load_diabetes(return_X_y=True)
# Seeded tree, evaluated with 10-fold cross-validation (one score per fold).
regressor = DecisionTreeRegressor(random_state=0)
cross_val_score(regressor, X, y, cv=10)

#### Methods

##### fit -- <sub><sup>*Build a decision tree regressor from the training set (X, y).*</sup></sub>


##### predict -- <sub><sup>*Predict class or regression value for X.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### feature_importances_ -- <sub><sup>*Return the feature importances.*</sup></sub>


## sklearn.neighbors

### [KNeighborsClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)

*Classifier implementing the k-nearest neighbors vote.*

#### Examples

In [None]:
from sklearn.neighbors import KNeighborsClassifier

# Four 1-D training points: the two smallest are class 0, the others class 1.
X = [[0], [1], [2], [3]]
y = [0, 0, 1, 1]

# Classify by majority vote among the 3 nearest training points.
neigh = KNeighborsClassifier(n_neighbors=3)
neigh.fit(X, y)

neigh.predict([[1.1]])

In [None]:
neigh.predict_proba([[0.9]])

#### Methods

##### fit -- <sub><sup>*Fit the k-nearest neighbors classifier from the training dataset.*</sup></sub>


##### predict -- <sub><sup>*Predict the class labels for the provided data.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### classes_ -- <sub><sup>*Class labels known to the classifier*</sup></sub>


## sklearn.cluster

### [KMeans](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.KMeans.html)

*K-Means clustering.*

#### Examples

In [None]:
from sklearn.cluster import KMeans
import numpy as np
# Two well-separated groups of three points (x = 1 and x = 10).
X = np.array([[1, 2], [1, 4], [1, 0],
              [10, 2], [10, 4], [10, 0]])
kmeans = KMeans(n_clusters=2, random_state=0).fit(X)

# A notebook cell only displays its last expression, so the intermediate
# results are printed explicitly rather than left as bare expressions.
print(kmeans.labels_)                      # cluster index of each training point
print(kmeans.predict([[0, 0], [12, 3]]))   # closest cluster for two new points

kmeans.cluster_centers_

#### Methods

##### fit -- <sub><sup>*Compute k-means clustering.*</sup></sub>


##### predict -- <sub><sup>*Predict the closest cluster each sample in X belongs to.*</sup></sub>


##### score -- <sub><sup>*Opposite of the value of X on the K-means objective.*</sup></sub>


#### Attributes

##### inertia_ -- <sub><sup>*Sum of squared distances of samples to their closest cluster center, weighted by the sample weights if provided.*</sup></sub>


##### labels_ -- <sub><sup>*Labels of each point*</sup></sub>


##### cluster_centers_ -- <sub><sup>*Coordinates of cluster centers.*</sup></sub>


### [AgglomerativeClustering](https://scikit-learn.org/stable/modules/generated/sklearn.cluster.AgglomerativeClustering.html)

*Recursively merges pair of clusters of sample data; uses linkage distance.*

#### Examples

In [None]:
from sklearn.cluster import AgglomerativeClustering
import numpy as np

# Two groups of three points each: one at x = 1, the other at x = 4.
X = np.array([[1, 2], [1, 4], [1, 0],
              [4, 2], [4, 4], [4, 0]])
clustering = AgglomerativeClustering().fit(X)

# Only the last bare expression of a cell is rendered, so the fitted
# estimator is printed explicitly rather than left as a discarded expression.
print(clustering)

# Last expression: rendered as the cell output.
clustering.labels_

#### Methods

##### fit -- <sub><sup>*Fit the hierarchical clustering from features, or distance matrix.*</sup></sub>


##### fit_predict -- <sub><sup>*Fit and return the result of each sample's clustering assignment.*</sup></sub>


#### Attributes

##### n_clusters_ -- <sub><sup>*The number of clusters found by the algorithm.*</sup></sub>


##### labels_ -- <sub><sup>*Cluster labels for each point.*</sup></sub>


##### n_leaves_ -- <sub><sup>*Number of leaves in the hierarchical tree.*</sup></sub>


## sklearn.ensemble

### [RandomForestClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html)

*A random forest is a meta estimator that fits a number of decision tree classifiers on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.*

#### Examples

In [None]:
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# Synthetic 4-feature classification problem (seeded, unshuffled).
X, y = make_classification(
    n_samples=1000,
    n_features=4,
    n_informative=2,
    n_redundant=0,
    random_state=0,
    shuffle=False,
)

# Shallow, seeded forest; fit() hands back the fitted estimator.
clf = RandomForestClassifier(max_depth=2, random_state=0).fit(X, y)

# Predicted class for the all-zeros sample.
print(clf.predict([[0, 0, 0, 0]]))

#### Methods

##### fit -- <sub><sup>*Build a forest of trees from the training set (X, y).*</sup></sub>


##### predict -- <sub><sup>*Predict class for X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### feature_importances_ -- <sub><sup>*The impurity-based feature importances.*</sup></sub>

### [RandomForestRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestRegressor.html)

*A random forest is a meta estimator that fits a number of decision tree regressors on various sub-samples of the dataset and uses averaging to improve the predictive accuracy and control over-fitting. The sub-sample size is controlled with the max_samples parameter if bootstrap=True (default), otherwise the whole dataset is used to build each tree.*

#### Examples

In [None]:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor

# Synthetic 4-feature regression problem (seeded, unshuffled).
X, y = make_regression(
    n_features=4,
    n_informative=2,
    random_state=0,
    shuffle=False,
)

# Shallow, seeded forest; fit() hands back the fitted estimator.
regr = RandomForestRegressor(max_depth=2, random_state=0).fit(X, y)

# Predicted target for the all-zeros sample.
print(regr.predict([[0, 0, 0, 0]]))

#### Methods

##### fit -- <sub><sup>*Build a forest of trees from the training set (X, y).*</sup></sub>


##### predict -- <sub><sup>*Predict regression target for X.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### feature_importances_ -- <sub><sup>*The impurity-based feature importances.*</sup></sub>

### [AdaBoostClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)

*An AdaBoost classifier is a meta-estimator that begins by fitting a classifier on the original dataset and then fits additional copies of the classifier on the same dataset but where the weights of incorrectly classified instances are adjusted such that subsequent classifiers focus more on difficult cases.*

#### Examples

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification

# Synthetic 4-feature classification problem (seeded, unshuffled).
X, y = make_classification(n_samples=1000, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)
clf = AdaBoostClassifier(n_estimators=100, random_state=0)
clf.fit(X, y)

# Only the last bare expression of a cell is rendered, so the prediction
# is printed explicitly instead of being silently discarded.
print(clf.predict([[0, 0, 0, 0]]))

# Last expression: score on the training data, rendered as the cell output.
clf.score(X, y)

#### Methods

##### fit -- <sub><sup>*Build a boosted classifier from the training set (X, y).*</sup></sub>


##### predict -- <sub><sup>*Predict classes for X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### feature_importances_ -- <sub><sup>*The impurity-based feature importances.*</sup></sub>

### [AdaBoostRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostRegressor.html)

*An AdaBoost regressor is a meta-estimator that begins by fitting a regressor on the original dataset and then fits additional copies of the regressor on the same dataset but where the weights of instances are adjusted according to the error of the current prediction. As such, subsequent regressors focus more on difficult cases.*

#### Examples

In [None]:
from sklearn.ensemble import AdaBoostRegressor
from sklearn.datasets import make_regression

# Synthetic 4-feature regression problem (seeded, unshuffled).
X, y = make_regression(n_features=4, n_informative=2,
                       random_state=0, shuffle=False)
regr = AdaBoostRegressor(random_state=0, n_estimators=100)
regr.fit(X, y)

# Only the last bare expression of a cell is rendered, so the prediction
# is printed explicitly instead of being silently discarded.
print(regr.predict([[0, 0, 0, 0]]))

# Last expression: score on the training data, rendered as the cell output.
regr.score(X, y)

#### Methods

##### fit -- <sub><sup>*Build a boosted regressor from the training set (X, y).*</sup></sub>


##### predict -- <sub><sup>*Predict regression value for X.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### feature_importances_ -- <sub><sup>*The impurity-based feature importances.*</sup></sub>

### [BaggingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)

*A Bagging classifier is an ensemble meta-estimator that fits base classifiers each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.*

#### Examples

In [None]:
from sklearn.svm import SVC
from sklearn.ensemble import BaggingClassifier
from sklearn.datasets import make_classification

# Synthetic 4-feature classification problem (seeded, unshuffled).
X, y = make_classification(n_samples=100, n_features=4,
                           n_informative=2, n_redundant=0,
                           random_state=0, shuffle=False)

# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` (old name removed in scikit-learn 1.4),
# while the first positional slot means the same thing in every version.
clf = BaggingClassifier(SVC(),
                        n_estimators=10, random_state=0).fit(X, y)

# Last expression: predicted class for the all-zeros sample.
clf.predict([[0, 0, 0, 0]])

#### Methods

##### fit -- <sub><sup>*Build a Bagging ensemble of estimators from the training set (X, y).*</sup></sub>


##### predict -- <sub><sup>*Predict class for X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### estimators_ -- <sub><sup>*The collection of fitted base estimators.*</sup></sub>

### [BaggingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingRegressor.html)

*Bagging regressor is an ensemble meta-estimator that fits base regressors each on random subsets of the original dataset and then aggregate their individual predictions (either by voting or by averaging) to form a final prediction. Such a meta-estimator can typically be used as a way to reduce the variance of a black-box estimator (e.g., a decision tree), by introducing randomization into its construction procedure and then making an ensemble out of it.*

#### Examples

In [None]:
from sklearn.svm import SVR
from sklearn.ensemble import BaggingRegressor
from sklearn.datasets import make_regression

# Synthetic 4-feature, single-target regression problem (seeded, unshuffled).
X, y = make_regression(n_samples=100, n_features=4,
                       n_informative=2, n_targets=1,
                       random_state=0, shuffle=False)

# The wrapped estimator is passed positionally: the keyword was renamed from
# `base_estimator` to `estimator` (old name removed in scikit-learn 1.4),
# while the first positional slot means the same thing in every version.
regr = BaggingRegressor(SVR(),
                        n_estimators=10, random_state=0).fit(X, y)

# Last expression: predicted target for the all-zeros sample.
regr.predict([[0, 0, 0, 0]])

#### Methods

##### fit -- <sub><sup>*Build a Bagging ensemble of estimators from the training set (X, y).*</sup></sub>


##### predict -- <sub><sup>*Predict regression target for X.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### estimators_ -- <sub><sup>*The collection of fitted base estimators.*</sup></sub>

### [GradientBoostingClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html)

*GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage n_classes_ regression trees are fit on the negative gradient of the loss function, e.g. binary or multiclass log loss. Binary classification is a special case where only a single regression tree is induced.*

#### Examples

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_hastie_10_2

X, y = make_hastie_10_2(random_state=0)

# The first 2000 rows train the model; everything after them is held out.
X_train, y_train = X[:2000], y[:2000]
X_test, y_test = X[2000:], y[2000:]

clf = GradientBoostingClassifier(
    n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0
)
clf.fit(X_train, y_train)

# Last expression: score on the held-out rows.
clf.score(X_test, y_test)

#### Methods

##### fit -- <sub><sup>*Fit the gradient boosting model.*</sup></sub>


##### predict -- <sub><sup>*Predict class for X.*</sup></sub>


##### score -- <sub><sup>*Return the mean accuracy on the given test data and labels.*</sup></sub>


#### Attributes

##### feature_importances_ -- <sub><sup>*The impurity-based feature importances.*</sup></sub>

### [GradientBoostingRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingRegressor.html)

*GB builds an additive model in a forward stage-wise fashion; it allows for the optimization of arbitrary differentiable loss functions. In each stage a regression tree is fit on the negative gradient of the given loss function.*

#### Examples

In [None]:
from sklearn.datasets import make_regression
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split

X, y = make_regression(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=0)
reg = GradientBoostingRegressor(random_state=0)
reg.fit(X_train, y_train)

# Only the last bare expression of a cell is rendered, so the prediction for
# the second test row is printed explicitly instead of being discarded.
print(reg.predict(X_test[1:2]))

# Last expression: score on the test split, rendered as the cell output.
reg.score(X_test, y_test)

#### Methods

##### fit -- <sub><sup>*Fit the gradient boosting model.*</sup></sub>


##### predict -- <sub><sup>*Predict regression target for X.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


#### Attributes

##### feature_importances_ -- <sub><sup>*The impurity-based feature importances.*</sup></sub>

## sklearn.dummy

### [DummyRegressor](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyRegressor.html)

*Regressor that makes predictions using simple rules.*

*This regressor is useful as a simple baseline to compare with other (real) regressors. Do not use it for real problems.*

#### Examples

In [None]:
import numpy as np
from sklearn.dummy import DummyRegressor
# Four toy inputs and their targets.
X = np.array([1.0, 2.0, 3.0, 4.0])
y = np.array([2.0, 3.0, 5.0, 10.0])
# Baseline regressor configured with the "mean" strategy.
dummy_regr = DummyRegressor(strategy="mean")
# Last expression: the cell displays whatever fit() returns.
dummy_regr.fit(X, y)

In [None]:
# Predictions of the baseline fitted in the previous cell, for the same X.
dummy_regr.predict(X)

In [None]:
# Score of the baseline on the very data it was fitted on.
dummy_regr.score(X, y)

#### Methods

##### fit -- <sub><sup>*Fit the baseline regressor.*</sup></sub>


##### predict -- <sub><sup>*Predict target values for the test vectors X.*</sup></sub>


##### score -- <sub><sup>*Return the coefficient of determination of the prediction.*</sup></sub>


### [DummyClassifier](https://scikit-learn.org/stable/modules/generated/sklearn.dummy.DummyClassifier.html)

*DummyClassifier makes predictions that ignore the input features.*

*This classifier serves as a simple baseline to compare against other more complex classifiers. The specific behavior of the baseline is selected with the strategy parameter.*

In [None]:
import numpy as np
from sklearn.dummy import DummyClassifier
# Four toy inputs; three of the four labels are 1.
X = np.array([-1, 1, 1, 1])
y = np.array([0, 1, 1, 1])
# Baseline classifier configured with the "most_frequent" strategy.
dummy_clf = DummyClassifier(strategy="most_frequent")
# Last expression: the cell displays whatever fit() returns.
dummy_clf.fit(X, y)

In [None]:
# Predictions of the baseline fitted in the previous cell, for the same X.
dummy_clf.predict(X)

In [None]:
# Score of the baseline on the very data it was fitted on.
dummy_clf.score(X, y)

## sklearn.datasets

### [load_iris](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.load_iris.html)

*Load and return the iris dataset (classification).*

*The iris dataset is a classic and very easy multi-class classification dataset.*

In [None]:
from sklearn.datasets import load_iris
data = load_iris()
# Targets of samples 10, 25 and 50, selected with an index list.
data.target[[10, 25, 50]]

In [None]:
# Target names of the dataset loaded in the previous cell, as a plain list.
list(data.target_names)

### [fetch_california_housing](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_california_housing.html)


### [fetch_openml](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.fetch_openml.html)


### [make_regression](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_regression.html)


### [make_blobs](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_blobs.html)


In [None]:
from sklearn.datasets import make_blobs
# 10 two-feature samples around 3 centers, seeded for repeatability.
X, y = make_blobs(n_samples=10, centers=3, n_features=2,
                  random_state=0)
print(X.shape)

# Last expression: the second array returned by make_blobs.
y

In [None]:
# Same call shape as the previous cell, but n_samples is a list and centers is None
# (per the make_blobs docs, a list gives the number of samples for each cluster).
# Relies on make_blobs imported in the previous cell.
X, y = make_blobs(n_samples=[3, 3, 4], centers=None, n_features=2,
                  random_state=0)
print(X.shape)

# Last expression: the second array returned by make_blobs.
y

### [make_classification](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_classification.html)


### [make_multilabel_classification](https://scikit-learn.org/stable/modules/generated/sklearn.datasets.make_multilabel_classification.html)

## keras.datasets

### [mnist](https://keras.io/api/datasets/mnist/)

*Loads the MNIST dataset.*

*This is a dataset of 60,000 28x28 grayscale images of the 10 digits, along with a test set of 10,000 images.*

In [None]:
# `mnist` comes from `from keras.datasets import mnist` in the notebook's first cell.
(X_train, y_train), (X_test, y_test) = mnist.load_data()
# Sanity-check the array shapes: 60,000 training and 10,000 test images of 28x28,
# with one label per image.
assert X_train.shape == (60000, 28, 28)
assert X_test.shape == (10000, 28, 28)
assert y_train.shape == (60000,)
assert y_test.shape == (10000,)

## sklearn.model_selection

### [train_test_split](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.train_test_split.html)

*Split arrays or matrices into random train and test subsets.*

In [None]:
import numpy as np
from sklearn.model_selection import train_test_split
# X: the integers 0..9 arranged as 5 rows x 2 columns; y: range(5), one value per row.
X, y = np.arange(10).reshape((5, 2)), range(5)
# Last expression: display X.
X

In [None]:
# y is a range object; materialise it as a list so its values are displayed.
list(y)

In [None]:
# Seeded split of X and y with test_size=0.33.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.33, random_state=42)

# Last expression: the training rows of X.
X_train

In [None]:
# Training targets produced by the split in the previous cell.
y_train

In [None]:
# Test rows of X produced by the same split.
X_test

In [None]:
# Test targets produced by the same split.
y_test

In [None]:
# Split y on its own, with shuffling disabled.
train_test_split(y, shuffle=False)

### [cross_validate](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_validate.html)

*Evaluate metric(s) by cross-validation and also record fit/score times.*

In [None]:
from sklearn import datasets, linear_model
from sklearn.model_selection import cross_validate
# NOTE(review): make_scorer, confusion_matrix and LinearSVC are imported here
# but not used by the cross_validate cells that follow.
from sklearn.metrics import make_scorer
from sklearn.metrics import confusion_matrix
from sklearn.svm import LinearSVC
diabetes = datasets.load_diabetes()
# Keep only the first 150 samples of the diabetes data.
X = diabetes.data[:150]
y = diabetes.target[:150]
# Lasso with default settings; evaluated by the next two cells.
lasso = linear_model.Lasso()

In [None]:
# Single metric evaluation using cross_validate
# (uses lasso, X and y defined in the previous cell).
cv_results = cross_validate(lasso, X, y, cv=3)

# Only the last bare expression of a cell is rendered, so the result keys
# are printed explicitly instead of being silently discarded.
print(sorted(cv_results.keys()))

# Last expression: the per-fold test scores.
cv_results['test_score']

In [None]:
# Multiple metric evaluation using cross_validate
# (uses lasso, X and y defined two cells above).
# Two scorers are requested by name; return_train_score=True is passed so that
# train_* entries such as 'train_r2' can be read below.
scores = cross_validate(lasso, X, y, cv=3,
                        scoring=('r2', 'neg_mean_squared_error'),
                        return_train_score=True)
# Result keys are "<split>_<scorer name>".
print(scores['test_neg_mean_squared_error'])

print(scores['train_r2'])

### [cross_val_score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_score.html)

*Evaluate a score by cross-validation.*

In [None]:
from sklearn.model_selection import cross_val_score
from sklearn import datasets, linear_model

diabetes = datasets.load_diabetes()
# Keep only the first 150 samples of the diabetes data.
X, y = diabetes.data[:150], diabetes.target[:150]

lasso = linear_model.Lasso()
# Cross-validated scores of the Lasso model with cv=3.
print(cross_val_score(estimator=lasso, X=X, y=y, cv=3))

### [cross_val_predict](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.cross_val_predict.html)

*Generate cross-validated estimates for each input data point.*

*The data is split according to the cv parameter. Each sample belongs to exactly one test set, and its prediction is computed with an estimator fitted on the corresponding training set.*


In [None]:
from sklearn.model_selection import cross_val_predict
from sklearn import datasets, linear_model

diabetes = datasets.load_diabetes()
# Keep only the first 150 samples of the diabetes data.
X, y = diabetes.data[:150], diabetes.target[:150]

lasso = linear_model.Lasso()
# Cross-validated predictions with cv=3; the cell stores them without displaying.
y_pred = cross_val_predict(estimator=lasso, X=X, y=y, cv=3)

### [learning_curve](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html)

*Determines cross-validated training and test scores for different training set sizes.*

*A cross-validation generator splits the whole dataset k times in training and test data. Subsets of the training set with varying sizes will be used to train the estimator and a score for each training subset size and the test set will be computed. Afterwards, the scores will be averaged over all k runs for each training subset size.*


### [validation_curve](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.validation_curve.html)

*Determine training and test scores for varying parameter values.*

*Compute scores for an estimator with different values of a specified parameter. This is similar to grid search with one parameter. However, this will also compute training scores and is merely a utility for plotting the results.*


### [ShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.ShuffleSplit.html)

*Random permutation cross-validator*

*Yields indices to split data into training and test sets.*


In [None]:
import numpy as np
from sklearn.model_selection import ShuffleSplit
# Six two-feature samples and their labels; consumed by the two cells that follow.
X = np.array([[1, 2], [3, 4], [5, 6], [7, 8], [3, 4], [5, 6]])
y = np.array([1, 2, 1, 2, 1, 2])


In [None]:
# Five seeded random splits, each with test_size=.25
# (uses ShuffleSplit and X from the previous cell).
rs = ShuffleSplit(n_splits=5, test_size=.25, random_state=0)

# The cell ends with a loop, so no bare expression is ever rendered;
# print the split count explicitly instead of discarding it.
print(rs.get_n_splits(X))

print(rs)

# Each iteration yields the index arrays of one train/test split.
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

In [None]:
# Same splitter, but with train_size=0.5 set explicitly alongside test_size=.25.
rs = ShuffleSplit(n_splits=5, train_size=0.5, test_size=.25,
                  random_state=0)
# Each iteration yields the index arrays of one train/test split.
for train_index, test_index in rs.split(X):
    print("TRAIN:", train_index, "TEST:", test_index)

### [StratifiedShuffleSplit](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedShuffleSplit.html)

*Provides train/test indices to split data in train/test sets.*

*This cross-validation object is a merge of StratifiedKFold and ShuffleSplit, which returns stratified randomized folds. The folds are made by preserving the percentage of samples for each class.*


#### Examples

In [None]:
import numpy as np
from sklearn.model_selection import StratifiedShuffleSplit

# Six two-feature samples; labels are three 0s followed by three 1s.
X = np.array([[1, 2], [3, 4], [1, 2], [3, 4], [1, 2], [3, 4]])
y = np.array([0, 0, 0, 1, 1, 1])
sss = StratifiedShuffleSplit(n_splits=5, test_size=0.5, random_state=0)

# The cell ends with a loop, so no bare expression is ever rendered;
# print the split count explicitly instead of discarding it.
print(sss.get_n_splits(X, y))

print(sss)

# Each iteration yields the index arrays of one train/test split.
for train_index, test_index in sss.split(X, y):
    print("TRAIN:", train_index, "TEST:", test_index)
    # Materialise the split; after the loop these hold the last split only.
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

#### Methods

##### split -- <sub><sup>*Generate indices to split data into training and test set.*</sup></sub>

### [permutation_test_score](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.permutation_test_score.html)

*Evaluate the significance of a cross-validated score with permutations*

*Permutes targets to generate ‘randomized data’ and compute the empirical p-value against the null hypothesis that features and targets are independent.*

### [GridSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html)

*Exhaustive search over specified parameter values for an estimator.*

*The parameters of the estimator used to apply these methods are optimized by cross-validated grid-search over a parameter grid.*

#### Examples

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn import svm, datasets

iris = datasets.load_iris()

# Search grid: two kernels crossed with two values of C.
parameters = dict(kernel=('linear', 'rbf'), C=[1, 10])
svc = svm.SVC()

# fit() hands back the search object, so construction and fitting are chained.
clf = GridSearchCV(svc, parameters).fit(iris.data, iris.target)

# Iterating a dict yields its keys, so this lists the cv_results_ keys in sorted order.
sorted(clf.cv_results_)

#### Methods

##### fit -- <sub><sup>*Run fit with all sets of parameters.*</sup></sub>


##### predict -- <sub><sup>*Call predict on the estimator with the best found parameters.*</sup></sub>


##### transform -- <sub><sup>*Call transform on the estimator with the best found parameters.*</sup></sub>


#### Attributes

##### cv_results_ (only after fit) -- <sub><sup>*A dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame.*</sup></sub>

##### best_index_ -- <sub><sup>*The index (of the cv_results_ arrays) which corresponds to the best candidate parameter setting.*</sup></sub>


##### best_estimator_ -- <sub><sup>*Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data.*</sup></sub>


##### best_params_ -- <sub><sup>*Parameter setting that gave the best results on the hold out data.*</sup></sub>


##### best_score_ -- <sub><sup>*Mean cross-validated score of the best_estimator*</sup></sub>

### [RandomizedSearchCV](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.RandomizedSearchCV.html)

*The parameters of the estimator used to apply these methods are optimized by cross-validated search over parameter settings.*

*In contrast to GridSearchCV, not all parameter values are tried out, but rather a fixed number of parameter settings is sampled from the specified distributions. The number of parameter settings that are tried is given by n_iter.*

#### Examples

In [None]:
from scipy.stats import uniform
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import RandomizedSearchCV

iris = load_iris()

logistic = LogisticRegression(
    solver='saga', tol=1e-2, max_iter=200, random_state=0
)

# C is drawn from a scipy uniform distribution; penalty is picked from a list.
distributions = {
    'C': uniform(loc=0, scale=4),
    'penalty': ['l2', 'l1'],
}

clf = RandomizedSearchCV(logistic, distributions, random_state=0)
search = clf.fit(iris.data, iris.target)

# Last expression: the best parameter setting found by the search.
search.best_params_

#### Methods

##### fit -- <sub><sup>*Run fit with all sets of parameters.*</sup></sub>


##### predict -- <sub><sup>*Call predict on the estimator with the best found parameters.*</sup></sub>


##### transform -- <sub><sup>*Call transform on the estimator with the best found parameters.*</sup></sub>


#### Attributes

##### cv_results_ (only after fit) -- <sub><sup>*A dict with keys as column headers and values as columns, that can be imported into a pandas DataFrame.*</sup></sub>

##### best_index_ -- <sub><sup>*The index (of the cv_results_ arrays) which corresponds to the best candidate parameter setting.*</sup></sub>


##### best_estimator_ -- <sub><sup>*Estimator that was chosen by the search, i.e. estimator which gave highest score (or smallest loss if specified) on the left out data.*</sup></sub>


##### best_params_ -- <sub><sup>*Parameter setting that gave the best results on the hold out data.*</sup></sub>


##### best_score_ -- <sub><sup>*Mean cross-validated score of the best_estimator*</sup></sub>

## sklearn.metrics

### [mean_squared_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_squared_error.html)

*Mean squared error regression loss.*

In [None]:
from sklearn.metrics import mean_squared_error
# Four true values and the corresponding predictions.
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
# Last expression: the mean squared error of the predictions.
mean_squared_error(y_true, y_pred)

In [None]:
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
# Root mean squared error. The `squared=False` flag was deprecated in
# scikit-learn 1.4 and removed in 1.6, so the square root is taken explicitly;
# this gives the same value and works on every version.
# Uses `np` from the notebook's first cell and mean_squared_error imported above.
np.sqrt(mean_squared_error(y_true, y_pred))

In [None]:
# Multi-output case: three samples, two outputs each.
y_true = [[0.5, 1],[-1, 1],[7, -6]]
y_pred = [[0, 2],[-1, 2],[8, -5]]
# Last expression: a single MSE value for both outputs combined.
mean_squared_error(y_true, y_pred)

In [None]:
# Multi-output RMSE for the y_true / y_pred of the previous cell: take the
# square root of each output's MSE, then average across outputs. This is the
# value `squared=False` used to return; that flag was deprecated in
# scikit-learn 1.4 and removed in 1.6.
np.sqrt(mean_squared_error(y_true, y_pred, multioutput='raw_values')).mean()

In [None]:
# multioutput='raw_values': one error per output column instead of a single
# aggregate (see the mean_squared_error docs).
mean_squared_error(y_true, y_pred, multioutput='raw_values')

In [None]:
# multioutput=[0.3, 0.7]: one weight per output column.
mean_squared_error(y_true, y_pred, multioutput=[0.3, 0.7])

### [mean_absolute_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_error.html)

*Mean absolute error regression loss.*

In [None]:
from sklearn.metrics import mean_absolute_error
# Four true values and the corresponding predictions.
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
# Last expression: the mean absolute error of the predictions.
mean_absolute_error(y_true, y_pred)

In [None]:
# Multi-output case: three samples, two outputs each.
y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
# Last expression: a single MAE value for both outputs combined.
mean_absolute_error(y_true, y_pred)

In [None]:
# multioutput='raw_values': one error per output column instead of a single
# aggregate (see the mean_absolute_error docs).
mean_absolute_error(y_true, y_pred, multioutput='raw_values')

In [None]:
# multioutput=[0.3, 0.7]: one weight per output column.
mean_absolute_error(y_true, y_pred, multioutput=[0.3, 0.7])

### [mean_absolute_percentage_error](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.mean_absolute_percentage_error.html)

*Mean absolute percentage error (MAPE) regression loss.*

*Note here that the output is not a percentage in the range [0, 100] and a value of 100 does not mean 100% but 1e2. Furthermore, the output can be arbitrarily high when y_true is small (which is specific to the metric) or when abs(y_true - y_pred) is large (which is common for most regression metrics).*


In [None]:
from sklearn.metrics import mean_absolute_percentage_error
# Four true values and the corresponding predictions.
y_true = [3, -0.5, 2, 7]
y_pred = [2.5, 0.0, 2, 8]
# Last expression: the MAPE of the predictions (a ratio, not a 0-100 percentage).
mean_absolute_percentage_error(y_true, y_pred)

In [None]:
# Multi-output case: three samples, two outputs each.
y_true = [[0.5, 1], [-1, 1], [7, -6]]
y_pred = [[0, 2], [-1, 2], [8, -5]]
# Last expression: a single MAPE value for both outputs combined.
mean_absolute_percentage_error(y_true, y_pred)

In [None]:
# multioutput=[0.3, 0.7]: one weight per output column.
mean_absolute_percentage_error(y_true, y_pred, multioutput=[0.3, 0.7])

In [None]:
# When an element of y_true is zero (the second one here), the result becomes
# arbitrarily high because the error is divided by epsilon instead of by zero.
y_true = [1., 0., 2.4, 7.]
y_pred = [1.2, 0.1, 2.4, 8.]
mean_absolute_percentage_error(y_true, y_pred)

### [log_loss](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.log_loss.html)

*This is the loss function used in (multinomial) logistic regression and extensions of it such as neural networks, defined as the negative log-likelihood of a logistic model that returns y_pred probabilities for its training data y_true. The log loss is only defined for two or more labels.*


In [None]:
from sklearn.metrics import log_loss
# First argument: true string labels for four samples.
# Second argument: for each sample, predicted probabilities for the two classes.
log_loss(["spam", "ham", "ham", "spam"],
         [[.1, .9], [.9, .1], [.8, .2], [.35, .65]])

### [hinge_loss](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.hinge_loss.html)

*Average hinge loss (non-regularized).*

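*In the binary case, with labels encoded as +1 and -1, a prediction mistake makes margin = y_true * pred_decision negative, so 1 - margin is greater than 1. The cumulated hinge loss is therefore an upper bound of the number of mistakes made by the classifier.*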

In [None]:
from sklearn import svm
from sklearn.metrics import hinge_loss
# Two 1-D samples labelled -1 and +1.
X = [[0], [1]]
y = [-1, 1]
# Seeded linear SVM; its decision function is used by the next cells.
est = svm.LinearSVC(random_state=0)
# Last expression: the cell displays whatever fit() returns.
est.fit(X, y)

In [None]:
# Decision-function values of the fitted model for three new points.
pred_decision = est.decision_function([[-2], [3], [0.5]])
pred_decision

In [None]:
# Hinge loss of those decision values against the true labels [-1, 1, 1].
hinge_loss([-1, 1, 1], pred_decision)

In the multiclass case:

In [None]:
import numpy as np
# Four 1-D samples, each belonging to its own class.
X = np.array([[0], [1], [2], [3]])
Y = np.array([0, 1, 2, 3])
# Full set of class labels, passed to hinge_loss two cells below.
labels = np.array([0, 1, 2, 3])
# Seeded like the binary example above so the fitted model, and the decision
# values derived from it, are reproducible from run to run.
# `svm` is the module imported in the binary example.
est = svm.LinearSVC(random_state=0)
# Last expression: the cell displays whatever fit() returns.
est.fit(X, Y)

In [None]:
# Decision-function values of the multiclass model for three points.
pred_decision = est.decision_function([[-1], [2], [3]])
pred_decision

In [None]:
# True classes of the three points; class 1 does not occur among them,
# and the full label set is passed explicitly via labels=.
y_true = [0, 2, 3]
hinge_loss(y_true, pred_decision, labels=labels)

### [confusion_matrix](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.confusion_matrix.html)

*Compute confusion matrix to evaluate the accuracy of a classification.*

In [None]:
from sklearn.metrics import confusion_matrix
# Integer class labels (0, 1, 2) for six samples.
y_true = [2, 0, 2, 2, 0, 1]
y_pred = [0, 0, 2, 2, 0, 2]
# Last expression: the confusion matrix of predictions against truth.
confusion_matrix(y_true, y_pred)

In [None]:
# Same idea with string labels; labels= is passed to fix the class order
# as ant, bird, cat.
y_true = ["cat", "ant", "cat", "cat", "ant", "bird"]
y_pred = ["ant", "ant", "cat", "cat", "ant", "cat"]
confusion_matrix(y_true, y_pred, labels=["ant", "bird", "cat"])

In the binary case, we can extract true positives, etc as follows:

In [None]:
# ravel() flattens the 2x2 binary matrix so its four cells can be unpacked
# as true negatives, false positives, false negatives and true positives.
tn, fp, fn, tp = confusion_matrix([0, 1, 0, 1], [1, 1, 1, 0]).ravel()
(tn, fp, fn, tp)

### [ConfusionMatrixDisplay](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.ConfusionMatrixDisplay.html)

*Confusion Matrix visualization.*

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Seeded synthetic dataset and seeded train/test split.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    random_state=0)
clf = SVC(random_state=0)
clf.fit(X_train, y_train)

# Variant 1: compute the matrix by hand, then wrap it in a display object.
# clf.classes_ supplies both the matrix label order and the tick labels.
predictions = clf.predict(X_test)
cm = confusion_matrix(y_test, predictions, labels=clf.classes_)
disp = ConfusionMatrixDisplay(confusion_matrix=cm,
                              display_labels=clf.classes_)
disp.plot()

plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Same seeded data, split and classifier as the previous cell.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0)
clf = SVC(random_state=0)
clf.fit(X_train, y_train)

# Variant 2: build the display directly from the fitted estimator and the test data.
ConfusionMatrixDisplay.from_estimator(
    clf, X_test, y_test)

plt.show()

In [None]:
import matplotlib.pyplot as plt
from sklearn.datasets import make_classification
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
# Same seeded data, split and classifier as the two previous cells.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=0)
clf = SVC(random_state=0)
clf.fit(X_train, y_train)

# Variant 3: build the display from true labels and precomputed predictions.
y_pred = clf.predict(X_test)
ConfusionMatrixDisplay.from_predictions(
   y_test, y_pred)

plt.show()

### [precision_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_score.html)

*The precision is the ratio tp / (tp + fp) where tp is the number of true positives and fp the number of false positives. The precision is intuitively the ability of the classifier not to label as positive a sample that is negative.*

*The best value is 1 and the worst value is 0.*

In [None]:
from sklearn.metrics import precision_score
# Three-class labels for six samples; reused by the next cells.
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
# Precision with average='macro'.
precision_score(y_true, y_pred, average='macro')

In [None]:
# Same labels, average='micro'.
precision_score(y_true, y_pred, average='micro')

In [None]:
# Same labels, average='weighted'.
precision_score(y_true, y_pred, average='weighted')

In [None]:
# average=None: no averaging is requested.
precision_score(y_true, y_pred, average=None)

In [None]:
# Every sample is now predicted as class 0, so tp + fp is zero for classes 1
# and 2 and their precision ratio is undefined. y_true is unchanged from above.
y_pred = [0, 0, 0, 0, 0, 0]
precision_score(y_true, y_pred, average=None)

In [None]:
# Same inputs; zero_division=1 supplies the value to report where the ratio
# is undefined (see the precision_score docs).
precision_score(y_true, y_pred, average=None, zero_division=1)

In [None]:
# Multilabel classification: each row is one sample, each column one label.
y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]
y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]
precision_score(y_true, y_pred, average=None)

### [recall_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.recall_score.html)

*The recall is the ratio tp / (tp + fn) where tp is the number of true positives and fn the number of false negatives. The recall is intuitively the ability of the classifier to find all the positive samples.*

*The best value is 1 and the worst value is 0.*

In [None]:
from sklearn.metrics import recall_score
# Three-class labels for six samples; reused by the next cells.
y_true = [0, 1, 2, 0, 1, 2]
y_pred = [0, 2, 1, 0, 0, 1]
# Recall with average='macro'.
recall_score(y_true, y_pred, average='macro')

In [None]:
# Same labels, average='micro'.
recall_score(y_true, y_pred, average='micro')

In [None]:
# Same labels, average='weighted'.
recall_score(y_true, y_pred, average='weighted')

In [None]:
# average=None: no averaging is requested.
recall_score(y_true, y_pred, average=None)

In [None]:
# Every true label is now class 0, so tp + fn is zero for classes 1 and 2 and
# their recall ratio is undefined. y_pred is unchanged from the first recall cell.
y_true = [0, 0, 0, 0, 0, 0]
recall_score(y_true, y_pred, average=None)

In [None]:
# Same inputs; zero_division=1 supplies the value to report where the ratio
# is undefined (see the recall_score docs).
recall_score(y_true, y_pred, average=None, zero_division=1)

In [None]:
# Multilabel classification: each row is one sample, each column one label.
y_true = [[0, 0, 0], [1, 1, 1], [0, 1, 1]]
y_pred = [[0, 0, 0], [1, 1, 1], [1, 1, 0]]
recall_score(y_true, y_pred, average=None)

### [silhouette_score](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.silhouette_score.html)

*Compute the mean Silhouette Coefficient of all samples.*

*The Silhouette Coefficient is calculated using the mean intra-cluster distance (a) and the mean nearest-cluster distance (b) for each sample. The Silhouette Coefficient for a sample is (b - a) / max(a, b). To clarify, b is the distance between a sample and the nearest cluster that the sample is not a part of.*

*The best value is 1 and the worst value is -1.*

In [None]:
# Signature: sklearn.metrics.silhouette_score(X, labels)
from sklearn.metrics import silhouette_score

# Two tight, well-separated groups of points and the cluster each point belongs to.
# Distinct variable names are used so the X / y of other cells are left untouched.
X_clusters = [[0, 0], [0, 1], [10, 10], [10, 11]]
cluster_labels = [0, 0, 1, 1]

# Compact clusters that lie far apart give a mean coefficient close to the best value, 1.
silhouette_score(X_clusters, cluster_labels)

### [make_scorer](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.make_scorer.html)

*This factory function wraps scoring functions for use in GridSearchCV and cross_val_score. It takes a score function, such as accuracy_score, mean_squared_error, adjusted_rand_score or average_precision_score and returns a callable that scores an estimator’s output. The signature of the call is (estimator, X, y) where estimator is the model to be evaluated, X is the data and y is the ground truth labeling (or None in the case of unsupervised models).*

In [None]:
from sklearn.metrics import fbeta_score, make_scorer

# Extra keyword arguments (beta=2 here) are forwarded to fbeta_score on every call.
ftwo_scorer = make_scorer(score_func=fbeta_score, beta=2)
ftwo_scorer

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.svm import LinearSVC

# Plug the custom scorer built above into a grid search over C.
# The search object is only constructed here; it is not fitted.
grid = GridSearchCV(
    estimator=LinearSVC(),
    param_grid={'C': [1, 10]},
    scoring=ftwo_scorer,
)

### [classification_report](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.classification_report.html)

*Build a text report showing the main classification metrics.*

In [None]:
from sklearn.metrics import classification_report

y_true, y_pred = [0, 1, 2, 2, 2], [0, 0, 2, 2, 1]
# target_names supplies the display names used for the label rows of the report.
target_names = ['class 0', 'class 1', 'class 2']
report = classification_report(y_true, y_pred, target_names=target_names)
print(report)

In [None]:
y_true = [1, 1, 1]
y_pred = [1, 1, 0]
# labels picks the classes listed in the report; digits=4 prints values with 4 decimals.
print(classification_report(y_true, y_pred, digits=4, labels=[1, 2, 3]))

### [precision_recall_curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.precision_recall_curve.html)

*Compute precision-recall pairs for different probability thresholds.*

*Note: this implementation is restricted to the binary classification task.*

In [None]:
import numpy as np
from sklearn.metrics import precision_recall_curve

# Binary ground truth and the classifier score assigned to each sample.
y_true = np.array([0, 0, 1, 1])
y_scores = np.array([0.1, 0.4, 0.35, 0.8])

# The call returns three arrays; unpack them for the following cells.
pr_curve = precision_recall_curve(y_true, y_scores)
precision, recall, thresholds = pr_curve
precision

In [None]:
# Show the recall array unpacked from precision_recall_curve in the previous cell.
recall

In [None]:
# Show the thresholds array unpacked from precision_recall_curve above.
thresholds

### [roc_curve](https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html)

*Compute Receiver operating characteristic (ROC).*

In [None]:
import numpy as np
from sklearn import metrics

# Labels are 1 and 2 rather than 0/1, so pos_label says which one is the positive class.
y = np.array([1, 1, 2, 2])
scores = np.array([0.1, 0.4, 0.35, 0.8])

roc = metrics.roc_curve(y, scores, pos_label=2)
fpr, tpr, thresholds = roc
fpr

In [None]:
# Show the true positive rates unpacked from metrics.roc_curve in the previous cell.
tpr

In [None]:
# Show the thresholds unpacked from metrics.roc_curve above
# (this rebinds the name used by the precision_recall_curve example).
thresholds

## sklearn.decomposition

### [PCA](https://scikit-learn.org/stable/modules/generated/sklearn.decomposition.PCA.html)

*Principal component analysis (PCA).*

*Linear dimensionality reduction using Singular Value Decomposition of the data to project it to a lower dimensional space. The input data is centered but not scaled for each feature before applying the SVD.*

#### Examples

In [None]:
import numpy as np
from sklearn.decomposition import PCA

# Six 2-D points.
X = np.array([[-1, -1], [-2, -1], [-3, -2], [1, 1], [2, 1], [3, 2]])

# fit() returns the fitted estimator, so construction and fitting can be chained.
pca = PCA(n_components=2).fit(X)

print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
# Same decomposition, this time explicitly requesting the 'full' SVD solver.
pca = PCA(n_components=2, svd_solver='full').fit(X)

print(pca.explained_variance_ratio_)
print(pca.singular_values_)

In [None]:
# Keep a single component and use the 'arpack' solver.
pca = PCA(n_components=1, svd_solver='arpack').fit(X)

print(pca.explained_variance_ratio_)
print(pca.singular_values_)

#### Methods

##### fit -- <sub><sup>*Fit the model with X.*</sup></sub>


##### transform -- <sub><sup>*Apply dimensionality reduction to X.*</sup></sub>


##### fit_transform -- <sub><sup>*Fit the model with X and apply the dimensionality reduction on X.*</sup></sub>


#### Attributes

##### explained_variance_ -- <sub><sup>*The amount of variance explained by each of the selected components. The variance estimation uses n_samples - 1 degrees of freedom.  Equal to n_components largest eigenvalues of the covariance matrix of X.*</sup></sub>

## sklearn.pipeline

### [make_pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.make_pipeline.html)

*This is a shorthand for the Pipeline constructor; it does not require, and does not permit, naming the estimators. Instead, their names will be set to the lowercase of their types automatically.*

In [None]:
from sklearn.naive_bayes import GaussianNB
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

# No step names are passed: make_pipeline derives them from the estimator types.
make_pipeline(
    StandardScaler(),
    GaussianNB(priors=None),
)

### [Pipeline](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html)

*Sequentially apply a list of transforms and a final estimator. Intermediate steps of the pipeline must be ‘transforms’, that is, they must implement fit and transform methods. The final estimator only needs to implement fit. The transformers in the pipeline can be cached using memory argument.*

#### Examples

In [None]:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC

# Synthetic classification data, split into train and test parts.
X, y = make_classification(random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Named steps: scale the features, then classify with an SVM.
pipe = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('svc', SVC()),
])

# The pipeline can be used as any other estimator
# and avoids leaking the test set into the train set.
# fit() returns the pipeline itself, so scoring can be chained onto it.
pipe.fit(X_train, y_train).score(X_test, y_test)

#### Methods

##### fit -- <sub><sup>*Fit the model.*</sup></sub>

##### transform -- <sub><sup>*Transform the data, and apply transform with the final estimator.*</sup></sub>

##### fit_transform -- <sub><sup>*Fit the model and transform with the final estimator.*</sup></sub>

##### score -- <sub><sup>*Transform the data, and apply score with the final estimator.*</sup></sub>

### [FeatureUnion](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.FeatureUnion.html)

*This estimator applies a list of transformer objects in parallel to the input data, then concatenates the results. This is useful to combine several feature extraction mechanisms into a single transformer.*

In [None]:
from sklearn.decomposition import PCA, TruncatedSVD
from sklearn.pipeline import FeatureUnion

# Both transformers see the same input; their outputs are concatenated column-wise.
union = FeatureUnion(transformer_list=[
    ("pca", PCA(n_components=1)),
    ("svd", TruncatedSVD(n_components=2)),
])
X = [[0., 1., 3], [2., 2., 5]]
union.fit_transform(X)

## sklearn.set_config

### [set_config](https://scikit-learn.org/stable/modules/generated/sklearn.set_config.html)

*Set global scikit-learn configuration*

## sklearn.utils

### [all_estimators](https://scikit-learn.org/stable/modules/generated/sklearn.utils.all_estimators.html)

*Get a list of all estimators from sklearn.*

## sklearn.utils.multiclass

### [type_of_target](https://scikit-learn.org/stable/modules/generated/sklearn.utils.multiclass.type_of_target.html)

*Determine the type of data indicated by the target.*

In [None]:
import numpy as np
from sklearn.utils.multiclass import type_of_target

# 1-D floats that are not whole numbers -> 'continuous'
type_of_target(y=[0.1, 0.6])

In [None]:
# 1-D integers with exactly two distinct values -> 'binary'
type_of_target(y=[1, -1, -1, 1])

In [None]:
# 1-D strings with exactly two distinct values -> 'binary'
type_of_target(y=['a', 'b', 'a'])

In [None]:
# Whole-number floats with two distinct values are still treated as 'binary'
type_of_target(y=[1.0, 2.0])

In [None]:
# 1-D integers with more than two distinct values -> 'multiclass'
type_of_target(y=[1, 0, 2])

In [None]:
# Whole-number floats with more than two distinct values -> 'multiclass'
type_of_target(y=[1.0, 0.0, 3.0])

In [None]:
# 1-D strings with more than two distinct values -> 'multiclass'
type_of_target(y=['a', 'b', 'c'])

In [None]:
# 2-D integer array with more than two distinct values -> 'multiclass-multioutput'
type_of_target(y=np.array([[1, 2], [3, 1]]))

In [None]:
# A single 2-D row holding two distinct values; the scikit-learn docs list this
# input as 'multilabel-indicator'.
type_of_target(y=[[1, 2]])

In [None]:
# 2-D array containing non-integer floats -> 'continuous-multioutput'
type_of_target(y=np.array([[1.5, 2.0], [3.0, 1.6]]))

In [None]:
# 2-D array of 0/1 indicators -> 'multilabel-indicator'
type_of_target(y=np.array([[0, 1], [1, 1]]))

## scipy.stats

### [uniform](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.uniform.html)

*A uniform continuous random variable.*

*In the standard form, the distribution is uniform on [0, 1]. Using the parameters loc and scale, one obtains the uniform distribution on [loc, loc + scale].*

In [None]:
from scipy.stats import uniform
import matplotlib.pyplot as plt

fig, ax = plt.subplots(1, 1)

# Calculate the first four moments (mean, variance, skewness, kurtosis).
# They are printed because a value computed mid-cell is otherwise never shown.
mean, var, skew, kurt = uniform.stats(moments='mvsk')
print('mean=%s, var=%s, skew=%s, kurt=%s' % (mean, var, skew, kurt))

# Display the probability density function (pdf) between the 1% and 99% quantiles:
x = np.linspace(uniform.ppf(0.01),
                uniform.ppf(0.99), 100)
ax.plot(x, uniform.pdf(x),
        'r-', lw=5, alpha=0.6, label='uniform pdf')

# Alternatively, the distribution object can be called (as a function) to fix the
# shape, location and scale parameters. This returns a "frozen" RV object holding
# the given parameters fixed.
# Freeze the distribution and display the frozen pdf:
rv = uniform()
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of cdf and ppf. A bare np.allclose(...) in the middle of a cell is
# evaluated and thrown away, so assert on it to make the check actually do something.
vals = uniform.ppf([0.001, 0.5, 0.999])
assert np.allclose([0.001, 0.5, 0.999], uniform.cdf(vals))

# Generate random numbers (seeded so the histogram is identical on every run):
r = uniform.rvs(size=1000, random_state=42)

# And compare the histogram:
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()

### [loguniform](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.loguniform.html)

*A loguniform or reciprocal continuous random variable.*

In [None]:
from scipy.stats import loguniform
import matplotlib.pyplot as plt
fig, ax = plt.subplots(1, 1)

# Calculate the first four moments (mean, variance, skewness, kurtosis) for the
# parameters a and b. They are printed because a value computed mid-cell is
# otherwise never shown.
a, b = 0.01, 1.25
mean, var, skew, kurt = loguniform.stats(a, b, moments='mvsk')
print('mean=%s, var=%s, skew=%s, kurt=%s' % (mean, var, skew, kurt))

# Display the probability density function (pdf) between the 1% and 99% quantiles:
x = np.linspace(loguniform.ppf(0.01, a, b),
                loguniform.ppf(0.99, a, b), 100)
ax.plot(x, loguniform.pdf(x, a, b),
        'r-', lw=5, alpha=0.6, label='loguniform pdf')

# Freeze the distribution and display the frozen pdf:
rv = loguniform(a, b)
ax.plot(x, rv.pdf(x), 'k-', lw=2, label='frozen pdf')

# Check accuracy of cdf and ppf. A bare np.allclose(...) in the middle of a cell is
# evaluated and thrown away, so assert on it to make the check actually do something.
vals = loguniform.ppf([0.001, 0.5, 0.999], a, b)
assert np.allclose([0.001, 0.5, 0.999], loguniform.cdf(vals, a, b))

# Generate random numbers (seeded so the histogram is identical on every run):
r = loguniform.rvs(a, b, size=1000, random_state=42)

# And compare the histogram:
ax.hist(r, density=True, histtype='stepfilled', alpha=0.2)
ax.legend(loc='best', frameon=False)
plt.show()

## scipy.sparse

### [csr_matrix](https://docs.scipy.org/doc/scipy/reference/generated/scipy.sparse.csr_matrix.html)

*Compressed Sparse Row matrix*

In [None]:
from scipy.sparse import csr_matrix

# An empty (all-zero) 3x4 sparse matrix of int8, converted to a dense array for display.
empty_csr = csr_matrix((3, 4), dtype=np.int8)
empty_csr.toarray()

In [None]:
# Coordinate-style construction: for every k, a[row[k], col[k]] = data[k].
row, col = np.array([0, 0, 1, 2, 2, 2]), np.array([0, 2, 2, 0, 1, 2])
data = np.array([1, 2, 3, 4, 5, 6])

coords = (row, col)
csr = csr_matrix((data, coords), shape=(3, 3))
csr.toarray()

In [None]:
# `csr` is the sparse matrix built in the previous cell.
csr.todense() # converts to a dense numpy matrix object (toarray() above gives an ndarray)

## imblearn.under_sampling

### [RandomUnderSampler](https://imbalanced-learn.org/stable/references/generated/imblearn.under_sampling.RandomUnderSampler.html)

*Under-sample the majority class(es) by randomly picking samples with or without replacement.*

#### Examples

In [None]:
from collections import Counter

from imblearn.under_sampling import RandomUnderSampler
from sklearn.datasets import make_classification

# Imbalanced two-class problem: weights requests roughly a 10% / 90% class split.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    flip_y=0,
    class_sep=2,
    random_state=10,
)
print('Original dataset shape %s' % Counter(y))

In [None]:
# Randomly under-sample with the default sampling strategy; the seed keeps it repeatable.
rus = RandomUnderSampler(random_state=42)
X_res, y_res = rus.fit_resample(X, y)
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)

## imblearn.over_sampling

### [RandomOverSampler](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.RandomOverSampler.html)

*Object to over-sample the minority class(es) by picking samples at random with replacement. The bootstrap can be generated in a smoothed manner.*

#### Examples

In [None]:
from collections import Counter

from imblearn.over_sampling import RandomOverSampler
from sklearn.datasets import make_classification

# Imbalanced two-class problem: weights requests roughly a 10% / 90% class split.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    flip_y=0,
    class_sep=2,
    random_state=10,
)
print('Original dataset shape %s' % Counter(y))

In [None]:
# Randomly over-sample with the default sampling strategy; the seed keeps it repeatable.
ros = RandomOverSampler(random_state=42)
X_res, y_res = ros.fit_resample(X, y)
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)

### [SMOTE](https://imbalanced-learn.org/stable/references/generated/imblearn.over_sampling.SMOTE.html)

*Class to perform over-sampling using SMOTE (Synthetic Minority Over-sampling Technique).*

#### Examples

In [None]:
from collections import Counter

from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification

# Imbalanced two-class problem: weights requests roughly a 10% / 90% class split.
X, y = make_classification(
    n_samples=1000,
    n_features=20,
    n_informative=3,
    n_redundant=1,
    n_classes=2,
    n_clusters_per_class=1,
    weights=[0.1, 0.9],
    flip_y=0,
    class_sep=2,
    random_state=10,
)
print('Original dataset shape %s' % Counter(y))

In [None]:
# Over-sample with SMOTE using its default settings; the seed keeps it repeatable.
sm = SMOTE(random_state=42)
X_res, y_res = sm.fit_resample(X, y)
resampled_counts = Counter(y_res)
print('Resampled dataset shape %s' % resampled_counts)

## warnings

### [filterwarnings](https://stackoverflow.com/questions/29086398/sklearn-turning-off-warnings)

*Turn off warnings in sklearn*

In [None]:
import warnings

from sklearn.exceptions import DataConversionWarning

# Silence only scikit-learn's DataConversionWarning; other warning categories still show.
warnings.filterwarnings('ignore', category=DataConversionWarning)