# Python For Data Science Cheat Sheet Scikit-Learn

<b>Scikit-learn</b> is an open source Python library that implements a range of machine learning, preprocessing, cross-validation and visualization algorithms using a unified interface.


### A Basic Example:

In [1]:
from sklearn import neighbors, datasets, preprocessing 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 

In [2]:
iris = datasets.load_iris() 
X, y = iris.data[:, :2], iris.target 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33) 
scaler = preprocessing.StandardScaler().fit(X_train) 
X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 
knn = neighbors.KNeighborsClassifier(n_neighbors=5) 
knn.fit(X_train, y_train) 
y_pred = knn.predict(X_test) 
accuracy_score(y_test, y_pred)

0.631578947368421

# LOADING THE DATA - Also see NumPy & Pandas
 <br>
 Your data needs to be numeric and stored as NumPy arrays or SciPy sparse matrices. Other types that are convertible to numeric arrays, such as Pandas DataFrame, are also acceptable.

In [8]:
import numpy as np 
X = np.random.random((10,5)) 
y = np.array(['M','M','F','F','M','F','M','M','F','F','F']) 
X[X < 0.7] = 0

# TRAINING AND TEST DATA

In [3]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X,y,random_state=0)

# PREPROCESSING THE DATA

### Standardization

In [4]:
from sklearn.preprocessing import StandardScaler 
scaler = StandardScaler().fit(X_train) 
standardized_X = scaler.transform(X_train) 
standardized_X_test = scaler.transform(X_test)

### Normalization 

In [5]:
from sklearn.preprocessing import Normalizer 
scaler = Normalizer().fit(X_train) 
normalized_X = scaler.transform(X_train) 
normalized_X_test = scaler.transform(X_test)

### Binarization 

In [6]:
from sklearn.preprocessing import Binarizer 
binarizer = Binarizer(threshold=0.0).fit(X) 
binary_X = binarizer.transform(X)

### Encoding Categorical Features

In [7]:
from sklearn.preprocessing import LabelEncoder 
enc = LabelEncoder() 
y = enc.fit_transform(y) 

### Imputing Missing Values

In [8]:
from sklearn.preprocessing import Imputer 
imp = Imputer(missing_values=0, strategy='mean', axis=0) 
imp.fit_transform(X_train) 



array([[-0.91090798, -1.59775374],
       [-1.0271058 ,  0.08448757],
       [ 0.59966379, -1.59775374],
       [ 0.01867465, -0.96691325],
       [ 0.48346596, -0.33607276],
       [-1.25950146,  0.29476773],
       [-1.37569929,  0.71532806],
       [-0.79471015, -1.17719341],
       [-1.14330363,  0.71532806],
       [ 2.45882905,  1.55644871],
       [-0.79471015,  0.71532806],
       [-0.79471015,  1.34616854],
       [-0.21372101, -0.33607276],
       [ 0.83205945, -0.1257926 ],
       [-0.44611666,  1.76672887],
       [ 1.41304859,  0.29476773],
       [ 0.01867465, -0.54635292],
       [ 2.22643339, -0.96691325],
       [-0.32991883, -1.17719341],
       [ 0.13487248,  0.29476773],
       [-1.0271058 ,  1.13588838],
       [-1.49189712, -1.59775374],
       [ 0.59966379, -0.54635292],
       [-1.60809495, -0.33607276],
       [-0.91090798,  1.13588838],
       [ 1.64544425, -0.1257926 ],
       [ 0.25107031,  0.71532806],
       [ 0.48346596, -1.8080339 ],
       [ 1.8778399 ,

### Generating Polynomial Features 

In [9]:
from sklearn.preprocessing import PolynomialFeatures 
poly = PolynomialFeatures(5) 
poly.fit_transform(X)    

array([[1.        , 0.86667654, 0.90308062, ..., 0.        , 0.        ,
        0.20254947],
       [1.        , 0.73602211, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [1.        , 0.91796763, 0.81555171, ..., 0.        , 0.        ,
        0.64136385],
       [1.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

# CREATE YOUR MODEL

#### Supervised Learning Estimators


In [5]:
# Linear Regression 
from sklearn.linear_model import LinearRegression
lr = LinearRegression(normalize=True)   

# Support Vector Machines (SVM) 
from sklearn.svm import SVC 
svc = SVC(kernel='linear')  

# Naive Bayes 
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()   

# KNN 
from sklearn import neighbors 
knn = neighbors.KNeighborsClassifier(n_neighbors=5)

#### Unsupervised Learning Estimators

In [11]:
# Principal Component Analysis (PCA) 
from sklearn.decomposition import PCA 
pca = PCA(n_components=0.95)   

# K Means 
from sklearn.cluster import KMeans 
k_means = KMeans(n_clusters=3, random_state=0)


# MODEL FITTING

#### Supervised Learning Estimators


In [6]:
#Fit the model data
lr.fit(X, y)
knn.fit(X_train, y_train) 
svc.fit(X_train, y_train)

SVC(kernel='linear')

#### Unsupervised Learning Estimators

In [13]:
k_means.fit(X_train) #Fit the model to the data 
pca_model = pca.fit_transform(X_train)  #Fit to data, then transform it

# PREDICTION

#### Supervised Learning Estimators


In [None]:
y_pred = svc.predict(np.random.random((2,5))) ##Predict labels 
y_pred = lr.predict(X_test) ##Predict labels 
y_pred = knn.predict_proba(X_test) ##Estimate probability of a label

#### Unsupervised Learning Estimators

In [None]:
y_pred = k_means.predict(X_test) ##Predict labels in clustering algos

# EVALUATE YOUR MODEL´S PERFORMANCE

## Classification Metrics

### Accuracy Score 

In [None]:
knn.score(X_test, y_test) 
from sklearn.metrics import accuracy_score 
accuracy_score(y_test, y_pred)  

### Classification Report

In [None]:
from sklearn.metrics import classification_report 
print(classification_report(y_test, y_pred)) 

### Confusion Matrix 

In [None]:
from sklearn.metrics import confusion_matrix 
print(confusion_matrix(y_test, y_pred))

### Cross-Validation 

In [None]:
from sklearn.cross_validation import cross_val_score 
print(cross_val_score(knn, X_train, y_train, cv=4)) 
print(cross_val_score(lr, X, y, cv=2))

## Regression Metrics

### Mean Absolute Error

In [None]:
from sklearn.metrics import mean_absolute_error 
y_true = [3, -0.5, 2] 
mean_absolute_error(y_true, y_pred) 

### Mean Squared Error 

In [None]:
from sklearn.metrics import mean_squared_error 
mean_squared_error(y_test, y_pred)

### R² Score

In [None]:
from sklearn.metrics import r2_score 
r2_score(y_true, y_pred)

## Clustering Metrics

### Adjusted Rand Index 

In [None]:
from sklearn.metrics import adjusted_rand_score 
adjusted_rand_score(y_true, y_pred)   

### Homogeneity 

In [None]:
from sklearn.metrics import homogeneity_score 
homogeneity_score(y_true, y_pred)   

### V-measure

In [None]:
from sklearn.metrics import v_measure__score 
metrics.v_measure_score(y_true, y_pred)

### Cross-Validation

In [None]:
from sklearn.cross_validation import cross_val_score 
print(cross_val_score(knn, X_train, y_train, cv=4)) 
print(cross_val_score(lr, X, y, cv=2))

# Tune Your Model (Tunning)

### Grid Search

In [None]:
from sklearn.grid_search import GridSearchCV 
params = {"n_neighbors": np.arange(1,3),"metric": ["euclidean", "cityblock"]} 
grid = GridSearchCV(estimator=knn,param_grid=params) 
grid.fit(X_train, y_train) 
print(grid.best_score_) 
print(grid.best_estimator_.n_neighbors)

### Randomized Parameter Optimization

In [None]:
from sklearn.grid_search import RandomizedSearchCV 
params = {"n_neighbors": range(1,5),"weights": ["uniform", "distance"]} 
rsearch = RandomizedSearchCV(estimator=knn,param_distributions=params,cv=4,n_iter=8,random_state=5) 
rsearch.fit(X_train, y_train) 
print(rsearch.best_score_)

# A basic Example

In [15]:
from sklearn import neighbors, datasets, preprocessing 
from sklearn.model_selection import train_test_split 
from sklearn.metrics import accuracy_score 
iris = datasets.load_iris() 
X, y = iris.data[:, :2], iris.target 
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=33) 
scaler = preprocessing.StandardScaler().fit(X_train) 
X_train = scaler.transform(X_train) 
X_test = scaler.transform(X_test) 
knn = neighbors.KNeighborsClassifier(n_neighbors=5) 
knn.fit(X_train, y_train) 
y_pred = knn.predict(X_test) 
accuracy_score(y_test, y_pred)

0.631578947368421