<a href="https://colab.research.google.com/github/Terence0408/Teach_code/blob/master/Numerical_modeling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Python 建立 model 相關的套件有
* 數值運算: Scipy
* 機器學習: sklearn
* 文字分析: gensim
* 影像處理: opencv-python


In [0]:
import numpy as np
import pandas as pd

import warnings
warnings.filterwarnings('ignore')

## 機器學習: sklearn

In [0]:
import sklearn

### Load toy data
sklearn 隨附的資料可以到這裡看: [link](https://scikit-learn.org/stable/modules/classes.html#module-sklearn.datasets)

In [3]:
from sklearn.datasets import load_iris
Iris = load_iris()

Iris_X = Iris.data
Iris_y = Iris.target
Iris_names = Iris["feature_names"]

Iris_names

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [4]:
Iris_X[0:5]

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2]])

In [5]:
Iris_y[0:5]

array([0, 0, 0, 0, 0])

In [35]:
Iris_X.shape

(150, 4)

In [36]:
Iris_y.shape

(150,)

### Classification

In [0]:
from sklearn.datasets import load_iris
Iris = load_iris()

Iris_X = Iris.data
Iris_y = Iris.target
Iris_names = Iris["feature_names"]

#### Naive Bayes

Build model

In [7]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB(priors=None)
model.fit(Iris_X, Iris_y)

GaussianNB(priors=None, var_smoothing=1e-09)

Prediction

In [8]:
pred_y = model.predict(Iris_X)
print(Iris_y[0:5], pred_y[0:5])

[0 0 0 0 0] [0 0 0 0 0]


Performance

In [9]:
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Iris_y, pred_y))
print(confusion_matrix(Iris_y, pred_y))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       0.94      0.94      0.94        50
           2       0.94      0.94      0.94        50

    accuracy                           0.96       150
   macro avg       0.96      0.96      0.96       150
weighted avg       0.96      0.96      0.96       150

[[50  0  0]
 [ 0 47  3]
 [ 0  3 47]]


#### Decision Trees

In [10]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(Iris_X, Iris_y)

pred_y = model.predict(Iris_X)
print(Iris_y[0:5], pred_y[0:5])

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Iris_y, pred_y))
print(confusion_matrix(Iris_y, pred_y))

[0 0 0 0 0] [0 0 0 0 0]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        50

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]


#### Random Forest

In [11]:
from sklearn.ensemble import BaggingClassifier
tree = DecisionTreeClassifier()
model = BaggingClassifier(tree, 
                          n_estimators=200, 
                          max_samples=0.8,
                          random_state=1)
model.fit(Iris_X, Iris_y)

pred_y = model.predict(Iris_X)
print(Iris_y[0:5], pred_y[0:5])

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Iris_y, pred_y))
print(confusion_matrix(Iris_y, pred_y))

[0 0 0 0 0] [0 0 0 0 0]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      1.00      1.00        50
           2       1.00      1.00      1.00        50

    accuracy                           1.00       150
   macro avg       1.00      1.00      1.00       150
weighted avg       1.00      1.00      1.00       150

[[50  0  0]
 [ 0 50  0]
 [ 0  0 50]]


#### Support Vector Machines

In [12]:
from sklearn.svm import SVC
model = SVC(kernel='rbf')
model.fit(Iris_X, Iris_y)

pred_y = model.predict(Iris_X)
print(Iris_y[0:5], pred_y[0:5])

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(Iris_y, pred_y))
print(confusion_matrix(Iris_y, pred_y))

[0 0 0 0 0] [0 0 0 0 0]
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        50
           1       1.00      0.96      0.98        50
           2       0.96      1.00      0.98        50

    accuracy                           0.99       150
   macro avg       0.99      0.99      0.99       150
weighted avg       0.99      0.99      0.99       150

[[50  0  0]
 [ 0 48  2]
 [ 0  0 50]]


### Regression

In [0]:
from sklearn.datasets import load_boston
Boston = load_boston()

Boston_X = Boston.data
Boston_y = Boston.target
Boston_names = Boston["feature_names"]

#### Linear regression

Build model

In [14]:
from sklearn.linear_model import LinearRegression
model = LinearRegression(fit_intercept=True)
model.fit(Boston_X, Boston_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

Prediction

In [15]:
pred_y = model.predict(Boston_X)
print(Boston_y[0:5], pred_y[0:5])

[24.  21.6 34.7 33.4 36.2] [30.00384338 25.02556238 30.56759672 28.60703649 27.94352423]


Performance

In [16]:
from sklearn.metrics import mean_squared_error
mean_squared_error(Boston_y, pred_y)

21.894831181729202

Others: coefficient

In [17]:
print("Model slope:    ", model.coef_)
print("Model intercept:", model.intercept_)

Model slope:     [-1.08011358e-01  4.64204584e-02  2.05586264e-02  2.68673382e+00
 -1.77666112e+01  3.80986521e+00  6.92224640e-04 -1.47556685e+00
  3.06049479e-01 -1.23345939e-02 -9.52747232e-01  9.31168327e-03
 -5.24758378e-01]
Model intercept: 36.459488385090125


#### Ridge Regression

In [18]:
from sklearn.linear_model import Ridge
model = Ridge(alpha=1, fit_intercept=True)
model.fit(Boston_X, Boston_y)

pred_y = model.predict(Boston_X)
print(Boston_y[0:5], pred_y[0:5])

from sklearn.metrics import mean_squared_error
mean_squared_error(Boston_y, pred_y)

print("Model slope:    ", model.coef_)
print("Model intercept:", model.intercept_)

[24.  21.6 34.7 33.4 36.2] [30.25311604 24.80547336 30.53232402 28.91100981 28.1832052 ]
Model slope:     [-1.04595278e-01  4.74432243e-02 -8.80467889e-03  2.55239322e+00
 -1.07770146e+01  3.85400020e+00 -5.41453810e-03 -1.37265353e+00
  2.90141589e-01 -1.29116463e-02 -8.76074394e-01  9.67327945e-03
 -5.33343225e-01]
Model intercept: 31.597669818274014


#### LASSO Regression

In [19]:
from sklearn.linear_model import Lasso
model = Lasso(alpha=1, fit_intercept=True)
model.fit(Boston_X, Boston_y)

pred_y = model.predict(Boston_X)
print(Boston_y[0:5], pred_y[0:5])

from sklearn.metrics import mean_squared_error
mean_squared_error(Boston_y, pred_y)

print("Model slope:    ", model.coef_)
print("Model intercept:", model.intercept_)

[24.  21.6 34.7 33.4 36.2] [30.99753918 25.77681736 29.98601449 29.51799813 28.03248999]
Model slope:     [-0.06343729  0.04916467 -0.          0.         -0.          0.9498107
  0.02090951 -0.66879     0.26420643 -0.01521159 -0.72296636  0.00824703
 -0.76111454]
Model intercept: 41.05693374499337


#### Support Vector Regression

In [20]:
from sklearn.svm import SVR
model = SVR(kernel='linear')
model.fit(Boston_X, Boston_y)

pred_y = model.predict(Boston_X)
print(Boston_y[0:5], pred_y[0:5])

from sklearn.metrics import mean_squared_error
mean_squared_error(Boston_y, pred_y)

[24.  21.6 34.7 33.4 36.2] [28.56399526 23.75502366 29.94992127 28.73447852 28.47978597]


25.010132230270592

### Clustering

In [0]:
from sklearn.datasets import load_iris
Iris = load_iris()

Iris_X = Iris.data
Iris_y = Iris.target
Iris_names = Iris["feature_names"]

#### Kmeans

In [0]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_clusters=3)
kmeans.fit(Iris_X)

y_kmeans = kmeans.predict(Iris_X)

In [23]:
print(Iris_y)

[0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2]


In [24]:
print(y_kmeans)

[1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
 1 1 1 1 1 1 1 1 1 1 1 1 1 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
 2 2 2 0 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 0 2 0 0 0 0 2 0 0 0 0
 0 0 2 2 0 0 0 0 2 0 2 0 2 0 0 2 2 0 0 0 0 0 2 0 0 0 0 2 0 0 0 2 0 0 0 2 0
 0 2]


### Model validation

In [0]:
from sklearn.datasets import load_iris
Iris = load_iris()

Iris_X = Iris.data
Iris_y = Iris.target
Iris_names = Iris["feature_names"]

#### Split data

In [0]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(Iris_X, Iris_y,   # 放 X & y
                                                    random_state=1)   # 固定 random seed

#### Cross variation
```
較舊的版本: from sklearn.cross_validation import cross_val_score    
較新的版本: from sklearn.model_selection import cross_val_score
```

In [27]:
from sklearn.svm import SVC                                # 使用 SVM 做訓練
model = SVC(kernel='rbf')

from sklearn.model_selection import cross_val_score
cross_val_score(model,                                     # 放模型方法, ex. SVM
                Iris_X, Iris_y,                            # 放 X & y
                cv=5,                                      # 跑 5cv
                scoring='neg_mean_squared_error')          # 計算每次的 -MSE

array([-0.03333333, -0.        , -0.03333333, -0.03333333, -0.        ])

#### Leave one out

In [0]:
from sklearn.svm import SVC                                # 使用 SVM 做訓練
model = SVC(kernel='rbf')

from sklearn.model_selection import LeaveOneOut

loo = LeaveOneOut()                                        # 生成每次 trian 和 test 的 row id
for train_row, test_row in loo.split(Iris_X):
    X_train = Iris_X[train_row,:]
    y_train = Iris_y[train_row]
    X_test  = Iris_X[test_row,:]
    y_test  = Iris_y[test_row]
    
    model.fit(X_train, y_train)                             # 訓練當次模型
    y_test_model = model.predict(X_test)                    # 預測當次 test 結果


#### Grid Search

### Principal Component Analysis

In [0]:
from sklearn.datasets import load_breast_cancer
Breast = load_breast_cancer()

Breast_X = Breast.data
Breast_y = Breast.target
Breast_names = Breast["feature_names"]

In [30]:
from sklearn.decomposition import PCA
pca = PCA(n_components=2)
pca.fit(Breast_X)

PCA(copy=True, iterated_power='auto', n_components=2, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [31]:
print(pca.explained_variance_ratio_)

[0.98204467 0.01617649]


In [32]:
print(pca.components_)

[[ 5.08623202e-03  2.19657026e-03  3.50763298e-02  5.16826469e-01
   4.23694535e-06  4.05260047e-05  8.19399539e-05  4.77807775e-05
   7.07804332e-06 -2.62155251e-06  3.13742507e-04 -6.50984008e-05
   2.23634150e-03  5.57271669e-02 -8.05646029e-07  5.51918197e-06
   8.87094462e-06  3.27915009e-06 -1.24101836e-06 -8.54530832e-08
   7.15473257e-03  3.06736622e-03  4.94576447e-02  8.52063392e-01
   6.42005481e-06  1.01275937e-04  1.68928625e-04  7.36658178e-05
   1.78986262e-05  1.61356159e-06]
 [ 9.28705650e-03 -2.88160658e-03  6.27480827e-02  8.51823720e-01
  -1.48194356e-05 -2.68862249e-06  7.51419574e-05  4.63501038e-05
  -2.52430431e-05 -1.61197148e-05 -5.38692831e-05  3.48370414e-04
   8.19640791e-04  7.51112451e-03  1.49438131e-06  1.27357957e-05
   2.86921009e-05  9.36007477e-06  1.22647432e-05  2.89683790e-07
  -5.68673345e-04 -1.32152605e-02 -1.85961117e-04 -5.19742358e-01
  -7.68565692e-05 -2.56104144e-04 -1.75471479e-04 -3.05051743e-05
  -1.57042845e-04 -5.53071662e-05]]


In [33]:
X_pca = pca.transform(Breast_X)
print(X_pca[0:5])

[[1160.1425737  -293.91754364]
 [1269.12244319   15.63018184]
 [ 995.79388896   39.15674324]
 [-407.18080253  -67.38031982]
 [ 930.34118015  189.34074158]]
