# A. Supervised Learning

-------------------------------------------------------------------

## 1. Linear Regression

### Boston Dataset

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Loading the Boston Dataset
# `load_boston` is deprecated (it emits the warning shown below this cell) and is
# not importable from newer scikit-learn releases. When the import fails, rebuild
# an equivalent object from the original source using the recipe in that warning.
try:
    from sklearn.datasets import load_boston
    boston_dataset = load_boston()
except ImportError:
    from sklearn.utils import Bunch

    # The raw file stores every record on two physical lines; the slicing below
    # stitches each pair back into one 13-feature row plus its target value.
    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep=r"\s+", skiprows=22, header=None)
    boston_dataset = Bunch(
        data=np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]]),
        target=raw_df.values[1::2, 2],
        feature_names=np.array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
                                'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT']),
        DESCR="Boston house prices dataset (506 instances, 13 predictive "
              "attributes), rebuilt from " + data_url,
    )


    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [None]:
# Display the whole loaded dataset object (its repr begins with the 'DESCR' entry).
boston_dataset


{'DESCR': ".. _boston_dataset:\n\nBoston house prices dataset\n---------------------------\n\n**Data Set Characteristics:**  \n\n    :Number of Instances: 506 \n\n    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.\n\n    :Attribute Information (in order):\n        - CRIM     per capita crime rate by town\n        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.\n        - INDUS    proportion of non-retail business acres per town\n        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)\n        - NOX      nitric oxides concentration (parts per 10 million)\n        - RM       average number of rooms per dwelling\n        - AGE      proportion of owner-occupied units built prior to 1940\n        - DIS      weighted distances to five Boston employment centres\n        - RAD      index of accessibility to radial highways\n        - TAX      full-value property-tax rate p

In [None]:
# Show the raw feature array; the notebook abbreviates the middle with '...'.
boston_dataset.data

array([[6.3200e-03, 1.8000e+01, 2.3100e+00, ..., 1.5300e+01, 3.9690e+02,
        4.9800e+00],
       [2.7310e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9690e+02,
        9.1400e+00],
       [2.7290e-02, 0.0000e+00, 7.0700e+00, ..., 1.7800e+01, 3.9283e+02,
        4.0300e+00],
       ...,
       [6.0760e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        5.6400e+00],
       [1.0959e-01, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9345e+02,
        6.4800e+00],
       [4.7410e-02, 0.0000e+00, 1.1930e+01, ..., 2.1000e+01, 3.9690e+02,
        7.8800e+00]])

In [None]:
# Names of the 13 feature columns; they are used below as DataFrame column labels.
print(boston_dataset['feature_names'])

['CRIM' 'ZN' 'INDUS' 'CHAS' 'NOX' 'RM' 'AGE' 'DIS' 'RAD' 'TAX' 'PTRATIO'
 'B' 'LSTAT']


In [None]:
# Print the description text bundled with the dataset (instance count, attribute meanings).
print(boston_dataset.DESCR)

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

### Preparing the dataset

In [None]:
# Wrap the raw feature array in a DataFrame whose columns carry the feature names.
df_boston_columns = boston_dataset.feature_names
df_boston = pd.DataFrame(
    data=boston_dataset.data,
    columns=df_boston_columns,
)

In [None]:
# Preview the first five rows to sanity-check the column labels and values.
df_boston.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


In [None]:
# Feature array dimensions as (rows, feature columns).
boston_dataset.data.shape

(506, 13)

In [None]:
# Target array dimensions: one value per row of the feature array.
boston_dataset.target.shape

(506,)

In [None]:
# Show the target values that the regression will learn to predict.
boston_dataset.target

array([24. , 21.6, 34.7, 33.4, 36.2, 28.7, 22.9, 27.1, 16.5, 18.9, 15. ,
       18.9, 21.7, 20.4, 18.2, 19.9, 23.1, 17.5, 20.2, 18.2, 13.6, 19.6,
       15.2, 14.5, 15.6, 13.9, 16.6, 14.8, 18.4, 21. , 12.7, 14.5, 13.2,
       13.1, 13.5, 18.9, 20. , 21. , 24.7, 30.8, 34.9, 26.6, 25.3, 24.7,
       21.2, 19.3, 20. , 16.6, 14.4, 19.4, 19.7, 20.5, 25. , 23.4, 18.9,
       35.4, 24.7, 31.6, 23.3, 19.6, 18.7, 16. , 22.2, 25. , 33. , 23.5,
       19.4, 22. , 17.4, 20.9, 24.2, 21.7, 22.8, 23.4, 24.1, 21.4, 20. ,
       20.8, 21.2, 20.3, 28. , 23.9, 24.8, 22.9, 23.9, 26.6, 22.5, 22.2,
       23.6, 28.7, 22.6, 22. , 22.9, 25. , 20.6, 28.4, 21.4, 38.7, 43.8,
       33.2, 27.5, 26.5, 18.6, 19.3, 20.1, 19.5, 19.5, 20.4, 19.8, 19.4,
       21.7, 22.8, 18.8, 18.7, 18.5, 18.3, 21.2, 19.2, 20.4, 19.3, 22. ,
       20.3, 20.5, 17.3, 18.8, 21.4, 15.7, 16.2, 18. , 14.3, 19.2, 19.6,
       23. , 18.4, 15.6, 18.1, 17.4, 17.1, 13.3, 17.8, 14. , 14.4, 13.4,
       15.6, 11.8, 13.8, 15.6, 14.6, 17.8, 15.4, 21

### As our target column is not included in the `data` attribute, we need to append it to our dataset

In [None]:
# Attach the target values as a 'Price' column so features and target sit in one frame.
# Mutates df_boston in place; re-running the cell simply overwrites the same column.
df_boston['Price'] = boston_dataset.target


In [None]:
# Display the frame, now including the appended 'Price' column.
df_boston

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,Price
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98,24.0
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.90,9.14,21.6
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.90,5.33,36.2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
501,0.06263,0.0,11.93,0.0,0.573,6.593,69.1,2.4786,1.0,273.0,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0.0,0.573,6.120,76.7,2.2875,1.0,273.0,21.0,396.90,9.08,20.6
503,0.06076,0.0,11.93,0.0,0.573,6.976,91.0,2.1675,1.0,273.0,21.0,396.90,5.64,23.9
504,0.10959,0.0,11.93,0.0,0.573,6.794,89.3,2.3889,1.0,273.0,21.0,393.45,6.48,22.0


In [None]:
# Model inputs (feature array) and expected outputs (target array) for the regression.
X_features, Y_target = boston_dataset.data, boston_dataset.target

### Importing the Linear Model

In [None]:
from sklearn.linear_model import LinearRegression
# Create the estimator with its default settings; it is not fitted until `fit` is called.
linReg = LinearRegression()

In [None]:
# Fit on the complete dataset (no hold-out yet); the same estimator object is
# fitted again on the training subset later in this section.
linReg.fit(X_features, Y_target)

LinearRegression()

#### Intercept

In [None]:
# Report the fitted intercept rounded to two decimal places.
intercept_message = 'The Intercept of the Model is {0:.2f} '.format(linReg.intercept_)
print(intercept_message)

The Intercept of the Model is 36.46 


#### Coefficient

In [None]:
# `len(linReg.coef_)` is the number of fitted coefficients (one per feature column),
# so the label says so; the previous wording read as if 13 were a coefficient value.
print('The number of Coefficients of the model is ', len(linReg.coef_))

The Coefficient of the model is  13


In [None]:
# The fitted coefficient values themselves, one per feature column (13 in total).
linReg.coef_

array([-1.08011358e-01,  4.64204584e-02,  2.05586264e-02,  2.68673382e+00,
       -1.77666112e+01,  3.80986521e+00,  6.92224640e-04, -1.47556685e+00,
        3.06049479e-01, -1.23345939e-02, -9.52747232e-01,  9.31168327e-03,
       -5.24758378e-01])

### Splitting the dataset into training and test sets to evaluate the model

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Split into training and held-out test subsets. A fixed seed makes the split,
# and therefore the predictions, MSE and score computed below, identical on
# every Restart & Run All. 20 is the seed value used elsewhere in this notebook.
x_train, x_test, y_train, y_test = train_test_split(X_features, Y_target, random_state=20)

In [None]:
# Full dataset size, shown again for comparison with the split sizes in the next cell.
boston_dataset.data.shape

(506, 13)

In [None]:
# Shapes of the four split arrays: train features, test features, train targets, test targets.
x_train.shape, x_test.shape, y_train.shape, y_test.shape

((379, 13), (127, 13), (379,), (127,))

### Fitting the training dataset to the estimator (model object)

In [None]:
# Fit the same `linReg` object again, this time on the training subset only,
# so the test rows stay unseen for evaluation.
linReg.fit(x_train, y_train)

LinearRegression()

In [None]:
# Predict a target value for every held-out test row.
predicted_values = linReg.predict(x_test)

In [None]:
# Inspect the raw predictions (one value per test row).
predicted_values

array([30.87088425, 20.92303217, 20.50455637, 16.99306519, 24.69612003,
        6.1248038 , 17.77128014, -4.86385452, 31.58096718, 30.59113821,
       16.40108361, 27.44566686, 31.63576184, 24.1358284 , 24.54333968,
       11.00881676, 20.49630805, 29.53074703, 25.60562273, 24.54526789,
        9.88663796, 19.45082036, 19.93168266, 13.50089668, 22.76673142,
       24.25392347, 20.55459928, 25.34868475, 13.63300893, 17.19415099,
       34.30722345, 36.51322768, 20.81613988, 21.31491368, 13.55049973,
       14.94194351, 30.84618879, 23.45815642, 28.10564723, 33.94279346,
        8.28688806, 33.19038745, 14.36010383, 29.20529596, 16.98663671,
       27.574777  , 24.86004626, 27.74813579, 12.82376676, 18.9669445 ,
       24.84299264, 21.45259952, 16.55754976, 23.0542641 , 20.65858765,
       17.67882289, 25.1107571 , 16.07247718, 13.28203474, 21.94854955,
       32.58216719, 31.62823435, 33.47877296, 28.59916221, 18.20251734,
       12.72905662, 28.74459992, 17.19482311, 24.88032067, 41.26

### Calculating MSE

In [None]:
# Mean squared error: average of the squared prediction-minus-actual differences
# over the test rows.
squared_errors = np.square(predicted_values - y_test)
mean_square_error = squared_errors.mean()

In [None]:
# Report the mean squared error rounded to two decimal places.
print('The Mean Square Error is %.2f'% mean_square_error)

The Mean Square Error is 27.82


### Calculating the Variance Score (R², the coefficient of determination)

In [None]:
# Evaluate the fitted model on the held-out test rows.
# NOTE(review): scikit-learn documents `score` for regressors as the coefficient of
# determination (R²), not a variance — confirm against the installed version.
variance_score = linReg.score(x_test, y_test)

In [None]:
# Report the value returned by `linReg.score`, rounded to two decimal places.
print('The Variance Score is %.2f' % variance_score)

The Variance Score is 0.64


### ---------------------------------------------------------------------------------------------------------------
### The End

#### ---------------------------------------------------------------------------------------------------------------

## 2. Logistic Regression

#### Importing necessary Libraries

In [None]:
import numpy as np
import pandas as pd

#### Importing and Load the Dataset

In [None]:
from sklearn.datasets import load_iris
# Load the iris dataset; per its description it has 150 samples, 4 numeric attributes, 3 classes.
dataset = load_iris()

In [None]:
# Check what kind of container the loader returned.
type(dataset)

sklearn.utils.Bunch

In [None]:
# List the entries available on the dataset object; these are unpacked in the next cell.
dataset.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [None]:
# Unpack the entries of the iris dataset object into individual variables.
dataset_data = dataset.data                    # feature array, shape (150, 4)
dataset_target = dataset.target                # class labels, shape (150,)
dataset_frame = dataset.frame
dataset_target_names = dataset.target_names    # names of the three classes
dataset_DESCR = dataset.DESCR                  # description text
dataset_feature_names = dataset.feature_names  # names of the four feature columns
# Fixed: this was bound to `dataset.feature_names`, so "filename" printed the
# feature names instead of the 'filename' entry listed by `dataset.keys()`.
dataset_filename = dataset.filename

In [None]:
# Print the dataset description (attributes, classes, summary statistics).
print(dataset_DESCR)

.. _iris_dataset:

Iris plants dataset
--------------------

**Data Set Characteristics:**

    :Number of Instances: 150 (50 in each of three classes)
    :Number of Attributes: 4 numeric, predictive attributes and the class
    :Attribute Information:
        - sepal length in cm
        - sepal width in cm
        - petal length in cm
        - petal width in cm
        - class:
                - Iris-Setosa
                - Iris-Versicolour
                - Iris-Virginica
                
    :Summary Statistics:

                    Min  Max   Mean    SD   Class Correlation
    sepal length:   4.3  7.9   5.84   0.83    0.7826
    sepal width:    2.0  4.4   3.05   0.43   -0.4194
    petal length:   1.0  6.9   3.76   1.76    0.9490  (high!)
    petal width:    0.1  2.5   1.20   0.76    0.9565  (high!)

    :Missing Attribute Values: None
    :Class Distribution: 33.3% for each of 3 classes.
    :Creator: R.A. Fisher
    :Donor: Michael Marshall (MARSHALL%PLU@io.arc.nasa.gov)
    :

In [None]:
# The four measured attributes, each in centimetres.
print(dataset_feature_names)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [None]:
# Print whatever `dataset_filename` was bound to in the unpacking cell above.
print(dataset_filename)

['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']


In [None]:
# Names of the three iris classes.
print(dataset_target_names)

['setosa' 'versicolor' 'virginica']


In [None]:
# Confirm features and labels have the same number of rows.
dataset_data.shape, dataset_target.shape

((150, 4), (150,))

In [None]:
# Rebind the modelling variables to the iris features and labels
# (these names previously held the Boston arrays), then show their shapes.
X_features, Y_target = dataset_data, dataset_target
print(X_features.shape, Y_target.shape)

(150, 4) (150,)


### Importing KNN model

In [None]:
from sklearn.neighbors import KNeighborsClassifier

In [None]:
# Nearest-neighbours classifier configured to consult a single neighbour (n_neighbors=1).
knn = KNeighborsClassifier(n_neighbors=1)

In [None]:
# Display the estimator to confirm its configuration.
knn

KNeighborsClassifier(n_neighbors=1)

In [None]:
# Fit on all 150 iris samples (no train/test split is made in this section).
knn.fit(X_features, Y_target)

KNeighborsClassifier(n_neighbors=1)

In [None]:
# Two hand-written samples to classify; each row supplies one value for each of
# the four iris features.
X_new = [
    [3, 5, 4, 1],
    [5, 3, 4, 2],
]

In [None]:
# Predict a class label for each row of X_new with the fitted KNN model.
knn.predict(X_new)

array([1, 1])

### Importing Logistic Regression Model

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
# Logistic regression estimator with the iteration limit set to 500.
# NOTE(review): presumably raised so the solver converges on this data — confirm.
logReg = LogisticRegression(max_iter=500)

In [None]:
# Fit on the same full iris data the KNN model was fitted on.
logReg.fit(X_features, Y_target)

LogisticRegression(max_iter=500)

In [None]:
# Classify the same two samples, for comparison with the KNN predictions above.
logReg.predict(X_new)

array([0, 1])

# B. Unsupervised Learning
# --------------------------------------------------------------------------------

## 1. K-Means Clustering

In [None]:
# importing libraries
import numpy as np
from sklearn.cluster import KMeans
from sklearn.datasets import make_blobs

In [None]:
# Settings for the synthetic clustering data: number of points and features per point.
n_samples, n_features = 300, 5
# Seed value handed to KMeans below.
random_state = 20

In [None]:
# Generate the synthetic clustering data. Pass the configured seed rather than
# None so the blobs, and the cluster labels derived from them, are the same on every run.
X, y = make_blobs(n_samples=n_samples, n_features=n_features, random_state=random_state)

In [None]:
# Cluster the points into three groups and keep the cluster index assigned to each point.
kmeans = KMeans(n_clusters=3, random_state=random_state)
predict_y = kmeans.fit_predict(X)

In [None]:
# Cluster index (0, 1 or 2) assigned to each sample.
predict_y

array([1, 2, 0, 1, 1, 2, 0, 0, 1, 2, 2, 2, 0, 0, 2, 0, 0, 1, 1, 0, 1, 1,
       0, 2, 1, 1, 0, 2, 0, 0, 0, 0, 0, 0, 2, 1, 2, 2, 2, 0, 2, 1, 2, 0,
       2, 2, 1, 2, 1, 0, 1, 1, 0, 0, 1, 2, 2, 1, 2, 0, 2, 2, 0, 2, 1, 0,
       1, 2, 0, 2, 1, 1, 2, 1, 1, 1, 1, 0, 2, 1, 1, 2, 1, 0, 1, 2, 2, 2,
       0, 2, 0, 0, 2, 2, 1, 1, 2, 2, 0, 2, 2, 1, 2, 2, 0, 0, 0, 0, 0, 1,
       0, 1, 1, 2, 0, 2, 1, 2, 1, 2, 1, 0, 2, 0, 1, 1, 2, 0, 2, 1, 1, 0,
       2, 2, 1, 1, 0, 0, 2, 2, 1, 0, 2, 0, 2, 0, 2, 1, 1, 0, 0, 0, 1, 2,
       1, 1, 0, 2, 2, 2, 1, 1, 2, 0, 1, 0, 2, 1, 0, 0, 1, 2, 0, 2, 2, 2,
       2, 1, 0, 0, 2, 2, 1, 0, 0, 2, 0, 0, 2, 1, 0, 1, 0, 0, 2, 2, 2, 1,
       0, 1, 1, 2, 2, 0, 0, 2, 2, 1, 2, 1, 0, 0, 1, 0, 1, 0, 1, 1, 2, 1,
       0, 2, 1, 0, 2, 2, 0, 0, 2, 0, 0, 1, 1, 0, 0, 2, 0, 1, 0, 0, 2, 1,
       2, 1, 1, 0, 2, 2, 0, 0, 0, 0, 1, 1, 2, 1, 0, 1, 1, 0, 1, 0, 1, 0,
       2, 2, 1, 2, 1, 1, 1, 0, 2, 1, 0, 1, 2, 1, 0, 2, 1, 0, 0, 1, 0, 2,
       2, 2, 1, 1, 1, 1, 2, 1, 1, 0, 0, 2, 2, 1], d

## 2. PCA Reduction

In [None]:
# Importing Libraries

from sklearn.decomposition import PCA
from sklearn.datasets import make_blobs

In [None]:
# Settings for the synthetic PCA data: number of points and features per point.
n_samples, n_features = 20, 10
random_state = 20  # seed value

In [None]:
# Generate the synthetic data for PCA. Use the seed defined in the settings cell
# (it was previously ignored in favour of None) so the components and explained
# variance below are reproducible.
X, Y = make_blobs(n_samples=n_samples, n_features=n_features, random_state=random_state)

In [None]:
# Confirm the generated data has n_samples rows and n_features columns.
X.shape, Y.shape

((20, 10), (20,))

In [None]:
# Keep three components: transforming X with this estimator reduces it from
# shape (20, 10) to (20, 3).
pca = PCA(n_components = 3)

In [None]:
# Fit the PCA estimator on X; the fitted components and variance ratios are inspected below.
pca.fit(X)

PCA(n_components=3)

In [None]:
# Explained-variance ratio of each of the three retained components.
print(pca.explained_variance_ratio_)

[0.69553711 0.27431434 0.0105581 ]


In [None]:
# The fitted components: three rows, each with one entry per original feature (10).
pca.components_

array([[-2.35525190e-01,  3.94543846e-01,  2.60031823e-01,
        -1.14136853e-01,  1.47993210e-01,  2.76241624e-02,
        -2.05000468e-01,  6.92036572e-01, -4.00360854e-01,
         6.57690000e-02],
       [ 4.58170490e-01, -1.49795892e-01,  7.03277039e-01,
        -4.04397293e-02,  6.44148762e-02,  4.80437209e-01,
         1.08821985e-01, -3.56480638e-04,  2.63765242e-02,
        -1.54595495e-01],
       [ 6.74785219e-01,  6.66834807e-01, -2.10179482e-01,
         6.50579937e-02,  3.03639214e-02, -5.51235587e-02,
        -1.07955045e-01, -1.23287795e-01, -2.18820373e-02,
         1.42459539e-01]])

In [None]:
# Pull out the first row of the components array (the first component) and print it.
first_component = pca.components_[0, :]
print(first_component)

[-0.23552519  0.39454385  0.26003182 -0.11413685  0.14799321  0.02762416
 -0.20500047  0.69203657 -0.40036085  0.065769  ]


In [None]:
# Apply the fitted dimensionality reduction to the dataset: project X onto the
# three retained components.

pca_reduced = pca.transform(X)

In [None]:
# Each row is one sample expressed in the three retained components.
pca_reduced

array([[-10.9310067 ,  -5.78187658,   0.25771934],
       [ -3.79182598,  11.66045958,   0.28167928],
       [-10.38073044,  -6.05964454,  -0.35431495],
       [ 15.20719855,  -1.79471794,   1.1397283 ],
       [ -9.26692519,  -8.0995793 ,  -1.11823716],
       [-11.72460063,  -7.69840596,  -2.95226042],
       [ 15.64369734,  -1.8357064 ,  -1.9299112 ],
       [ 15.44634656,  -1.57075109,  -1.33490883],
       [-10.81239152,  -9.00809945,   1.56782331],
       [ -6.10498688,   9.65690889,  -0.30076617],
       [ -7.67304442,   9.35068208,   0.3179782 ],
       [-11.40424434,  -7.30544542,   3.09121145],
       [ 15.82682394,  -2.66521908,  -0.94308446],
       [ -5.74229607,   9.77728189,   0.13173145],
       [ 16.31586481,  -0.48075704,   2.62581415],
       [ -6.71874661,  11.80329683,  -1.62914351],
       [ -5.37487827,  10.89450494,   1.14586235],
       [ 16.07488045,  -0.70144005,  -0.24942731],
       [ 15.41048287,  -2.48136435,   0.63380048],
       [ -9.99961747,  -7.66012

In [None]:
# Verify the reduction: still 20 rows, now 3 columns instead of 10.
pca_reduced.shape

(20, 3)

# C. Pipelines

In [None]:
# importing Libraries
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA

In [None]:
# Ordered (name, estimator) pairs: PCA first, then linear regression.
estimators = [
    ('dim_reduction', PCA()),
    ('linear_model', LinearRegression()),
]

In [None]:
# Chain the named steps into a single pipeline object.
pipeline_estimator = Pipeline(estimators)

In [None]:
# Display the pipeline to confirm its steps and their order.
pipeline_estimator

Pipeline(steps=[('dim_reduction', PCA()), ('linear_model', LinearRegression())])

In [None]:
# The pipeline's steps as a list of (name, estimator) pairs.
pipeline_estimator.steps

[('dim_reduction', PCA()), ('linear_model', LinearRegression())]