# Pipeline

In [4]:
#cReate a pipeline that standardize the data then creates a model

from pandas import read_csv

from sklearn.model_selection import cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.tree import DecisionTreeClassifier

import warnings
warnings.filterwarnings('ignore')

#load data
#url = "https://github.com/dsrscientist/dataset1/blob/master/pima_indian_diabetes.csv"
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

#DataFrame
dataframe = read_csv("pima_indian_diabetes.csv", names = names)
array = dataframe.values

x= array[:,0:8]
y= array[:,8]

#create pipeline
estimators =[]
estimators.append(('Standardize',StandardScaler()))
estimators.append(('dtc',DecisionTreeClassifier()))
model = Pipeline(estimators)

#evaluate pipeline
result = cross_val_score(model, x,y, cv=5)
print(result.mean())

0.7279517867753162


# Gradient Descent Algorithm

In [6]:
# importing libraries
from sklearn import linear_model
from sklearn.datasets import load_boston
from sklearn.linear_model import SGDRegressor

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score, mean_squared_error
from sklearn.pipeline import Pipeline

import numpy as np
import pandas as pd

#loading Data
x= load_boston()
#x = pd.read_csv('BostonHousing.csv')
data =pd.DataFrame(x.data,columns = x.feature_names)
y = x.target

#creating a model
pipe = []
pipe.append(('SC', StandardScaler()))
pipe.append(('PCA', PCA(n_components=8)))
pipe.append(('SGD', SGDRegressor(alpha=0.1, learning_rate = 'optimal', max_iter=400, penalty='12'))) # Deafult
model = Pipeline(pipe)

#cross validation score
cv_results = cross_val_score(model,data, y , cv=5)
msg = "%s: %f (%f)" % ('SGDRegressor', cv_results.mean(), cv_results.std())
print(msg)

SGDRegressor: nan (nan)



    The Boston housing prices dataset has an ethical problem. You can refer to
    the documentation of this function for further details.

    The scikit-learn maintainers therefore strongly discourage the use of this
    dataset unless the purpose of the code is to study and educate about
    ethical issues in data science and machine learning.

    In this special case, you can fetch the dataset from the original
    source::

        import pandas as pd
        import numpy as np


        data_url = "http://lib.stat.cmu.edu/datasets/boston"
        raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
        data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
        target = raw_df.values[1::2, 2]

    Alternative datasets include the California housing dataset (i.e.
    :func:`~sklearn.datasets.fetch_california_housing`) and the Ames housing
    dataset. You can load the datasets as follows::

        from sklearn.datasets import fetch_california_h

In [8]:
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import load_iris

x=load_iris()
data = pd.DataFrame(x.data, columns = x.feature_names)
y = x.target

#creating model
model = SGDClassifier()

#cross validation score
cv_results = cross_val_score(model, data, y, cv=5)
msg = "%s: %f (%f)" % ('SGDClassifier', cv_results.mean(), cv_results.std())
print(msg)

SGDClassifier: 0.706667 (0.155492)


# Hyper Parameter Tuning :GridSearchCV

In [14]:
#Grid Search for Algorith Tuning

import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

#Load the diabetes datasets
dataset = datasets.load_diabetes()

#prepare a range of alpha values to test
#alphas = np.array([1,0.01,0.001,0.0001,0])
alphavalues ={'alpha':[1,0.01,0.001,0.0001,0]}

#create and fit a ridge regression model, testing each alpha
model =Ridge()

#grid = GridSearchCv(estimator=model, param_grid=dict(aplha=aplhas))
grid = GridSearchCV(estimator=model, param_grid = alphavalues)
grid.fit(dataset.data, dataset.target)
print(grid)

#summarise the result of grid serach
print(grid.best_score_)
print(grid.best_estimator_.alpha)
print(grid.best_params_)

GridSearchCV(estimator=Ridge(),
             param_grid={'alpha': [1, 0.01, 0.001, 0.0001, 0]})
0.4823231384163485
0.0001
{'alpha': 0.0001}


### Now appling the alpha in the actual dataset

In [16]:
from sklearn.linear_model import LinearRegression, Lasso, Ridge

from sklearn import datasets
from sklearn.model_selection import GridSearchCV

ds = datasets.load_diabetes()
x=ds.data
y=ds.target

lr =LinearRegression()
lr.fit(x,y)
print(x,y)
print('Score: ', lr.score(x,y))

rd=Ridge(alpha=0.0001)
rd.fit(x,y)
print(rd.coef_)
print('Score:', rd.score(x,y))

[[ 0.03807591  0.05068012  0.06169621 ... -0.00259226  0.01990842
  -0.01764613]
 [-0.00188202 -0.04464164 -0.05147406 ... -0.03949338 -0.06832974
  -0.09220405]
 [ 0.08529891  0.05068012  0.04445121 ... -0.00259226  0.00286377
  -0.02593034]
 ...
 [ 0.04170844  0.05068012 -0.01590626 ... -0.01107952 -0.04687948
   0.01549073]
 [-0.04547248 -0.04464164  0.03906215 ...  0.02655962  0.04452837
  -0.02593034]
 [-0.04547248 -0.04464164 -0.0730303  ... -0.03949338 -0.00421986
   0.00306441]] [151.  75. 141. 206. 135.  97. 138.  63. 110. 310. 101.  69. 179. 185.
 118. 171. 166. 144.  97. 168.  68.  49.  68. 245. 184. 202. 137.  85.
 131. 283. 129.  59. 341.  87.  65. 102. 265. 276. 252.  90. 100.  55.
  61.  92. 259.  53. 190. 142.  75. 142. 155. 225.  59. 104. 182. 128.
  52.  37. 170. 170.  61. 144.  52. 128.  71. 163. 150.  97. 160. 178.
  48. 270. 202. 111.  85.  42. 170. 200. 252. 113. 143.  51.  52. 210.
  65. 141.  55. 134.  42. 111.  98. 164.  48.  96.  90. 162. 150. 279.
  92.  83. 

## applying Gridsearch cv in Support vector Classifier

In [17]:
from sklearn import svm, datasets
from sklearn.model_selection import GridSearchCV
iris = datasets.load_iris()

parameters = {'kernel':('linear', 'poly', 'rbf'), 'C':[1,10]}
svc = svm.SVC()
clf = GridSearchCV(svc,parameters)
clf.fit(iris.data,iris.target)
#GridSearchCV(estimator=SVC(),
             # param_grid = {'c':[1:10], 'Kernel': ('linear', 'poly', 'rbf')} )
#sorted(clf.cv_results_.keys())

print(clf.best_score_)
print(clf.best_estimator_.kernel)
print(grid.best_params_)

0.9800000000000001
linear
{'alpha': 0.0001}


#### Apply Gridsearch cv's suggested values in the actual dataset

In [21]:
sv = svm.SVC(kernel='linear',C=1)
sv.fit(iris.data,iris.target)
t = sv.score(iris.data,iris.target)
print(round(t,2))

0.99


## applying Gridsearch cv in DecisisonTreeClassifier

In [30]:
from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV

dtc = DecisionTreeClassifier()
iris=datasets.load_iris()

grid_param = {'criterion': ['gini', 'entropy']}
#gridk={'kernel':['linear', 'poly', 'rbf']}

gd_sr = GridSearchCV(estimator =dtc,
                      param_grid = grid_param,
                      scoring = 'accuracy',
                      cv=5)
gd_sr.fit(iris.data,iris.target)

best_paramters = gd_sr.best_params_
print(best_paramters)
best_result =gd_sr.best_score_
print(best_result)
print(round(best_result,2))

{'criterion': 'gini'}
0.9600000000000002
0.96


In [31]:
dtc = DecisionTreeClassifier(criterion ='gini')
dtc.fit(iris.data,iris.target)
t = dtc.score(iris.data,iris.target)
print(round(t,2))

1.0


In [32]:
#Randomised Search CV for algorithm tuning

import numpy as np
from sklearn import datasets
from sklearn.linear_model import Ridge
from sklearn.model_selection import RandomizedSearchCV

#Load dataset
dataset = datasets.load_diabetes()

#prepare a uniform distribution to sample for the alpha parameter
param_grid= {'alpha': [1,1.0,0.01,0.001,0.0001,0]}
#create and fit a ridge regression model, testing random alpha values
model = Ridge()
rsearch = RandomizedSearchCV(estimator= model, param_distributions = param_grid)
rsearch.fit(dataset.data, dataset.target)
print(rsearch)

#summarise the results of the random parametersearch
print(rsearch.best_score_)
print(rsearch.best_estimator_.alpha)

RandomizedSearchCV(estimator=Ridge(),
                   param_distributions={'alpha': [1, 1.0, 0.01, 0.001, 0.0001,
                                                  0]})
0.4823231384163485
0.0001
