In [8]:
import pickle
from sklearn.base import TransformerMixin
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge

In [9]:
class RidgeTransformer(Ridge, TransformerMixin):
    
    def transform(self, X, *_):
        return self.predict(X).reshape(-1,1)
    
class DecisionTreeTransformer(DecisionTreeRegressor, TransformerMixin):
    
    def transform(self, X, *_):
        return self.predict(X).reshape(-1,1)


class KNeighborsTransformer(KNeighborsRegressor, TransformerMixin):
    
    def transform(self, X, *_):
        return self.predict(X).reshape(-1,1)

In [10]:
if __name__ == '__main__':
    with open('age.pickle', 'wb') as f:
        print('Build and fit a model...')
        import pandas as pd
        import numpy as np
        import time
        from sklearn.metrics import mean_squared_error, r2_score
        from sklearn.model_selection import cross_val_score
        from sklearn.impute import SimpleImputer
        from sklearn.preprocessing import LabelEncoder        
        from sklearn.model_selection import train_test_split
        from test import RidgeTransformer
        from test import DecisionTreeTransformer
        from test import KNeighborsTransformer
        
        df = pd.read_csv('../data/age_structure.csv')
        
        df.replace(' ', np.nan, inplace=True)
        
        df = df.dropna(axis=1, how='all')
        
        imp=SimpleImputer(missing_values=np.NaN)
        idf=pd.DataFrame(imp.fit_transform(df))
        idf.columns=df.columns
        idf.index=df.index
        
        idf['YEAR_ENCODED'] = LabelEncoder().fit_transform(idf.YEAR)
        idf['DISTRICT_ENCODED'] = LabelEncoder().fit_transform(idf.DISTRICT)
        
        research_features = ['YEAR_ENCODED', 'DISTRICT_ENCODED', 'AGE_CAT', 'GENDER']
        X = idf[research_features]
        y = idf['COUNT']
        
        X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1,test_size=0.2)
        
        model = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('poly_feats', PolynomialFeatures()),
        ('dst', DecisionTreeTransformer()),
        ('lin_regr', LinearRegression())
        ])

        begin = time.time()
        model_name = 'Custom-RDK-Model'
        model.fit(X_train,y_train)
        print(model_name + ' Train time: ' + str((time.time() - begin)/60) + " minutes")
        preds = model.predict(X_test)
        mse = mean_squared_error(y_test,preds)
        r2 = r2_score(y_test,preds)
        scores = cross_val_score(model, X_train, y_train, cv=5)
        print(model_name + ' MSE: ' + str(mse))
        print(model_name + ' R2 ' + str(r2))
        print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
        
        pickle.dump(model, f)
        print('pickle dumped')

Build and fit a model...


ImportError: cannot import name 'RidgeTransformer' from 'test' (/home/asitha/anaconda3/lib/python3.7/test/__init__.py)

## Check all remaining linear and non-linear regressors (including which do not support extrapolating) 
### NOTE: Non-linear Regressors examples
#### Decision Trees
Decision Trees, also referred to as Classification and Regression Trees (CART), work for both categorical and continuous input and output variables. It works by splitting the data into two or more homogeneous sets based on the most significant splitter among the independent variables. The best differentiator is the one that minimizes the cost metric. The cost metrics for a classification tree is often the entropy or the gini index, whereas, for a regression tree, the default metric is the mean squared error.
#### Random Forest
Decision Trees are useful, but the problem is that they often tend to overfit the training data leading to high variances in the test data. Random Forest algorithms overcome this shortcoming by reducing the variance of the decision trees. They are called 'Forest' because they are the collection, or ensemble, of several decision trees. One major difference between a Decision Tree and a Random Forest model is on how the splits happen. In Random Forest, instead of trying splits on all the features, a sample of features is selected for each split, thereby reducing the variance of the model

In [None]:
#import sklearn regression models
from sklearn.tree import DecisionTreeRegressor
from sklearn.gaussian_process import GaussianProcessRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR, NuSVR, LinearSVR
from sklearn.kernel_ridge import KernelRidge
from sklearn.ensemble import RandomForestRegressor

#Dictionary of all models. All models intialized with no args. Can modify any of them to test various args.

models = {
          'GaussianProcessRegressor': GaussianProcessRegressor(),
          'KernelRidge': KernelRidge(),
          'DecisionTreeRegressor': DecisionTreeRegressor(),
          'KNeighborsRegressor': KNeighborsRegressor(),
          'SVR': SVR(gamma='scale'),
          'NuSVR': NuSVR(gamma='scale'),
          'LinearSVR': LinearSVR(),
          'RandomForestRegressor': RandomForestRegressor()
         }

In [None]:
#Import non-sklearn packages
import numpy as np
import time
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score

print_coef = False

for key, model in models.items():
    begin = time.time()
    model.fit(X_train,y_train)
    print(key + ' Train time: ' + str((time.time() - begin)/60) + " minutes")
    preds = model.predict(X_test)
    mse = mean_squared_error(y_test,preds)
    r2 = r2_score(y_test,preds)
    scores = cross_val_score(model, X_train, y_train, cv=5)
    print(key + ' MSE: ' + str(mse))
    print(key + ' R2 ' + str(r2))
    print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
    if print_coef:
        print('Coefficients:')
        print(model.coef_)
    print('')

## Trying Ensemble methods
The goal of ensemble methods is to combine the predictions of several base estimators built with a given learning algorithm in order to improve generalizability / robustness over a single estimator.

from sklearn.base import TransformerMixin

In [None]:
class MLPTransformer(MLPRegressor, TransformerMixin):
    
    def transform(self, X, *_):
        return self.predict(X).reshape(-1,1)
    
class RidgeTransformer(Ridge, TransformerMixin):
    
    def transform(self, X, *_):
        return self.predict(X).reshape(-1,1)

class ElasticNetTransformer(ElasticNet, TransformerMixin):
    
    def transform(self, X, *_):
        return self.predict(X).reshape(-1,1)
    
class OrthogonalMatchingPursuitTransformer(OrthogonalMatchingPursuit, TransformerMixin):
    
    def transform(self, X, *_):
        return self.predict(X).reshape(-1,1)

  

In [None]:
def build_model():
    ridge_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('poly_feats', PolynomialFeatures()),
        ('ridge', OrthogonalMatchingPursuitTransformer())
    ])

    pred_union = FeatureUnion(
        transformer_list=[
            ('ridge', ridge_transformer),
            ('enet', ElasticNetTransformer())
        ],
        n_jobs=2
    )

    model = Pipeline(steps=[
        ('pred_union', pred_union),
        ('lin_regr', LinearRegression())
    ])

    return model

In [None]:
begin = time.time()
model_2 = build_model()
model_2_name = 'Custom-RDK-Model'
model_2.fit(X_train,y_train)
print(model_2_name + ' Train time: ' + str((time.time() - begin)/60) + " minutes")
preds = model_2.predict(X_test)
mse = mean_squared_error(y_test,preds)
r2 = r2_score(y_test,preds)
scores = cross_val_score(model_2, X_train, y_train, cv=5)
print(model_2_name + ' MSE: ' + str(mse))
print(model_2_name + ' R2 ' + str(r2))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

### NOTE: Combination of Ridge, DecisionTrees and KNN works

In [None]:
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import make_pipeline
mlp = make_pipeline(StandardScaler(),KNeighborsTransformer(),
                    MLPRegressor(hidden_layer_sizes=(200, 1000),
                                 tol=1e-2, max_iter=500,activation='relu', \
                            random_state=0)

begin = time.time()
model_2 = mlp
model_2_name = 'MLPRegressor'
model_2.fit(X_train,y_train)
print(model_2_name + ' Train time: ' + str((time.time() - begin)/60) + " minutes")
preds = model_2.predict(X_test)
mse = mean_squared_error(y_test,preds)
r2 = r2_score(y_test,preds)
scores = cross_val_score(model_2, X_train, y_train, cv=5)
print(model_2_name + ' MSE: ' + str(mse))
print(model_2_name + ' R2 ' + str(r2))
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))