In [None]:
Create a Simple Estimator

In [1]:
import numpy as np
import pandas as pd

from sklearn.datasets import load_breast_cancer

bc = load_breast_cancer() 

new_feature_names = ['_'.join(ele.split()) for ele in bc.feature_names]

X = pd.DataFrame(bc.data,columns = new_feature_names)
y = bc.target

In [2]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=7, stratify = y)

In [3]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import Ridge
 
class RidgeClassifier(BaseEstimator, ClassifierMixin):
    
    """A Classifier made from Ridge Regression"""
    
    def __init__(self,alpha=0):
        self.alpha = alpha
        
    def fit(self, X, y = None):
        #pass along the alpha parameter to the internal ridge estimator and perform a fit using it
        self.ridge_regressor = Ridge(alpha = self.alpha) 
        self.ridge_regressor.fit(X, y)
        
        #save the seen class labels
        self.class_labels = np.unique(y)
        
        return self
    
    def predict(self, X_test):
        #store the results of the internal ridge regressor estimator
        results = self.ridge_regressor.predict(X_test)
        
        #find the nearest class label
        return np.array([self.class_labels[np.abs(self.class_labels - x).argmin()] for x in results])

In [4]:
r_classifier = RidgeClassifier(1.5)     
r_classifier.fit(X_train, y_train)
r_classifier.score(X_test, y_test)

0.95744680851063835

In [5]:
from sklearn.model_selection import GridSearchCV

param_grid = {'alpha': [0,0.5,1.0,1.5,2.0]}
gs_rc = GridSearchCV(RidgeClassifier(), param_grid, cv = 3).fit(X_train, y_train)

gs_rc.grid_scores_



[mean: 0.94751, std: 0.00399, params: {'alpha': 0},
 mean: 0.95801, std: 0.01010, params: {'alpha': 0.5},
 mean: 0.96063, std: 0.01140, params: {'alpha': 1.0},
 mean: 0.96063, std: 0.01140, params: {'alpha': 1.5},
 mean: 0.96063, std: 0.01140, params: {'alpha': 2.0}]

In [8]:
from sklearn.linear_model import LogisticRegression

lr = LogisticRegression()
lr.fit(X_train,y_train)
lr.score(X_test,y_test)

0.9521276595744681

In [9]:
import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.base import BaseEstimator, ClassifierMixin


from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.linear_model import Ridge

class GEEClassifier(BaseEstimator, ClassifierMixin):
    
    """A Classifier made from statsmodels' Generalized Estimating Equations
    
    documentation available at: http://www.statsmodels.org/dev/gee.html
       
    """
    
    def __init__(self,group_by_feature):
        self.group_by_feature = group_by_feature
          
    def fit(self, X, y = None):
        #Same settings as the documentation's example: 
        self.fam = sm.families.Poisson()
        self.ind = sm.cov_struct.Exchangeable()
        
        #Auxiliary function: only used in this method within the class
        def expand_X(X, y, desired_group): 
            X_plus = X.copy()
            X_plus['y'] = y
    
            #roughly make ten groups
            X_plus[desired_group + '_group'] = (X_plus[desired_group] * 10)//10
    
            return X_plus
        
        #save the seen class labels
        self.class_labels = np.unique(y)
        
        dataframe_feature_names = X.columns
        not_group_by_features = [x for x in dataframe_feature_names if x != self.group_by_feature]
        
        formula_in = 'y ~ ' + ' + '.join(not_group_by_features)
        
        data = expand_X(X,y,self.group_by_feature)
        self.mod = smf.gee(formula_in, 
                           self.group_by_feature + "_group", 
                           data, 
                           cov_struct=self.ind, 
                           family=self.fam)
        
        self.res = self.mod.fit()
        
        return self
    
    def predict(self, X_test):
        #store the results of the internal GEE regressor estimator
        results = self.res.predict(X_test)
        
        #find the nearest class label
        return np.array([self.class_labels[np.abs(self.class_labels - x).argmin()] for x in results])
        
    def print_fit_summary(self):
        print res.summary()
        return self

  from pandas.core import datetools


In [10]:
gee_classifier = GEEClassifier('mean_concavity')     
gee_classifier.fit(X_train, y_train)
gee_classifier.score(X_test, y_test)

0.94680851063829785

In [11]:
import numpy as np
import pandas as pd

data_web_address = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

column_names = ['pregnancy_x', 
                'plasma_con', 
                'blood_pressure', 
                'skin_mm', 
                'insulin', 
                'bmi', 
                'pedigree_func', 
                'age', 
                'target']

feature_names = column_names[:-1]
all_data = pd.read_csv(data_web_address , names=column_names)


X = all_data[feature_names]
y = all_data['target']


from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=7,stratify=y)

In [12]:
gee_classifier = GEEClassifier('blood_pressure')     
gee_classifier.fit(X_train, y_train)
gee_classifier.score(X_test, y_test)

0.80519480519480524

In [13]:
r_classifier = RidgeClassifier()     
r_classifier.fit(X_train, y_train)
r_classifier.score(X_test, y_test)

0.76623376623376627

In [14]:
import pickle

f = open('rc_inst.save','wb')
pickle.dump(r_classifier, f, protocol = pickle.HIGHEST_PROTOCOL)
f.close()

In [15]:
import pickle

f = open('rc_inst.save','rb')
r_classifier = pickle.load(f)
f.close()

In [16]:
r_classifier.fit(X_train, y_train)
r_classifier.score(X_test, y_test)

0.76623376623376627