In [1]:
from sklearn import datasets
import pandas as pd
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from nyoka import skl_to_pmml

# Iris

In [2]:
irisdata = datasets.load_iris()
iris = pd.DataFrame(irisdata.data,columns=irisdata.feature_names)
iris['Species'] = irisdata.target

feature_names = iris.columns.drop('Species')

X_train, X_test, y_train, y_test = train_test_split(iris[iris.columns.drop('Species')], 
                                                    iris['Species'], test_size=0.33, random_state=101)

X_test.to_csv("iris_test.csv")

# auto_mpg

In [None]:
#using auto dataset
upp_names= ["mpg", "cylinders", "displacement", "horsepower", "weight", "acceleration", "model year", "origin", "car name"]

cars = pd.read_table("auto.txt", delim_whitespace=True, names=upp_names)
# cars=pd.read_table('auto-mpg.data',delim_whitespace=True)

feature_names=cars.columns[:-1]
target_names=cars.columns[-1]


X_train, X_test, y_train, y_test = train_test_split(cars[feature_names] ,cars[target_names],test_size=0.33,random_state=101)

X_test.to_csv("auto_test.csv",index=False)

# Bike rentals

In [None]:
df = pd.read_csv('bike_rental_hour.csv')
df = df.drop(['instant','dteday'],axis=1)
X = df.drop(['cnt'],axis=1)
y = df['cnt']
feature_names = [name for name in df.columns if name not in ('cnt')]

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=101)
X_test.to_csv("bike_test.csv",index=False)

# Admissions

In [None]:
adms = pd.read_csv("admissions.csv")
feature_names = adms.columns.drop("admit")
X_train, X_test, y_train, y_test = train_test_split(adms[predictors], adms["admit"], test_size=0.33,random_state=101)

X_test.to_csv(path_or_buf='adms_test_X.csv',sep=',',index=False)

# Loans

In [None]:
loans_2007 = pd.read_csv("loans_2007.csv", encoding='latin-1')

# First set of columns to drop
loans_2007 = loans_2007.drop(['id','member_id','funded_amnt','funded_amnt_inv','grade','sub_grade',
                              'emp_title','issue_d'],axis=1)

# Second set of columns to drop
loans_2007 = loans_2007.drop(['zip_code','out_prncp','out_prncp_inv','total_pymnt',
                              'total_pymnt_inv','total_rec_prncp'],axis = 1)

# Third set of columns to drop
loans_2007 = loans_2007.drop(['total_rec_int','total_rec_late_fee','recoveries','collection_recovery_fee',
                              'last_pymnt_d','last_pymnt_amnt'],axis = 1)

loans_2007 = loans_2007[(loans_2007['loan_status'] == "Fully Paid") | (loans_2007['loan_status'] == "Charged Off")]

status_replace = {
    "loan_status" : {
        "Fully Paid": 1,
        "Charged Off": 0,
    }
}

loans_2007 = loans_2007.replace(status_replace)


drop_columns = []
cols = loans_2007.columns
for col in cols:
    if len(loans_2007[col].dropna().unique()) == 1:
        drop_columns.append(col)
loans_2007 = loans_2007.drop(drop_columns, axis = 1)


loans = loans_2007.drop('pub_rec_bankruptcies' , axis = 1)
loans = loans.dropna(axis=0)

object_columns_df = loans.select_dtypes(include=['object'])

mapping_dict = {
    "emp_length": {
        "10+ years": 10,
        "9 years": 9,
        "8 years": 8,
        "7 years": 7,
        "6 years": 6,
        "5 years": 5,
        "4 years": 4,
        "3 years": 3,
        "2 years": 2,
        "1 year": 1,
        "< 1 year": 0,
        "n/a": 0
    }
}
loans = loans.drop(["last_credit_pull_d", "earliest_cr_line", "addr_state", "title"], axis=1)
loans["int_rate"] = loans["int_rate"].str.rstrip("%").astype("float")
loans["revol_util"] = loans["revol_util"].str.rstrip("%").astype("float")
loans = loans.replace(mapping_dict)


cat_columns = ["home_ownership", "verification_status", "emp_length", "purpose", "term"]
dummy_df = pd.get_dummies(loans[cat_columns])
loans = pd.concat([loans, dummy_df], axis=1)
loans = loans.drop(cat_columns, axis=1)

cols = loans.columns
feature_names = cols.drop("loan_status")


X_train, X_test, y_train, y_test = train_test_split(loans[feature_names], loans["loan_status"], test_size=0.33,random_state=101)
X_test.to_csv(path_or_buf='loans_test_X.csv',sep=',',index=False)

# Diabetes

In [None]:
# Load the data
df_data=pd.read_csv("diabetes.csv")
X=df_data.drop('Glucose',axis=1)
y=df_data['Glucose']

X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.30,random_state=42)
X_test.to_csv('test_diabetes.csv')

feature_names=X.columns
target_names='Glucose'

# Boston

In [None]:
df = datasets.load_boston()
data_frm=pd.DataFrame(data=df.data,columns=df.feature_names)

data_frm['target']=df.target
X=data_frm.drop(['target'],axis=1)
y=data_frm['target']

feature_names = X.columns

X_train, X_test, y_train, y_test = train_test_split(X,y,test_size = 0.3, random_state = 101)
target_names='target'
X_test.to_csv('test_boston.csv')

In [None]:
tita = pd.read_csv("titanic_train.csv")
from sklearn.feature_extraction.text import TfidfVectorizer

tff = TfidfVectorizer()

In [None]:
tfff = tff.fit_transform(tita['Ticket'])

In [None]:
feature_names = tff.get_feature_names()
doc = 0
feature_index = tfff[doc,:].nonzero()[1]
tfidf_scores = zip(feature_index, [tfff[doc, x] for x in feature_index])

In [None]:
for w, s in [(feature_names[i], s) for (i, s) in tfidf_scores]:
    print(w, s)

In [None]:
tff.

##  Model Training

In [3]:
from sklearn.svm import OneClassSVM, SVC
ppp = Pipeline([('model',OneClassSVM())])
ppp.fit(X_train, y_train)

Pipeline(memory=None,
     steps=[('model', OneClassSVM(cache_size=200, coef0=0.0, degree=3, gamma='auto', kernel='rbf',
      max_iter=-1, nu=0.5, random_state=None, shrinking=True, tol=0.001,
      verbose=False))])

In [4]:
skl_to_pmml(pipeline=ppp, col_names=feature_names, pmml_f_name="OneClassSVM.pmml")

In [None]:
skl_preds = ppp.predict(X_test)
adapa = pd.read_csv("test_boston-SCORED.csv")
adapa.head()

In [None]:
def convert_anomaly(val):
    if val == False:
        return 1
    else:
        return -1
    
adapa['anomaly_score'] = adapa['anomaly'].apply(convert_anomaly)

len([(i,j) for (i,j) in zip(adapa['anomaly_score'],ppp.predict(X_test)) if i!=j])

# Isolation Forest

In [5]:
from sklearn.ensemble import IsolationForest, RandomForestClassifier, GradientBoostingClassifier, RandomForestRegressor, GradientBoostingRegressor
iso = Pipeline([('model', IsolationForest(n_estimators=2, random_state=101))])
iso.fit(X_train)

Pipeline(memory=None,
     steps=[('model', IsolationForest(bootstrap=False, contamination=0.1, max_features=1.0,
        max_samples='auto', n_estimators=2, n_jobs=1, random_state=101,
        verbose=0))])

In [4]:
iso.decision_function(X_test)

array([-0.05986349,  0.01909448,  0.01909448,  0.01160753,  0.01472256,
        0.11127388, -0.03714064, -0.03254482,  0.01472256,  0.12403357,
        0.03553708, -0.10435421,  0.09002954, -0.04045226,  0.11127388,
        0.05649732, -0.07855471, -0.03254482,  0.00437426, -0.01534003,
       -0.05986349,  0.09002954, -0.03254482,  0.03142963,  0.0455294 ,
       -0.16078272, -0.10823361,  0.11127388,  0.12403357, -0.05507325,
        0.05723013, -0.03254482,  0.05438796, -0.02991178,  0.05723013,
       -0.07855471,  0.05649732,  0.01984178, -0.07855471, -0.04045226,
        0.09802523,  0.12403357,  0.12403357,  0.05723013, -0.08354759,
       -0.05507325,  0.05438796, -0.02991178, -0.02914819, -0.16078272])

In [10]:
iso.predict(X_test)

array([ 1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1, -1, -1,  1,  1,  1,  1,  1,  1,  1,
        1,  1,  1,  1,  1,  1,  1,  1,  1,  1, -1,  1,  1,  1,  1, -1])

In [5]:
model = iso.named_steps['model']
model.threshold_

-0.08019784119280912

In [6]:
firs = model.estimators_[0]

In [9]:
firs.tree_.value[0][0]

array([0.49479106])

In [6]:
skl_to_pmml(pipeline=iso, col_names=feature_names, pmml_f_name='IsolationForests.pmml')

In [5]:
import math

cn = (2*(math.log(99)+0.577215664901532860606512090082402431)) - (2*(99/100))
cn

8.364671030072245

In [7]:
model = iso.named_steps['model']
est1 = model.estimators_[0]
tr = est.tree_
tr.value[0]
print(dir(model),end='')
model.__getstate__()

NameError: name 'est' is not defined

In [5]:
from sklearn2pmml import PMMLPipeline, sklearn2pmml
siso = PMMLPipeline([('model', IsolationForest(n_estimators=1,random_state=101))])
siso.fit(X_train)

sklearn2pmml(siso, "sianomaly.pmml", with_repr=True)

In [10]:
jppml = siso.named_steps['model']
model = iso.named_steps['model']
print(jppml.threshold_)
print(model.threshold_)

-0.10823360518175151
-0.10823360518175151


In [11]:
siso.decision_function(X_test)

array([-0.10823361, -0.16078272, -0.16078272, -0.05986349, -0.0527445 ,
        0.07395471, -0.01534003, -0.00655918,  0.07395471,  0.096134  ,
       -0.10823361, -0.16078272,  0.01977555, -0.10823361,  0.07395471,
        0.0383246 , -0.00655918, -0.00655918,  0.096134  , -0.01534003,
       -0.05986349,  0.01977555, -0.00655918, -0.01534003,  0.0383246 ,
       -0.16078272, -0.10823361,  0.07395471,  0.096134  , -0.00655918,
       -0.0527445 , -0.00655918, -0.05986349,  0.0383246 , -0.0527445 ,
       -0.00655918,  0.0383246 , -0.01534003, -0.00655918, -0.10823361,
        0.0383246 ,  0.096134  ,  0.096134  , -0.0527445 , -0.01534003,
       -0.00655918, -0.05986349,  0.0383246 , -0.00655918, -0.21787188])

In [12]:
def convert_anomaly(val):
    if val == False:
        return 1
    else:
        return -1
    
adapa['anomaly_score'] = adapa['anomaly'].apply(convert_anomaly)

len([(i,j) for (i,j) in zip(adapa['anomaly_score'],iso.predict(X_test)) if i!=j])

NameError: name 'adapa' is not defined

In [None]:
adapa['anomaly_score']

In [8]:
import sys
from nyoka import PMML43Ext as pml

output = []

n = model.max_samples_

op1 = pml.OutputField(name="rawAnomalyScore", 
                optype="continuous", 
                dataType="double",
                feature="predictedValue",
                isFinalResult="false")


op2 = pml.OutputField(name="normalizedAnomalyScore",
                optype="continuous",
                dataType="double",
                feature="transformedValue",
                isFinalResult="false", 
                Apply=pml.Apply(function="/", 
                                FieldRef=[pml.FieldRef(field="rawAnomalyScore")], 
                                Constant=[pml.Constant(dataType="double",
                                                       valueOf_=(2.0*(math.log(n-1.0)+0.577215664901532860606512090082402431))-
                                                                (2.0*((n-1.0)/n)))]))

appl_inner_inner = pml.Apply(function="*")
cnst = pml.Constant(dataType="double", valueOf_=-1.0)
fldref = pml.FieldRef(field="normalizedAnomalyScore")
cnst.original_tagname_ = 'Constant'
appl_inner_inner.add_FieldRef(cnst)
appl_inner_inner.add_FieldRef(fldref)

appl_inner = pml.Apply(function='pow')
cnst = pml.Constant(dataType="double", valueOf_=2.0)
cnst.original_tagname_ = 'Constant'
appl_inner.add_FieldRef(cnst)
appl_inner_inner.original_tagname_='Apply'
appl_inner.add_FieldRef(appl_inner_inner)

appl_outer = pml.Apply(function="-")
cnst = pml.Constant(dataType="double", valueOf_=0.5)
cnst.original_tagname_ = 'Constant'
appl_outer.add_FieldRef(cnst)
appl_inner.original_tagname_='Apply'
appl_outer.add_FieldRef(appl_inner)


op3 = pml.OutputField(name="decisionFunction",
                      optype="continuous",
                      dataType="double",
                      feature="transformedValue",
                      isFinalResult="false", 
                      Apply=appl_outer)


op4 = pml.OutputField(name="outlier",
                      optype="categorical",
                      dataType="boolean",
                      feature="transformedValue",
                      isFinalResult="true", 
                      Apply=pml.Apply(function="lessOrEqual", 
                                      FieldRef=[pml.FieldRef(field="decisionFunction")],
                                      Constant=[pml.Constant(dataType="double", 
                                                             valueOf_=model.threshold_)]))

output.append(op1)
output.append(op2)
output.append(op3)
output.append(op4)


for op in output:
    op.export(sys.stdout,0)

<OutputField name="rawAnomalyScore" optype="continuous" dataType="double" feature="predictedValue"/>
<OutputField name="normalizedAnomalyScore" optype="continuous" dataType="double" feature="transformedValue">
    <Apply function="/">
        <FieldRef field="rawAnomalyScore"/>
        <Constant dataType="double">8.364671030072245</Constant>
    </Apply>
</OutputField>
<OutputField name="decisionFunction" optype="continuous" dataType="double" feature="transformedValue">
    <Apply function="-">
        <Constant dataType="double">0.5</Constant>
        <Apply function="pow">
            <Constant dataType="double">2.0</Constant>
            <Apply function="*">
                <Constant dataType="double">-1.0</Constant>
                <FieldRef field="normalizedAnomalyScore"/>
            </Apply>
        </Apply>
    </Apply>
</OutputField>
<OutputField name="outlier" optype="categorical" dataType="boolean" feature="transformedValue">
    <Apply function="lessOrEqual">
        <Field

In [9]:
output_fields = list()
n = model.max_samples_
eulers_gamma = 0.577215664901532860606512090082402431

output_fields.append(pml.OutputField(name="rawAnomalyScore", 
                                     optype="continuous", 
                                     dataType="double",
                                     feature="predictedValue",
                                     isFinalResult="false"))

output_fields.append(pml.OutputField(name="normalizedAnomalyScore",
                                     optype="continuous",
                                     dataType="double",
                                     feature="transformedValue",
                                     isFinalResult="false", 
                                     Apply=pml.Apply(function="/", 
                                                     FieldRef=[pml.FieldRef(field="rawAnomalyScore")], 
                                                     Constant=[pml.Constant(dataType="double",
                                                                            valueOf_=(2.0*(math.log(n-1.0)+eulers_gamma))-
                                                                                     (2.0*((n-1.0)/n)))])))

appl_inner_inner = pml.Apply(function="*")
cnst = pml.Constant(dataType="double", valueOf_=-1.0)
fldref = pml.FieldRef(field="normalizedAnomalyScore")
cnst.original_tagname_ = 'Constant'
appl_inner_inner.add_FieldRef(cnst)
appl_inner_inner.add_FieldRef(fldref)

appl_inner = pml.Apply(function='pow')
cnst = pml.Constant(dataType="double", valueOf_=2.0)
cnst.original_tagname_ = 'Constant'
appl_inner.add_FieldRef(cnst)
appl_inner_inner.original_tagname_='Apply'
appl_inner.add_FieldRef(appl_inner_inner)

appl_outer = pml.Apply(function="-")
cnst = pml.Constant(dataType="double", valueOf_=0.5)
cnst.original_tagname_ = 'Constant'
appl_outer.add_FieldRef(cnst)
appl_inner.original_tagname_='Apply'
appl_outer.add_FieldRef(appl_inner)

output_fields.append(pml.OutputField(name="decisionFunction",
                                     optype="continuous",
                                     dataType="double",
                                     feature="transformedValue",
                                     isFinalResult="false", 
                                     Apply=appl_outer))

output_fields.append(pml.OutputField(name="outlier",
                                     optype="categorical",
                                     dataType="boolean",
                                     feature="transformedValue",
                                     isFinalResult="true", 
                                     Apply=pml.Apply(function="lessOrEqual", 
                                                     FieldRef=[pml.FieldRef(field="decisionFunction")],
                                                     Constant=[pml.Constant(dataType="double", 
                                                                            valueOf_=model.threshold_)])))

opp = pml.Output(OutputField=output_fields)
opp.export(sys.stdout, 0)

<Output>
    <OutputField name="rawAnomalyScore" optype="continuous" dataType="double" feature="predictedValue"/>
    <OutputField name="normalizedAnomalyScore" optype="continuous" dataType="double" feature="transformedValue">
        <Apply function="/">
            <FieldRef field="rawAnomalyScore"/>
            <Constant dataType="double">8.364671030072245</Constant>
        </Apply>
    </OutputField>
    <OutputField name="decisionFunction" optype="continuous" dataType="double" feature="transformedValue">
        <Apply function="-">
            <Constant dataType="double">0.5</Constant>
            <Apply function="pow">
                <Constant dataType="double">2.0</Constant>
                <Apply function="*">
                    <Constant dataType="double">-1.0</Constant>
                    <FieldRef field="normalizedAnomalyScore"/>
                </Apply>
            </Apply>
        </Apply>
    </OutputField>
    <OutputField name="outlier" optype="categorical" dataTyp