In [11]:
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from imblearn.over_sampling import SMOTE  # For balancing
from sklearn_pandas import DataFrameMapper
from sklearn.ensemble import RandomForestClassifier
from sklearn2pmml.decoration import ContinuousDomain
from sklearn.impute import SimpleImputer
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

In [3]:
# Load the dataset
data = pd.read_csv('/Users/rohitkumarchintamani/Downloads/data_public.csv')

In [4]:
# Separate features and target
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

In [5]:
# Create combined DataFrame for training set
train_data = pd.concat([X_train, y_train], axis=1)
train_data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,Class
688780,-28.419056,-13.605521,9.702405,13.153352,-25.450617,-30.104114,3.979949,-28.409236,-8.154132,25.768482,2.802308,2.235978,-53.602854,-14.199752,1.581281,2
1116888,-30.761285,-10.346601,15.290104,23.327628,-22.745018,-24.338241,0.636401,-21.522868,-9.583754,31.554137,3.38055,6.916146,-55.357701,-3.80717,-0.740548,2
229260,-32.304853,-8.458003,13.59338,15.761076,-23.658604,-26.784405,1.066903,-25.081276,-8.772996,22.292128,2.277458,6.332846,-56.913408,-9.32793,5.185227,2
508992,-41.975919,-17.586835,13.2875,23.134427,-20.711992,-20.870517,8.660636,-20.709046,-7.671481,32.763082,2.439433,-4.908329,-54.110635,-10.694282,4.786347,2
452976,216.419806,-9.253202,224.540722,-13.407619,134.330816,68.435989,131.866624,200.633948,85.327118,138.148327,209.182604,-3.461812,-15.905022,131.194748,149.745579,1


In [6]:
# Create combined DataFrame for testing set
test_data = pd.concat([X_test, y_test], axis=1)
test_data.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,K,L,M,N,O,Class
538618,-37.099004,-17.74508,17.818461,16.123024,-29.018384,-28.122557,4.106259,-25.285297,-6.872593,16.71512,3.169589,2.535431,-55.696459,-7.345575,1.305758,2
645814,-26.393601,-13.033464,9.570214,23.270026,-20.380909,-22.729561,2.015678,-29.318864,-5.342848,25.945599,3.534727,1.033578,-52.638282,-8.785714,-5.521002,3
562698,243.769988,-9.378293,226.080055,-15.343486,132.992627,95.30418,105.978433,189.844817,87.048581,137.547162,226.831533,-4.225293,-11.566791,165.391862,141.350109,1
92301,238.988079,-12.629872,214.233916,-16.480889,129.686255,75.858704,107.341018,193.925408,78.418588,143.117509,222.013252,0.573881,-31.750628,131.609776,135.38345,3
325462,-26.571683,-13.958562,8.091048,20.099971,-25.260579,-30.129442,-0.721471,-23.916279,-8.169571,13.169121,0.671958,3.198173,-55.568459,-12.387438,-1.776283,2


In [7]:
# Define the pipeline
pipeline = PMMLPipeline([
    ('mapper', DataFrameMapper([
        (X_train.columns.values, [ContinuousDomain(), SimpleImputer(), StandardScaler()])
    ])),
    ('pca', PCA(n_components=10)),
    ('selector', SelectKBest(k=9)),
    ('classifier', RandomForestClassifier(
        random_state=42,
        n_estimators=200,
        max_depth=10,
        min_samples_split=10 
    ))
])

In [8]:
# Fit the pipeline on the training data
pipeline.fit(train_data.drop('Class', axis=1), train_data['Class'])

In [9]:
# Predictions and evaluation
print(classification_report(pipeline.predict(X_train),
                            y_train))

              precision    recall  f1-score   support

           1       0.45      0.56      0.50    129123
           2       1.00      0.75      0.86    539809
           3       0.63      0.78      0.70    411068

    accuracy                           0.74   1080000
   macro avg       0.69      0.70      0.68   1080000
weighted avg       0.79      0.74      0.75   1080000



In [13]:
sklearn2pmml(pipeline,
             'RandomForestClassifier_Pipeline.pmml')

Standard output is empty
Standard error:
Exception in thread "main" net.razorvine.pickle.PickleException: pickle stream refers to out-of-band data but no user-overridden next_buffer() method is used

	at net.razorvine.pickle.Unpickler.next_buffer(Unpickler.java:145)
	at net.razorvine.pickle.Unpickler.load_next_buffer(Unpickler.java:378)
	at net.razorvine.pickle.Unpickler.dispatch(Unpickler.java:361)
	at org.jpmml.python.CustomUnpickler.dispatch(CustomUnpickler.java:31)
	at org.jpmml.python.PickleUtil$1.dispatch(PickleUtil.java:64)
	at net.razorvine.pickle.Unpickler.load(Unpickler.java:109)
	at org.jpmml.python.PickleUtil.unpickle(PickleUtil.java:85)
	at com.sklearn2pmml.Main.run(Main.java:71)
	at com.sklearn2pmml.Main.main(Main.java:62)



RuntimeError: The SkLearn2PMML application has failed. The Java executable should have printed more information about the failure into its standard output and/or standard error streams