In [21]:
import pandas as pd
from sklearn_pandas import DataFrameMapper
from sklearn2pmml.pipeline import PMMLPipeline
from sklearn2pmml import sklearn2pmml

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

# Load the dataset
data = pd.read_csv('/Users/rohitkumarchintamani/Downloads/data_public.csv')

# Separate features and target
X = data.drop('Class', axis=1)
y = data['Class']

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42, stratify=y)

# Define the pipeline
pipeline = PMMLPipeline([
    ('mapper', DataFrameMapper([
        (X_train.columns.values, [SimpleImputer(strategy='mean'), StandardScaler()])
    ])),
    ('pca', PCA(n_components=4)),
    ('selector', SelectKBest(k=2)),
    ('classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Predictions and evaluation
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

# Optionally, export the model to PMML
# sklearn2pmml(pipeline, "ModelPipeline.pmml", with_repr=True)


              precision    recall  f1-score   support

           1       0.46      0.31      0.37     18059
           2       0.75      0.92      0.83     44989
           3       0.69      0.64      0.67     56952

    accuracy                           0.70    120000
   macro avg       0.63      0.62      0.62    120000
weighted avg       0.68      0.70      0.68    120000



In [22]:
pipeline.predict(X_test)

array([2, 2, 3, ..., 2, 3, 1])

In [23]:
y_test.values.ravel()

array([2, 3, 1, ..., 2, 3, 1])

In [24]:
print(classification_report(pipeline.predict(X_test),
                            y_test))

              precision    recall  f1-score   support

           1       0.31      0.46      0.37     12202
           2       0.92      0.75      0.83     55391
           3       0.64      0.69      0.67     52407

    accuracy                           0.70    120000
   macro avg       0.62      0.63      0.62    120000
weighted avg       0.74      0.70      0.71    120000

