In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
%matplotlib widget
from sklearn import ensemble, feature_selection, preprocessing, model_selection, neighbors, svm

## Load the dataset

In [20]:
# Relative path of the CSV file to analyse; row 0 of the file supplies the column names.
input_file = "data.csv"
df = pd.read_csv(filepath_or_buffer=input_file, header=0)

In [21]:
# Split the raw table into the feature matrix X and the integer label vector y.
dataset = df.values
X = dataset[:, 1:7]            # columns 1..6 are used as the features
y = dataset[:, 7].astype(int)  # column 7 holds the labels, cast to int

## Apply tree-based feature selection

In [22]:
# Names of the columns at positions 1..6 — the same slice that is used to build X.
col = df.columns[1:7].tolist()

In [23]:
# Average the ExtraTrees feature importances over 1000 forests, one per seed 0..999.
# After the loop, `clf` still refers to the forest fitted with the last seed.
n_runs = 1000
imp = np.zeros(6)
for j in range(n_runs):
    clf = ensemble.ExtraTreesClassifier(n_estimators=50, random_state=j).fit(X, y)
    imp += clf.feature_importances_
imp /= n_runs

array([0.15940957, 0.13788721, 0.17424087, 0.16455568, 0.17854944,
       0.18535723])

In [24]:
# Report each feature name next to its averaged importance.
for name, importance in zip(col, imp):
    print(f"{name}: {importance}")

# Liquid Flow Rate (mL/min) : 0.15940956903706738
 Atomization O2 Flow Rate (L/min) : 0.1378872132834783
 Sheath O2 Flow Rate (L/min) : 0.17424087331724217
 ER1 (EtOH/AtomO2) : 0.16455567640291435
 ER2 (EtOH/AtomO2+SheathO2): 0.17854944013682392
Burner Cap Temperature: 0.18535722782247394


In [25]:
# Standardise the features: the scaler is fitted on X and then applied to that same X.
sc = preprocessing.StandardScaler().fit(X)
X_new = sc.transform(X)

In [26]:
# Reduce the scaled feature matrix to the columns chosen by SelectFromModel, using the
# already-fitted forest `clf` (prefit=True, so the selector is not refitted here).
# NOTE(review): `clf` is the forest left over from the last iteration of the importance
# loop, not the averaged `imp` — confirm that this is the intended selector.
model = feature_selection.SelectFromModel(clf, prefit=True)
# Start from a fresh scaling of X instead of from X_new itself, so that re-running this
# cell does not feed the already-reduced matrix back into the selector.
X_new = model.transform(sc.transform(X))

## Experimenting with models

In [37]:
# Estimate k-nearest-neighbours accuracy with repeated random 80/20 hold-out splits,
# one split per seed, and report the mean train / hold-out accuracy as a percentage.
# NOTE(review): X_new was scaled and feature-selected on the full dataset before the
# split, so the hold-out rows influenced the preprocessing — confirm this is acceptable.
n_splits = 1000
train_scores = []
cross_scores = []
for i in range(n_splits):
    X_train, X_cross, y_train, y_cross = model_selection.train_test_split(
        X_new, y, train_size=.8, random_state=i
    )
    neigh = neighbors.KNeighborsClassifier()
    neigh.fit(X_train, y_train)
    train_scores.append(neigh.score(X_train, y_train))
    cross_scores.append(neigh.score(X_cross, y_cross))
# Percentage is derived from the collected scores, so it stays correct if n_splits changes
# (a fixed divisor would only be right for one specific number of repeats).
score_train = 100 * np.mean(train_scores)
print('Train Accuracy: %.2f' % score_train)
score_cross = 100 * np.mean(cross_scores)
print('Cross Accuracy: %.2f' % score_cross)

Train Accuracy: 79.15
Cross Accuracy: 61.29


In [38]:
# Estimate SVC accuracy with repeated random 80/20 hold-out splits, one split per seed,
# and report the mean train / hold-out accuracy as a percentage.
# NOTE(review): X_new was scaled and feature-selected on the full dataset before the
# split, so the hold-out rows influenced the preprocessing — confirm this is acceptable.
n_splits = 1000
train_scores = []
cross_scores = []
for i in range(n_splits):
    X_train, X_cross, y_train, y_cross = model_selection.train_test_split(
        X_new, y, train_size=.8, random_state=i
    )
    clf3 = svm.SVC(random_state=1)
    clf3.fit(X_train, y_train)
    train_scores.append(clf3.score(X_train, y_train))
    cross_scores.append(clf3.score(X_cross, y_cross))
# Percentage is derived from the collected scores, so it stays correct if n_splits changes
# (a fixed divisor would only be right for one specific number of repeats).
score_train = 100 * np.mean(train_scores)
print('Train Accuracy: %.2f' % score_train)
score_cross = 100 * np.mean(cross_scores)
print('Cross Accuracy: %.2f' % score_cross)




Train Accuracy: 79.59
Cross Accuracy: 61.12
