In [2]:
# spot check machine learning algorithms on the glass identification dataset
from numpy import mean
from numpy import std
from pandas import read_csv
from matplotlib import pyplot
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from imblearn.over_sampling import SMOTE

# load the dataset
def load_dataset(full_path):
	# load the dataset as a numpy array
	data = read_csv(full_path)
	# retrieve numpy array
	data = data.values
	# split into input and output elements
	X, y = data[:, :-1], data[:, -1]
	oversample = SMOTE()
	X_sm, y_sm = oversample.fit_resample(X, y)
	#Dimensionamento dos recursos
	# label encode the target variable to have the classes 0 and 1
	y = LabelEncoder().fit_transform(y_sm)
	return X_sm, y

# evaluate a model
def evaluate_model(X, y, model):
	# define evaluation procedure
	cv = KFold(n_splits=10, shuffle=False)
	# evaluate model
	scores = cross_val_score(model, X, y, cv=cv)
	return scores

# define models to test
def get_models():
	models, names = list(), list()
	# SVM
	models.append(SVC(kernel = 'linear', C=1))
	names.append('SVM')
	# KNN
	models.append(GaussianNB())
	names.append('NB')
	# RF
	models.append(RandomForestClassifier(n_estimators=10, random_state=42))
	names.append('RF')
	return models, names

# define the location of the dataset
full_path = 'Privacidade.csv'
# load the dataset
X, y = load_dataset(full_path)
# define models
models, names = get_models()
results = list()
# evaluate each model
for i in range(len(models)):
	# evaluate the model and store results
	scores = evaluate_model(X, y, models[i])
	results.append(scores)
	# summarize performance
	print('&gt;%s %.3f (%.3f)' % (names[i], mean(scores), std(scores)))

&gt;SVM 0.998 (0.006)
&gt;NB 0.842 (0.129)
&gt;RF 0.992 (0.013)
