Find file Copy path
Fetching contributors…
Cannot retrieve contributors at this time
70 lines (56 sloc) 2.46 KB
import pandas as pd
import numpy as np
import pickle
import sklearn.ensemble as ske
from sklearn import cross_validation, tree, linear_model
from sklearn.feature_selection import SelectFromModel
from sklearn.externals import joblib
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix
data = pd.read_csv('data.csv', sep='|')
X = data.drop(['Name', 'md5', 'legitimate'], axis=1).values
y = data['legitimate'].values
print('Researching important feature based on %i total features\n' % X.shape[1])
# Feature selection using Trees Classifier
fsel = ske.ExtraTreesClassifier().fit(X, y)
model = SelectFromModel(fsel, prefit=True)
X_new = model.transform(X)
nb_features = X_new.shape[1]
X_train, X_test, y_train, y_test = cross_validation.train_test_split(X_new, y ,test_size=0.2)
features = []
print('%i features identified as important:' % nb_features)
indices = np.argsort(fsel.feature_importances_)[::-1][:nb_features]
for f in range(nb_features):
print("%d. feature %s (%f)" % (f + 1, data.columns[2+indices[f]], fsel.feature_importances_[indices[f]]))
# XXX : take care of the feature order
for f in sorted(np.argsort(fsel.feature_importances_)[::-1][:nb_features]):
#Algorithm comparison
algorithms = {
"DecisionTree": tree.DecisionTreeClassifier(max_depth=10),
"RandomForest": ske.RandomForestClassifier(n_estimators=50),
"GradientBoosting": ske.GradientBoostingClassifier(n_estimators=50),
"AdaBoost": ske.AdaBoostClassifier(n_estimators=100),
"GNB": GaussianNB()
results = {}
print("\nNow testing algorithms")
for algo in algorithms:
clf = algorithms[algo], y_train)
score = clf.score(X_test, y_test)
print("%s : %f %%" % (algo, score*100))
results[algo] = score
winner = max(results, key=results.get)
print('\nWinner algorithm is %s with a %f %% success' % (winner, results[winner]*100))
# Save the algorithm and the feature list for later predictions
print('Saving algorithm and feature list in classifier directory...')
joblib.dump(algorithms[winner], 'classifier/classifier.pkl')
open('classifier/features.pkl', 'w').write(pickle.dumps(features))
# Identify false and true positive rates
clf = algorithms[winner]
res = clf.predict(X_test)
mt = confusion_matrix(y_test, res)
print("False positive rate : %f %%" % ((mt[0][1] / float(sum(mt[0])))*100))
print('False negative rate : %f %%' % ( (mt[1][0] / float(sum(mt[1]))*100)))