In [1]:
import numpy as np
import requests
import time

from pyspark import SQLContext, SparkContext
from pyspark import SparkConf

from pyspark.ml.feature import StandardScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.feature import StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
from pyspark.mllib.evaluation import BinaryClassificationMetrics
from pyspark.ml.linalg import Vectors

from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.model_selection import StratifiedKFold

from pyspark.mllib.tree import GradientBoostedTrees, GradientBoostedTreesModel
from pyspark.mllib.util import MLUtils
from pyspark.mllib.regression import LabeledPoint
from pyspark.mllib.linalg import Vector as MLLibVector, Vectors as MLLibVectors

%pylab inline

In [2]:
# Load the dataset and labels
x=np.load('Bx.npy')
y=np.load('By.npy')

In [3]:
conf = SparkConf()
sc = SparkContext(conf = conf)
spark = SQLContext(sc)

In [4]:
# Calculate the rank of each feature
R=[]
for h in range(x.shape[1]):
    kmeans = KMeans(init='k-means++', n_clusters=2) #The number of clusters is set to the number of classes in the dataset
    ff=kmeans.fit_predict(x[:,h].reshape(-1,1))
    r=metrics.homogeneity_score(y,ff) #Use the homogeneity score as a rank of the feature
    R.append(r)

In [5]:
#Arrange feature accroding to thier ranks
Rnk=np.argsort(np.array(R))

In [6]:
#Initiate the cross-validation splitter
kfolds=StratifiedKFold(n_splits=5,shuffle=True)

In [7]:
#Per each set of ranks, use cross-validation to calculate accuracy.
smr=[]
et=0
for j in range(Rnk.shape[0]):
    fd=x[:,Rnk[j:]]
    pp=0
    for train,test in kfolds.split(fd,y):
        dff = map(lambda x: (int(float(x[-1])), Vectors.dense(x[:-1])),np.hstack((fd[train],y[train].reshape(-1,1))))
        TrD = spark.createDataFrame(dff,schema=["label", "features"]).rdd.map(lambda row: LabeledPoint(row.label, MLLibVectors.fromML(row.features)))
        dff = map(lambda x: (int(float(x[-1])), Vectors.dense(x[:-1])),np.hstack((fd[test],y[test].reshape(-1,1))))
        TsD = spark.createDataFrame(dff,schema=["label", "features"]).rdd.map(lambda row: LabeledPoint(row.label, MLLibVectors.fromML(row.features)))
        model = GradientBoostedTrees.trainClassifier(TrD,categoricalFeaturesInfo={})
        predictions = model.predict(TsD.map(lambda x: x.features))
        st = time.time()
        labelsAndPredictions = TsD.map(lambda lp: lp.label).zip(predictions)
        et+=time.time()-st
        acc = labelsAndPredictions.filter(lambda lp: lp[0] == lp[1]).count() / float(TsD.count())
        pp=pp+acc
    pp=pp/kfolds.n_splits
    smr.append([j, pp, et*1000000/x.shape[0]]) #Calculate the time required to predict a label per each object in uS.

In [8]:
smr

[[43.0, 0.9783, 0.7004],
 [42.0, 0.9784, 0.6417],
 [41.0, 0.9784, 0.6454],
 [40.0, 0.9783, 0.6319],
 [39.0, 0.9783, 0.6264],
 [38.0, 0.9783, 0.6465],
 [37.0, 0.9783, 0.5924],
 [36.0, 0.9783, 0.5868],
 [35.0, 0.9783, 0.5487],
 [34.0, 0.9783, 0.4774],
 [33.0, 0.9783, 0.4669],
 [32.0, 0.9783, 0.448],
 [31.0, 0.9783, 0.4151],
 [30.0, 0.979, 0.4286],
 [29.0, 0.9783, 0.4177],
 [28.0, 0.9787, 0.406],
 [27.0, 0.9785, 0.3952],
 [26.0, 0.9792, 0.3779],
 [25.0, 0.9788, 0.3642],
 [24.0, 0.9792, 0.3531],
 [23.0, 0.9782, 0.3495],
 [22.0, 0.9786, 0.3738],
 [21.0, 0.9787, 0.3368],
 [20.0, 0.9781, 0.3292],
 [19.0, 0.9781, 0.3251],
 [18.0, 0.9781, 0.3001],
 [17.0, 0.9782, 0.3005],
 [16.0, 0.9781, 0.2939],
 [15.0, 0.9781, 0.2796],
 [14.0, 0.978, 0.2822],
 [13.0, 0.9778, 0.274],
 [12.0, 0.9778, 0.2738],
 [11.0, 0.978, 0.2652],
 [10.0, 0.978, 0.2578],
 [9.0, 0.9763, 0.2521],
 [8.0, 0.9762, 0.2419],
 [7.0, 0.9763, 0.232],
 [6.0, 0.9762, 0.2244],
 [5.0, 0.9777, 0.2277],
 [4.0, 0.976, 0.2201],
 [3.0, 0.9767, 