In [3]:
# GETTING SPARK IN COLAB: https://colab.research.google.com/github/asifahmed90/pyspark-ML-in-Colab/blob/master/PySpark_Regression_Analysis.ipynb#scrollTo=9_Uz1NL4gHFx

!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget https://downloads.apache.org/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
!tar xf spark-3.0.0-bin-hadoop2.7.tgz
!pip install -q findspark

--2020-06-29 14:06:21--  https://downloads.apache.org/spark/spark-3.0.0/spark-3.0.0-bin-hadoop2.7.tgz
Resolving downloads.apache.org (downloads.apache.org)... 88.99.95.219, 2a01:4f8:10a:201a::2
Connecting to downloads.apache.org (downloads.apache.org)|88.99.95.219|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 220272364 (210M) [application/x-gzip]
Saving to: ‘spark-3.0.0-bin-hadoop2.7.tgz.1’


2020-06-29 14:06:30 (22.7 MB/s) - ‘spark-3.0.0-bin-hadoop2.7.tgz.1’ saved [220272364/220272364]



In [5]:
# INIT STUFF OF ANY SPARK CODE

import os
# Tell findspark/pyspark which JDK and which unpacked Spark distribution to use.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.0.0-bin-hadoop2.7"

# findspark.init() has to run before `import pyspark` so the package shipped
# with the Spark distribution can be found.
import findspark
findspark.init()

import pyspark 
# Local master with 4 worker threads.
conf = pyspark.SparkConf().setMaster('local[4]').setAppName('my first Spark')
# getOrCreate() returns the already-running context when this cell is executed
# again in the same kernel; constructing SparkContext(conf=conf) a second time
# raises an error because only one context may be active. Note that `conf` is
# only applied when a new context is actually created.
sc = pyspark.SparkContext.getOrCreate(conf=conf)

# OTHER STUFF

import numpy as np
import warnings


In [20]:
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn import preprocessing

# Iris data restricted to its first two feature columns, split 80/20 with a
# fixed seed so the run is reproducible.
iris = datasets.load_iris()
X_train, X_test, y_train, y_test = train_test_split(
    iris.data[:, :2],
    iris.target,
    test_size=0.2,
    random_state=42,
)

# Training rows with the class label appended as the last column.
D_train = np.hstack((X_train, y_train.reshape(-1, 1)))

# Neighbourhood size used by both the Spark and the sklearn classifier.
k = 5

# Distribute the labelled training rows and keep them cached, since the RDD
# is scanned once per test point.
RDD = sc.parallelize(D_train).cache()
k_br = sc.broadcast(k)

def euclidean_distance(element):
  """Return the squared Euclidean distance between a row and the query point.

  The last entry of ``element`` is skipped (it is the label column in the
  rows built as ``D_train``); the remaining entries are compared with the
  query point read from the global broadcast variable ``x_br``.

  No square root is taken. Ranking by the squared distance yields the same
  order as ranking by the true distance, so this is sufficient as a sort key.
  """
  offset = element[:-1] - x_br.value
  return (offset * offset).sum()

def k_nearest(data):
  """Return the ``k`` rows of ``data`` closest to the current query point.

  ``data`` may be any iterable of rows (a list or an iterator). Rows are
  ordered by ``euclidean_distance`` with a stable sort, so equally distant
  rows keep their input order. ``k`` is read from the global of that name;
  fewer than ``k`` rows are returned when ``data`` is shorter.
  """
  ranked = list(data)
  ranked.sort(key=euclidean_distance)
  return ranked[:k]

# Predict a label for every test point with a distributed brute-force k-NN.
p_labels = []
for x in X_test:
  # euclidean_distance() reads the query point from this global broadcast.
  x_br = sc.broadcast(x)
  # Two-stage selection: every partition returns its own k closest rows, then
  # the driver picks the overall k closest among those candidates. The
  # per-partition function was previously the undefined name `distances`,
  # which raises NameError on a fresh kernel.
  nn = k_nearest(RDD.mapPartitions(k_nearest).collect())
  # Majority vote over the neighbours' labels (last column of each row).
  counter = dict()
  for n in nn:
    d_k = int(n[-1])
    counter[d_k] = counter.get(d_k, 0) + 1
  # Ties go to the label seen first among the neighbours. Using counter.get
  # as the key avoids a lambda parameter that shadowed the global `k`.
  p_labels.append(max(counter, key=counter.get))

# Reference predictions from scikit-learn's brute-force k-NN, trained on the
# same split and the same k, queried one test point at a time.
neigh = KNeighborsClassifier(n_neighbors=k, algorithm="brute").fit(X_train, y_train)
s_labels = [neigh.predict([x])[0] for x in X_test]

# Number of test points on which the Spark k-NN and sklearn agree; it is
# turned into a percentage only when printed.
perc = sum(1 for mine, theirs in zip(p_labels, s_labels) if mine == theirs)

# Ground truth first, then both sets of predictions, then the agreement rate.
print("Reality is:\t", list(y_test))
print("Sklearn says:\t", s_labels)
print("And I say:\t", p_labels)
print("{:.2f}% match".format(perc*100/len(p_labels)))


Reality is:	 [1, 0, 2, 1, 1, 0, 1, 2, 1, 1, 2, 0, 0, 0, 0, 1, 2, 1, 1, 2, 0, 2, 0, 2, 2, 2, 2, 2, 0, 0]
Sklearn says:	 [1, 0, 2, 1, 1, 0, 1, 2, 1, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2, 2, 0, 1, 0, 2, 2, 1, 1, 2, 0, 0]
And I say:	 [1, 0, 2, 1, 1, 0, 1, 2, 1, 2, 2, 0, 0, 0, 0, 2, 2, 1, 2, 2, 0, 1, 0, 2, 2, 1, 1, 2, 0, 0]
100.00% match
