## **Implementacja algorytmu kkN**

### **1. Wczytanie zbioru**

In [50]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark

In [51]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

import findspark
findspark.init()

In [73]:
from functools import reduce
import math
from typing import Optional, Tuple, List

import pandas as pd
import numpy as np
from geopy.distance import geodesic

from pyspark.sql import SparkSession, DataFrame as SparkDataFrame, functions as F
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType, StructField, Row

from google.colab import drive

In [53]:
spark = (
         SparkSession.builder
        .master("local")
        .appName("Colab")
        .config('spark.ui.port', '4050')
        .getOrCreate()
)
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '100g'), ('spark.driver.memory','64g')])
spark.conf.set("park.driver.maxResultSize", "80g")

spark.conf.set('spark.sql.execution.arrow.enabled', 'true')

In [54]:
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [55]:
columns = ['lon', 'lat', 'Date', 'Rainf', 'Evap', 'AvgSurfT', 'Albedo','SoilT_10_40cm', 'GVEG', 'PotEvap', 'RootMoist', 'SoilM_100_200cm']

# Utworzenie schematu określającego typ zmiennych
schema = StructType()
for i in columns:
  if i == "Date":
    schema = schema.add(i, IntegerType(), True)
  else:
    schema = schema.add(i, FloatType(), True)

In [56]:
%%time
# Wczytanie zbioru Nasa w sparku

nasa = spark.read.format('csv').option("header", True).schema(schema).load('/content/drive/MyDrive/BigMess/NASA/NASA.csv')
nasa.createOrReplaceTempView("nasa")

nasa = (
    nasa
    .withColumn('Year', (F.col('Date') / 100).cast('int'))
    .withColumn('Month', F.col('Date') % 100)
    .drop('Date')
)
nasa.show(5)

+---------+-------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+----+-----+
|      lon|    lat|Rainf|     Evap| AvgSurfT|   Albedo|SoilT_10_40cm|      GVEG|  PotEvap| RootMoist|SoilM_100_200cm|Year|Month|
+---------+-------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+----+-----+
|-112.0625|25.0625|  0.0|   4.3807| 288.0707| 41.47715|    289.00714|0.19712792|139.13737|  243.2525|      108.76931|2000|    1|
|-111.9375|25.0625|  0.0|4.6673994|287.39276|41.509407|     288.8017|0.19860405|162.25638| 220.77466|       90.67495|2000|    1|
|-111.8125|25.0625|  0.0|5.8487973| 287.6554|41.505375|    289.55984|0.17118543|121.55404| 103.95005|      161.94794|2000|    1|
|-111.6875|25.0625|  0.0|6.4366016| 287.5386|41.501343|    289.61142|0.17118543|127.63407|106.032845|      163.44402|2000|    1|
|-111.5625|25.0625|  0.0|3.4506986| 287.2394|41.509407|     289.2371| 0.1429876|179.37668| 161.43

In [57]:
nasa_coords = spark.sql("""SELECT DISTINCT lat, lon FROM nasa""")
nasa_coords.collect()

[Row(lat=25.1875, lon=-104.8125),
 Row(lat=25.1875, lon=-101.8125),
 Row(lat=25.4375, lon=-99.8125),
 Row(lat=25.5625, lon=-105.6875),
 Row(lat=25.9375, lon=-103.3125),
 Row(lat=26.3125, lon=-106.0625),
 Row(lat=26.4375, lon=-101.6875),
 Row(lat=26.4375, lon=-80.9375),
 Row(lat=26.5625, lon=-106.4375),
 Row(lat=26.6875, lon=-98.1875),
 Row(lat=26.6875, lon=-82.0625),
 Row(lat=26.9375, lon=-81.4375),
 Row(lat=27.1875, lon=-101.3125),
 Row(lat=27.4375, lon=-81.6875),
 Row(lat=28.4375, lon=-104.1875),
 Row(lat=30.4375, lon=-115.1875),
 Row(lat=32.0625, lon=-112.5625),
 Row(lat=32.1875, lon=-81.3125),
 Row(lat=32.3125, lon=-115.8125),
 Row(lat=32.3125, lon=-102.9375),
 Row(lat=32.3125, lon=-102.8125),
 Row(lat=32.3125, lon=-84.0625),
 Row(lat=32.4375, lon=-112.6875),
 Row(lat=32.5625, lon=-108.5625),
 Row(lat=32.5625, lon=-88.4375),
 Row(lat=32.5625, lon=-80.5625),
 Row(lat=32.6875, lon=-81.6875),
 Row(lat=32.8125, lon=-105.3125),
 Row(lat=33.1875, lon=-103.6875),
 Row(lat=33.5625, lon=-83

### **2. Implementacja algorytmu kNN:**

In [78]:
# function searches for points that lie within a (euclidean) ball of size *radius* around the query points
# if optional argument *k* is given then function searches for at most k nearest points that lie within
# a (euclidean) ball of size *radius* around the query points

#point = (latitude, longitude)
#df - Spark DataFrame collected in a list of Rows (in order to collect your DataFrame run: df.collect() )
def kRadiusNN(df: List[Row], radius: float, point: Tuple[float, float], label_column_name: str, k: Optional[int]=None) -> pd.DataFrame :
  assert (25.0625<= point[0] <=52.9375) and (-124.9375<= point[1] <=-67.0625 ), 'Wrong coordinates (out of range)'

  neighbours_pd = pd.DataFrame({"lon":[], "lat":[], 'dist':[], label_column_name: []})

  for row in df:

     lon = row['lon']
     lat = row['lat']
     label = row[label_column_name]
     dist = geodesic((lat, lon), point).km
     if dist <= radius:
        new_row = {'lon': lon , 'lat': lat, 'dist' : dist, label_column_name : label}
        neighbours_pd.loc[len(neighbours_pd)] = new_row

  if k and (k < len(neighbours_pd)):

      neighbours_pd = neighbours_pd.sort_values('dist', ascending=True)
      if not (neighbours_pd.at[k-1,'dist']==neighbours_pd.at[k,'dist']):  #checking if there is no tie (more neighbours with the same distance)
         neighbours_pd = neighbours_pd.iloc[:k]
      else:
         raise Exception("Unable to determine k nearest neighbours: more neighbours with the same distance")

  if len(neighbours_pd) == 0:
         raise Exception("No neighbours found within the given radius")

  return(neighbours_pd)

In [80]:
# weighted: if True then function will weight points by the inverse of their distance (in this case, closer neighbours of
# a query point will have a greater influence than neighbors which are further away).


#point = (latitude, longitude)
#df - Spark DataFrame collected in a list of Rows (in order to collect your DataFrame run: df.collect(), before applying function to your DataFrame)
def predict_class(df: List[Row], point: Tuple[float, float], radius: float, label_column_name: str,
                  k: Optional[int]=None, weighted: Optional[bool]=False) -> int :
 if k:
    neighbours = kRadiusNN(df, radius, point, label_column_name, k)
 else:
    neighbours = kRadiusNN(df, radius, point, label_column_name)


 if weighted:   #weighted nearest neighbours
    neighbours['dist'] = neighbours['dist'].map(lambda x: 1/x)

    label0 = neighbours[neighbours[label_column_name]==0]
    label1 = neighbours[neighbours[label_column_name]==1]

    frequency0 = label0['dist'].sum()
    frequency1 = label1['dist'].sum()

    return 0 if frequency0 > frequency1 else 1


 else:

    if len(neighbours[label_column_name].mode()) > 1:    #this is only possible when the number of neighbours is an even number (since we perform binary classification)
       raise Exception("Unable to predict the label: we have a tie, there is no clear winner in the majority voting")
    else:
      predicted_label = neighbours[label_column_name].mode().iat[0]

    return predicted_label


Wygenerujemy w sposób sztuczny etykiety dla zbioru w celu przetestowania funkcji:

In [60]:
df = nasa_coords.withColumn('label', (F.col('lat').cast('int'))%2)

In [67]:
df = df.collect()

In [68]:
%%time
predict_class(df, (40.5, -95), 18, "label", k=8, weighted=True)

CPU times: user 19.1 s, sys: 58.2 ms, total: 19.2 s
Wall time: 19.4 s


0

In [69]:
%%time
predict_class(df, (29.5, -92), 20, "label", k=None, weighted=False)

CPU times: user 18.7 s, sys: 61.8 ms, total: 18.8 s
Wall time: 18.9 s


1

In [70]:
%%time
predict_class(df, (31.5, -112), 20, "label", k=8, weighted=True)

CPU times: user 19.6 s, sys: 73.1 ms, total: 19.7 s
Wall time: 19.9 s


1

In [71]:
%%time
predict_class(df, (40.5, -95), 18, "label", k=8, weighted=True)


CPU times: user 19.1 s, sys: 75.2 ms, total: 19.2 s
Wall time: 19.4 s


0

In [77]:
%%time
predict_class(df, (49.375, -80.125), 15, "label", k=8, weighted=True)

CPU times: user 19.2 s, sys: 51.7 ms, total: 19.2 s
Wall time: 19.5 s


1