In [1]:
!pip install pyspark

from google.colab import drive
from pyspark.sql.functions import col
from pyspark.sql.functions import size
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType
from pyspark.sql import Window
from geopy.distance import geodesic

import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=8cae38a1c84598562a763359c2a73fad6ef372eae163a6b93a8757bed9258472
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
# tworzenie sesji w Sparku
spark = SparkSession.builder.appName('SparkWindows').getOrCreate()

In [3]:
# wczytanie danych z google drive
drive.mount('/content/drive')

columns = ['lon', 'lat', 'Date', 'Rainf', 'Evap', 'AvgSurfT', 'Albedo','SoilT_10_40cm', 'GVEG', 'PotEvap', 'RootMoist', 'SoilM_100_200cm']

# Utworzenie schematu okreslajacego typ zmiennych
schema = StructType()
for i in columns:
  if i == "Date":
    schema = schema.add(i, IntegerType(), True)
  else:
    schema = schema.add(i, FloatType(), True)

nasa = spark.read.format('csv').option("header", True).schema(schema).load('/content/drive/MyDrive/BigMess/NASA/NASA.csv')
nasa.createOrReplaceTempView("nasa")
nasa.show(5)


Mounted at /content/drive
+---------+-------+------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+
|      lon|    lat|  Date|Rainf|     Evap| AvgSurfT|   Albedo|SoilT_10_40cm|      GVEG|  PotEvap| RootMoist|SoilM_100_200cm|
+---------+-------+------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+
|-112.0625|25.0625|200001|  0.0|   4.3807| 288.0707| 41.47715|    289.00714|0.19712792|139.13737|  243.2525|      108.76931|
|-111.9375|25.0625|200001|  0.0|4.6673994|287.39276|41.509407|     288.8017|0.19860405|162.25638| 220.77466|       90.67495|
|-111.8125|25.0625|200001|  0.0|5.8487973| 287.6554|41.505375|    289.55984|0.17118543|121.55404| 103.95005|      161.94794|
|-111.6875|25.0625|200001|  0.0|6.4366016| 287.5386|41.501343|    289.61142|0.17118543|127.63407|106.032845|      163.44402|
|-111.5625|25.0625|200001|  0.0|3.4506986| 287.2394|41.509407|     289.2371| 0.1429876|179.37668| 1

In [4]:
nasa_ym = nasa.filter('Date = 202307').drop('Date')
pdf = nasa_ym.toPandas()
print(f'Dataset size {len(pdf)}')
pdf.head()

Dataset size 76063


Unnamed: 0,lon,lat,Rainf,Evap,AvgSurfT,Albedo,SoilT_10_40cm,GVEG,PotEvap,RootMoist,SoilM_100_200cm
0,-112.0625,25.0625,0.4906,2.114799,298.253296,39.528221,294.885101,0.002595,278.5755,351.136108,185.874496
1,-111.9375,25.0625,0.4506,1.5034,299.812103,39.595428,296.497314,0.002595,368.664795,328.992401,171.594803
2,-111.8125,25.0625,0.4106,1.010101,302.517212,39.59409,298.711212,0.0,312.855194,146.726501,199.586807
3,-111.6875,25.0625,0.3749,0.9301,303.365295,39.595428,299.556885,0.0,328.704803,143.987701,198.737793
4,-111.5625,25.0625,0.4388,0.9732,302.8797,39.60215,299.345703,0.0,434.079712,189.947296,214.576797


In [28]:
import pandas as pd
import numpy as np
def create_lon_lat_pairs(df: pd.DataFrame, tolerance_lon: float = 5, tolerance_lat: float = 5, verbose: bool = False):
  projection = df[['lon', 'lat']]
  result = {}
  if verbose:
    count = 0
  for row in projection.itertuples(index=True):
    index, lon, lat = row.index, row.lon, row.lat
    if verbose:
      count += 1
      print(f'Processing item no {count}')
    result[(lon,lat)] = {}
    for lon_other, lat_other in result:
      if (abs(lon-lon_other) < tolerance_lon) and (abs(lat-lat_other) < tolerance_lat):
        result[(lon,lat)].add( (lon_other, lat_other) )
        result[(lon_other, lat_other)].add((lon,lat))
  def generator():
    for key, values in result.items():
      for value in values:
        lon, lat = key
        lon_other, lat_other = value
        yield lon, lat, lon_other, lat_other
  columns = ['lon', 'lat', 'lon_other', 'lat_other']
  return pd.DataFrame(generator(), columns=columns)

In [None]:
# u mnie sie liczy 25 minut
df_grid = create_lon_lat_pairs(pdf, tolerance_lon=1, tolerance_lat=1)
df_grid.head()


In [29]:
def haversine_distance(lon1, lat1, lon2, lat2):
    """
    Calculate the Haversine distance between two points specified by their longitude and latitude.

    Parameters:
    lon1, lat1: Longitude and latitude of the first point
    lon2, lat2: Longitude and latitude of the second point

    Returns:
    Haversine distance in kilometers
    """
    R = 6371  # Earth radius in kilometers

    # Convert degrees to radians
    lon1, lat1, lon2, lat2 = map(np.radians, [lon1, lat1, lon2, lat2])

    # Haversine formula
    dlon = lon2 - lon1
    dlat = lat2 - lat1
    a = np.sin(dlat / 2) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2) ** 2
    c = 2 * np.arcsin(np.sqrt(a))

    # Calculate distance
    distance = R * c
    return distance

In [36]:
df_grid['distance'] = haversine_distance(df_grid['lon'], df_grid['lat'], df_grid['lon_other'], df_grid['lat_other'])
df_grid.sample(5)

Unnamed: 0,lon,lat,lon_other,lat_other,distance
11393301,-118.6875,46.3125,-118.6875,46.6875,41.698097
8572494,-121.5625,42.3125,-122.0625,42.3125,41.113389
15789400,-122.3125,52.1875,-121.9375,52.3125,29.066876
946092,-106.9375,29.5625,-106.8125,29.5625,12.089918
14025907,-73.9375,49.8125,-74.3125,49.6875,30.316139


In [None]:
df_grid[['lon', 'lat']].duplicates()

In [37]:
max_distance = 50
df_grid = df_grid[df_grid['distance'] < max_distance]
df_grid

Unnamed: 0,lon,lat,lon_other,lat_other,distance
0,-112.0625,25.0625,-112.0625,25.0625,0.000000
1,-112.0625,25.0625,-112.0625,25.0625,0.000000
2,-112.0625,25.0625,-111.9375,25.0625,12.590688
3,-112.0625,25.0625,-111.8125,25.0625,25.181373
4,-112.0625,25.0625,-111.6875,25.0625,37.772053
...,...,...,...,...,...
16256269,-67.0625,52.9375,-67.4375,52.9375,25.130824
16256270,-67.0625,52.9375,-67.3125,52.9375,16.753894
16256271,-67.0625,52.9375,-67.1875,52.9375,8.376950
16256272,-67.0625,52.9375,-67.0625,52.9375,0.000000


In [38]:
from typing import Optional

def average_over_space_window(df: pd.DataFrame,max_distance: float, df_grid: Optional[pd.DataFrame] = None, tolerance_lon : float =1, tolerance_lat: float =1) -> pd.DataFrame:
  """
  some parameters works only id df_grid is left None
  df grid must also have proper column names
  """
  if df_grid is None:
    df_grid = create_lon_lat_pairs(pdf, tolerance_lon=tolerance_lon, tolerance_lat=tolerance_lat)
    df_grid['distance'] = haversine_distance(df_grid['lon'], df_grid['lat'], df_grid['lon_other'], df_grid['lat_other'])
  df_grid = df_grid[df_grid['distance'] < max_distance]
  df_grid = df_grid.drop(columns='distance')
  window_data = pd.merge(df_grid, df, left_on = ['lon_other', 'lat_other'], right_on = ['lon', 'lat'])
  window_data.rename(columns={'lon_x': 'lon', 'lat_x': 'lat'}, inplace=True)
  window_data.drop(['lon_other', 'lat_other', 'lon_y', 'lat_y'], axis=1, inplace=True)
  return window_data.groupby(['lon', 'lat']).mean().reset_index()


In [39]:
result = average_over_space_window(pdf,max_distance=50, df_grid=df_grid)
result

Unnamed: 0,lon,lat,Rainf,Evap,AvgSurfT,Albedo,SoilT_10_40cm,GVEG,PotEvap,RootMoist,SoilM_100_200cm
0,-124.9375,48.8125,41.702904,43.425835,287.279297,19.601549,283.619812,0.863302,227.082916,428.061249,217.845901
1,-124.9375,48.9375,36.894627,44.426498,287.743713,19.546310,283.923584,0.864130,234.306213,420.835632,214.968567
2,-124.9375,49.0625,33.121204,45.041183,288.108215,19.496737,284.119141,0.865302,239.109009,417.127777,213.608459
3,-124.9375,49.1875,29.460777,46.578056,288.614288,19.439306,284.461731,0.866423,245.374802,411.137024,210.904739
4,-124.9375,49.3125,26.842903,48.151543,289.179749,19.399244,284.856659,0.867838,249.063950,406.087128,208.769791
...,...,...,...,...,...,...,...,...,...,...,...
76058,-67.0625,52.4375,191.745453,76.225761,291.358429,19.991936,281.871979,0.577333,215.356079,490.567413,280.899506
76059,-67.0625,52.5625,191.707138,74.988876,291.217621,20.192894,281.600983,0.580245,212.437210,493.792389,283.788940
76060,-67.0625,52.6875,192.211365,74.382889,291.138123,20.322620,281.432983,0.581145,210.657394,501.197205,286.622314
76061,-67.0625,52.8125,192.062332,73.707466,291.054535,20.438654,281.283691,0.580928,209.468918,516.049744,288.654327


In [None]:
import pandas as pd
coordinates = pdf[['lon', 'lat']]
coordinates['geo'] = list(zip(pdf['lon'], pdf['lat']))
coordinates = coordinates.drop(columns=['lon','lat'])
def cross_join(df1: pd.DataFrame, df2: pd.DataFrame) -> pd.DataFrame:
  df1['cross_key'] = 1
  df2['cross_key'] = 1

  # Perform the cross join
  return pd.merge(df1, df2, on='cross_key').drop('cross_key', axis=1)

coordinates = cross_join(coordinates, coordinates)
coordinates

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  coordinates['geo'] = list(zip(pdf['lon'], pdf['lat']))


In [4]:
nasa_ym = spark.sql("""
          SELECT
          CAST(SUBSTRING(CAST(Date AS STRING), 1, 4) AS INT) AS Year,
          CAST(SUBSTRING(CAST(Date AS STRING), 5, 2) AS INT) AS Month,
          n.*
          FROM nasa n
          """)
nasa_ym = nasa_ym.drop("Date")

nasa_ym.createOrReplaceTempView("nasa_ym")

In [5]:
# wybieramy dane z lipca 2k23
SparkDataFrame_2023_7 = spark.sql("""
                        SELECT
                        *
                        FROM nasa_ym WHERE (Year == 2023) and (Month == 7)
                        order by lon, lat, Year, Month
                        """)
SparkDataFrame_2023_7.show(5)

+----+-----+---------+-------+--------+--------+--------+--------+-------------+---------+--------+---------+---------------+
|Year|Month|      lon|    lat|   Rainf|    Evap|AvgSurfT|  Albedo|SoilT_10_40cm|     GVEG| PotEvap|RootMoist|SoilM_100_200cm|
+----+-----+---------+-------+--------+--------+--------+--------+-------------+---------+--------+---------+---------------+
|2023|    7|-124.9375|48.8125|52.83326|35.82973|286.1314|19.78629|      282.349| 0.853646|179.8188| 453.1293|       229.7097|
|2023|    7|-124.9375|48.9375|38.92641| 46.2698|288.2968|19.52688|     284.2224| 0.853646|224.1511| 416.8515|       212.1873|
|2023|    7|-124.9375|49.0625|28.72708|43.29089|287.6732|19.38844|     283.7652|0.8625529|249.8481| 404.1081|       209.3529|
|2023|    7|-124.9375|49.1875| 22.0683| 45.7691|288.7706|19.38441|     284.7356|0.8625529|265.3578| 391.5402|       204.5058|
|2023|    7|-124.9375|49.3125| 19.8993|54.68368|291.3871|19.38038|     286.9337|0.8549436|263.7617| 371.7049|       19

In [12]:
pdf = SparkDataFrame_2023_7.toPandas()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [11]:
sdf = SparkDataFrame_2023_7.persist()
sdf.count()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [10]:
sdf = sdf.withColumn("cordinates", F.struct(F.col('lon'), F.col('lat')))
cord_sdf = sdf.select('cordinates')
cord_sdf.show()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
distance_matrix = cord_sdf.crossJoin(cord_sdf)
distance_matrix.show()

In [7]:
import pyspark.sql.functions as F
from pyspark.sql.types import DoubleType

def cal_lat_log_dist(df, lat1, long1, lat2, long2):
  # Ref - https://en.wikipedia.org/wiki/Great-circle_distance#Formulae
  # We are using haversine formaula to derive this Distance between two Co-ordinates
  # Parameters:
  # Base DF with Four columns where it has LAT and LONG
  # Corresponding column name in Dataframe -> Cororidnate1 LAT1 LONG1
  # Corresponding column name in Dataframe -> Cororidnate2 LAT2 LONG2

  df = df.withColumn('distance_in_kms' , \
      F.round((F.acos((F.sin(F.radians(F.col(lat1))) * F.sin(F.radians(F.col(lat2)))) + \
              ((F.cos(F.radians(F.col(lat1))) * F.cos(F.radians(F.col(lat2)))) * \
              (F.cos(F.radians(long1) - F.radians(long2))))
                  ) * F.lit(6371.0)), 4))
  return df

In [8]:
pdf = sdf.select('lon','lat').toPandas()
pdf.head()

Unnamed: 0,lon,lat
0,-124.9375,48.8125
1,-124.9375,48.9375
2,-124.9375,49.0625
3,-124.9375,49.1875
4,-124.9375,49.3125


In [None]:
import pandas as pd
merged = pd.merge(pdf, pdf, how='cross', on=None, suffixes=('_src', '_others'))
merged.head()

In [19]:
sdf2 = (
    sdf
    .select('lon','lat')
    .withColumnRenamed('lon', 'lon_src')
    .withColumnRenamed('lat', 'lat_src')
    .crossJoin(
      sdf
      .select('lon','lat')
      .withColumnRenamed('lon', 'lon_other')
      .withColumnRenamed('lat', 'lat_other')
    )
)
#sdf2 = cal_lat_log_dist(sdf2, 'lat_src', 'lon_src', 'lat_other', 'lon_other')

In [21]:
pdf = sdf2.toPandas()
pdf.head()

ERROR:root:KeyboardInterrupt while sending command.
Traceback (most recent call last):
  File "/usr/local/lib/python3.10/dist-packages/py4j/java_gateway.py", line 1038, in send_command
    response = connection.send_command(command)
  File "/usr/local/lib/python3.10/dist-packages/py4j/clientserver.py", line 511, in send_command
    answer = smart_decode(self.stream.readline()[:-1])
  File "/usr/lib/python3.10/socket.py", line 705, in readinto
    return self._sock.recv_into(b)
KeyboardInterrupt


KeyboardInterrupt: 

In [None]:
nasa.crossJoin(nasa.withColumnRenamed("lon", "lon2").withColumnRenamed("lat", "lat2")).withColumn('distance_km', haversine_udf('lon', 'lat', 'lon2', 'lat2'))


In [None]:
# tworze sztuczna tabele ktora posluzy mi do wstepnego sprawdzenia funkcji w wersji pandasowej
from pyspark.sql.types import StructType, StructField, FloatType

schema = StructType([
    StructField("lon", FloatType(), True),
    StructField("lat", FloatType(), True),
    StructField("GVEG", IntegerType(), True)
])

data = [(-112.0625, 25.0625, 45),
        (-104.4165, 32.4484, 20),
        (-112.0111, 25.0995, 30),
        (-104.3406, 32.5318, 40),
        (-111.9428, 25.0280, 50)]

dataF = spark.createDataFrame(data, schema=schema)

# Wyświetlenie zawartości tabeli
dataF.show()

+---------+-------+----+
|      lon|    lat|GVEG|
+---------+-------+----+
|-112.0625|25.0625|  45|
|-104.4165|32.4484|  20|
|-112.0111|25.0995|  30|
|-104.3406|32.5318|  40|
|-111.9428| 25.028|  50|
+---------+-------+----+



In [None]:
import geopandas as gpd
from shapely.geometry import Point
from pyspark.sql.functions import col, lit, when

# funkcja, która dzialaa tylko, że konwertuje tabele Spark na Pandas
def space_features(dataframe):
    # Przeksztalcenie DataFrame na GeoDataFrame
    geometry = [Point(lon, lat) for lon, lat in zip(dataframe.select("lon").rdd.flatMap(lambda x: x).collect(), dataframe.select("lat").rdd.flatMap(lambda x: x).collect())]
    gdf = gpd.GeoDataFrame(dataframe.toPandas(), geometry=geometry)

    # Funkcja do obliczania sredniej GVEG w odleglosci 50 km
    def average_gveg_within_radius(center_point, radius=50):
        circle = center_point.buffer(radius / 111.32)  # Przyblizona konwersja stopni na kilometr (1 stopien to okolo 111.32 km)
        points_within_circle = gdf[gdf.geometry.within(circle)]
        if len(points_within_circle) > 0:
            return points_within_circle["GVEG"].mean()
        else:
            return None

    # Dodanie kolumny avgGVEG
    df_with_avg_gveg = dataframe.withColumn("avgGVEG", lit(None).cast("float"))  # Inicjalizacja kolumny avgGVEG

    # Obliczenia dla kazdego wiersza
    for row in dataframe.collect():
        lon, lat, _ = row
        center_point = Point(lon, lat)
        avg_gveg = average_gveg_within_radius(center_point)
        df_with_avg_gveg = df_with_avg_gveg.withColumn("avgGVEG", when((col("lon") == lon) & (col("lat") == lat), avg_gveg).otherwise(col("avgGVEG")))
    return df_with_avg_gveg




In [None]:
wynik_sztucznej_tabeli = space_features(dataF)
wynik_sztucznej_tabeli.show(5)

+---------+-------+----+------------------+
|      lon|    lat|GVEG|           avgGVEG|
+---------+-------+----+------------------+
|-112.0625|25.0625|  45|41.666666666666664|
|-104.4165|32.4484|  20|              30.0|
|-112.0111|25.0995|  30|41.666666666666664|
|-104.3406|32.5318|  40|              30.0|
|-111.9428| 25.028|  50|41.666666666666664|
+---------+-------+----+------------------+



In [None]:
# teraz zrobimy test na pierwszy 500 wierszach naszej prawdziwej tabeli, ale dla przyspieszenia procesu ograniczymy sie tylko do trzech kolumn
selected_columns_2023_7 = SparkDataFrame_2023_7.select(col("lon"), col("lat"), col("GVEG"))
first_500 = selected_columns_2023_7.limit(500)

In [None]:
wynik_first_500 = space_features(first_500)
wynik_first_500.show(5)

+---------+-------+---------+------------------+
|      lon|    lat|     GVEG|           avgGVEG|
+---------+-------+---------+------------------+
|-124.9375|48.8125| 0.853646| 0.860625684261322|
|-124.9375|48.9375| 0.853646|0.8644187450408936|
|-124.9375|49.0625|0.8625529|0.8666259050369263|
|-124.9375|49.1875|0.8625529|0.8674166798591614|
|-124.9375|49.3125|0.8549436|0.8665063381195068|
+---------+-------+---------+------------------+
only showing top 5 rows



In [None]:
# tutaj to zadzialalo ale jezeli zastosujemy te funkcje do calego zbioru z lipca 2k23 to dostajemy bledy o przekroczeniu limitow.

In [None]:
# ponizej proby na oknach w Sparku

In [None]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# funkcja do liczenia odleglosci
haversine_udf = udf(lambda lon1, lat1, lon2, lat2: haversine_distance(lon1, lat1, lon2, lat2), DoubleType())

In [None]:
# tutaj mamy funkcje ktora teoretycznie moglaby robic to czego potrzebujemy, a pod nia po kolei test jej kolejnych krokow

def calculate_average_within_distance(km: int):
    # Filtruj wiersze, aby uniknąć kombinacji tych samych współrzędnych geograficznych + wybieramy ten sam moment w czasie
    filtered_nasa = nasa.crossJoin(nasa.withColumnRenamed("lon", "lon2").withColumnRenamed("lat", "lat2"))
    # obliczamy odleglosc dla kazdej pary wspolrzednych
    filtered_nasa = filtered_nasa.withColumn('distance_km', haversine_udf('lon', 'lat', 'lon2', 'lat2'))

    # wybieramy te pary dla ktorych odleglosc jest mniejsza od podanej w parametrze funkcji
    filtered_nasa = filtered_nasa.filter((col("distance") > 0) & (col("distance") <= km))

    # towrzymy okno
    window_spec = Window().partitionBy("lon", "lat").orderBy("distance").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

    # liczymy srednia z kolumny GVEG z punktow w danych na podanym obszarze
    result_nasa = filtered_nasa.withColumn("avgGVEG", F.avg("gveg").over(window_spec))

    return result_df

In [None]:
filtered_nasa = SparkDataFrame_2023_7.crossJoin(SparkDataFrame_2023_7.withColumnRenamed("lon", "lon2").withColumnRenamed("lat", "lat2"))

In [None]:
# Assuming filtered_nasa is your PySpark DataFrame
filtered_nasa = filtered_nasa.withColumn('distance', haversine_udf('lon', 'lat', 'lon2', 'lat2'))

In [None]:
# wybieramy te pary dla ktorych odleglosc jest mniejsza od podanej w parametrze funkcji
filtered_nasa = filtered_nasa.filter((col("distance") > 0) & (col("distance") <= 50))

In [None]:
# Tworzymy okno dla partycji zgrupowanych według 'lon' i 'lat' oraz sortujemy po 'distance'
window_spec = Window().partitionBy("lon", "lat").orderBy("distance").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

In [None]:
# Tworzymy kolumne 'avgGVEG' z wartosciami srednimi dla 'gveg' w obrebie 50 jednostek odleglosci
result_nasa = filtered_nasa.withColumn("avgGVEG", F.avg("gveg").over(window_spec))

# trzeba usunac kolumny ktore zostaly zduplikowane przy corssjoinie tylko nie wiem jak bo jesli robie .drop('zduplikowana_kolumna') to pisze ze nie moze bo nie wie ktora. odniesienie sie do numeracji ze np. usun kolumne 5
# powoduje ten sam blad

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `gveg` is ambiguous, could be: [`nasa_ym`.`gveg`, `nasa_ym`.`gveg`].