In [1]:
!pip install pyspark

from google.colab import drive
from pyspark.sql.functions import col
from pyspark.sql.functions import size
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType
from pyspark.sql import Window
from geopy.distance import geodesic

import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=e1ba275013f3d45ac4fe08380f54557b6240fd91cbb1dfc1fbd9b74edb062951
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
# tworzenie sesji w Sparku
spark = SparkSession.builder.appName('SparkWindows').getOrCreate()

In [3]:
# wczytanie danych z google drive
drive.mount('/content/drive')

columns = ['lon', 'lat', 'Date', 'Rainf', 'Evap', 'AvgSurfT', 'Albedo','SoilT_10_40cm', 'GVEG', 'PotEvap', 'RootMoist', 'SoilM_100_200cm']

# Utworzenie schematu okreslajacego typ zmiennych
schema = StructType()
for i in columns:
  if i == "Date":
    schema = schema.add(i, IntegerType(), True)
  else:
    schema = schema.add(i, FloatType(), True)

nasa = spark.read.format('csv').option("header", True).schema(schema).load('/content/drive/MyDrive/BigMess/NASA/NASA.csv')
nasa.createOrReplaceTempView("nasa")
nasa.show(5)


Mounted at /content/drive
+---------+-------+------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+
|      lon|    lat|  Date|Rainf|     Evap| AvgSurfT|   Albedo|SoilT_10_40cm|      GVEG|  PotEvap| RootMoist|SoilM_100_200cm|
+---------+-------+------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+
|-112.0625|25.0625|200001|  0.0|   4.3807| 288.0707| 41.47715|    289.00714|0.19712792|139.13737|  243.2525|      108.76931|
|-111.9375|25.0625|200001|  0.0|4.6673994|287.39276|41.509407|     288.8017|0.19860405|162.25638| 220.77466|       90.67495|
|-111.8125|25.0625|200001|  0.0|5.8487973| 287.6554|41.505375|    289.55984|0.17118543|121.55404| 103.95005|      161.94794|
|-111.6875|25.0625|200001|  0.0|6.4366016| 287.5386|41.501343|    289.61142|0.17118543|127.63407|106.032845|      163.44402|
|-111.5625|25.0625|200001|  0.0|3.4506986| 287.2394|41.509407|     289.2371| 0.1429876|179.37668| 1

In [4]:
nasa_ym = spark.sql("""
          SELECT
          CAST(SUBSTRING(CAST(Date AS STRING), 1, 4) AS INT) AS Year,
          CAST(SUBSTRING(CAST(Date AS STRING), 5, 2) AS INT) AS Month,
          n.*
          FROM nasa n
          """)
nasa_ym = nasa_ym.drop("Date")

nasa_ym.createOrReplaceTempView("nasa_ym")

In [5]:
# wybieramy dane z lipca 2k23
SparkDataFrame_2023_7 = spark.sql("""
                        SELECT
                        *
                        FROM nasa_ym WHERE (Year == 2023) and (Month == 7)
                        order by lon, lat, Year, Month
                        """)
SparkDataFrame_2023_7.show(5)

+----+-----+---------+-------+--------+--------+--------+--------+-------------+---------+--------+---------+---------------+
|Year|Month|      lon|    lat|   Rainf|    Evap|AvgSurfT|  Albedo|SoilT_10_40cm|     GVEG| PotEvap|RootMoist|SoilM_100_200cm|
+----+-----+---------+-------+--------+--------+--------+--------+-------------+---------+--------+---------+---------------+
|2023|    7|-124.9375|48.8125|52.83326|35.82973|286.1314|19.78629|      282.349| 0.853646|179.8188| 453.1293|       229.7097|
|2023|    7|-124.9375|48.9375|38.92641| 46.2698|288.2968|19.52688|     284.2224| 0.853646|224.1511| 416.8515|       212.1873|
|2023|    7|-124.9375|49.0625|28.72708|43.29089|287.6732|19.38844|     283.7652|0.8625529|249.8481| 404.1081|       209.3529|
|2023|    7|-124.9375|49.1875| 22.0683| 45.7691|288.7706|19.38441|     284.7356|0.8625529|265.3578| 391.5402|       204.5058|
|2023|    7|-124.9375|49.3125| 19.8993|54.68368|291.3871|19.38038|     286.9337|0.8549436|263.7617| 371.7049|       19

In [None]:
# tworze sztuczna tabele ktora posluzy mi do wstepnego sprawdzenia funkcji w wersji pandasowej
from pyspark.sql.types import StructType, StructField, FloatType

schema = StructType([
    StructField("lon", FloatType(), True),
    StructField("lat", FloatType(), True),
    StructField("GVEG", IntegerType(), True)
])

data = [(-112.0625, 25.0625, 45),
        (-104.4165, 32.4484, 20),
        (-112.0111, 25.0995, 30),
        (-104.3406, 32.5318, 40),
        (-111.9428, 25.0280, 50)]

dataF = spark.createDataFrame(data, schema=schema)

# Wyświetlenie zawartości tabeli
dataF.show()

+---------+-------+----+
|      lon|    lat|GVEG|
+---------+-------+----+
|-112.0625|25.0625|  45|
|-104.4165|32.4484|  20|
|-112.0111|25.0995|  30|
|-104.3406|32.5318|  40|
|-111.9428| 25.028|  50|
+---------+-------+----+



In [None]:
import geopandas as gpd
from shapely.geometry import Point
from pyspark.sql.functions import col, lit, when

# funkcja, która dzialaa tylko, że konwertuje tabele Spark na Pandas
def space_features(dataframe):
    # Przeksztalcenie DataFrame na GeoDataFrame
    geometry = [Point(lon, lat) for lon, lat in zip(dataframe.select("lon").rdd.flatMap(lambda x: x).collect(), dataframe.select("lat").rdd.flatMap(lambda x: x).collect())]
    gdf = gpd.GeoDataFrame(dataframe.toPandas(), geometry=geometry)

    # Funkcja do obliczania sredniej GVEG w odleglosci 50 km
    def average_gveg_within_radius(center_point, radius=50):
        circle = center_point.buffer(radius / 111.32)  # Przyblizona konwersja stopni na kilometr (1 stopien to okolo 111.32 km)
        points_within_circle = gdf[gdf.geometry.within(circle)]
        if len(points_within_circle) > 0:
            return points_within_circle["GVEG"].mean()
        else:
            return None

    # Dodanie kolumny avgGVEG
    df_with_avg_gveg = dataframe.withColumn("avgGVEG", lit(None).cast("float"))  # Inicjalizacja kolumny avgGVEG

    # Obliczenia dla kazdego wiersza
    for row in dataframe.collect():
        lon, lat, _ = row
        center_point = Point(lon, lat)
        avg_gveg = average_gveg_within_radius(center_point)
        df_with_avg_gveg = df_with_avg_gveg.withColumn("avgGVEG", when((col("lon") == lon) & (col("lat") == lat), avg_gveg).otherwise(col("avgGVEG")))
    return df_with_avg_gveg




In [None]:
wynik_sztucznej_tabeli = space_features(dataF)
wynik_sztucznej_tabeli.show(5)

+---------+-------+----+------------------+
|      lon|    lat|GVEG|           avgGVEG|
+---------+-------+----+------------------+
|-112.0625|25.0625|  45|41.666666666666664|
|-104.4165|32.4484|  20|              30.0|
|-112.0111|25.0995|  30|41.666666666666664|
|-104.3406|32.5318|  40|              30.0|
|-111.9428| 25.028|  50|41.666666666666664|
+---------+-------+----+------------------+



In [None]:
# teraz zrobimy test na pierwszy 500 wierszach naszej prawdziwej tabeli, ale dla przyspieszenia procesu ograniczymy sie tylko do trzech kolumn
selected_columns_2023_7 = SparkDataFrame_2023_7.select(col("lon"), col("lat"), col("GVEG"))
first_500 = selected_columns_2023_7.limit(500)

In [None]:
wynik_first_500 = space_features(first_500)
wynik_first_500.show(5)

+---------+-------+---------+------------------+
|      lon|    lat|     GVEG|           avgGVEG|
+---------+-------+---------+------------------+
|-124.9375|48.8125| 0.853646| 0.860625684261322|
|-124.9375|48.9375| 0.853646|0.8644187450408936|
|-124.9375|49.0625|0.8625529|0.8666259050369263|
|-124.9375|49.1875|0.8625529|0.8674166798591614|
|-124.9375|49.3125|0.8549436|0.8665063381195068|
+---------+-------+---------+------------------+
only showing top 5 rows



In [None]:
# tutaj to zadzialalo ale jezeli zastosujemy te funkcje do calego zbioru z lipca 2k23 to dostajemy bledy o przekroczeniu limitow.

In [None]:
# ponizej proby na oknach w Sparku

In [9]:
from pyspark.sql.functions import udf
from pyspark.sql.types import DoubleType

# funkcja do liczenia odleglosci
haversine_udf = udf(lambda lon1, lat1, lon2, lat2: haversine_distance(lon1, lat1, lon2, lat2), DoubleType())

In [11]:
# tutaj mamy funkcje ktora teoretycznie moglaby robic to czego potrzebujemy, a pod nia po kolei test jej kolejnych krokow

def calculate_average_within_distance(km: int):
    # Filtruj wiersze, aby uniknąć kombinacji tych samych współrzędnych geograficznych + wybieramy ten sam moment w czasie
    filtered_nasa = nasa.crossJoin(nasa.withColumnRenamed("lon", "lon2").withColumnRenamed("lat", "lat2"))
    # obliczamy odleglosc dla kazdej pary wspolrzednych
    filtered_nasa = filtered_nasa.withColumn('distance_km', haversine_udf('lon', 'lat', 'lon2', 'lat2'))

    # wybieramy te pary dla ktorych odleglosc jest mniejsza od podanej w parametrze funkcji
    filtered_nasa = filtered_nasa.filter((col("distance") > 0) & (col("distance") <= km))

    # towrzymy okno
    window_spec = Window().partitionBy("lon", "lat").orderBy("distance").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

    # liczymy srednia z kolumny GVEG z punktow w danych na podanym obszarze
    result_nasa = filtered_nasa.withColumn("avgGVEG", F.avg("gveg").over(window_spec))

    return result_df

In [7]:
filtered_nasa = SparkDataFrame_2023_7.crossJoin(SparkDataFrame_2023_7.withColumnRenamed("lon", "lon2").withColumnRenamed("lat", "lat2"))

In [14]:
# Assuming filtered_nasa is your PySpark DataFrame
filtered_nasa = filtered_nasa.withColumn('distance', haversine_udf('lon', 'lat', 'lon2', 'lat2'))

In [15]:
# wybieramy te pary dla ktorych odleglosc jest mniejsza od podanej w parametrze funkcji
filtered_nasa = filtered_nasa.filter((col("distance") > 0) & (col("distance") <= 50))

In [17]:
# Tworzymy okno dla partycji zgrupowanych według 'lon' i 'lat' oraz sortujemy po 'distance'
window_spec = Window().partitionBy("lon", "lat").orderBy("distance").rowsBetween(Window.unboundedPreceding, Window.unboundedFollowing)

In [20]:
# Tworzymy kolumne 'avgGVEG' z wartosciami srednimi dla 'gveg' w obrebie 50 jednostek odleglosci
result_nasa = filtered_nasa.withColumn("avgGVEG", F.avg("gveg").over(window_spec))

# trzeba usunac kolumny ktore zostaly zduplikowane przy corssjoinie tylko nie wiem jak bo jesli robie .drop('zduplikowana_kolumna') to pisze ze nie moze bo nie wie ktora. odniesienie sie do numeracji ze np. usun kolumne 5
# powoduje ten sam blad

AnalysisException: [AMBIGUOUS_REFERENCE] Reference `gveg` is ambiguous, could be: [`nasa_ym`.`gveg`, `nasa_ym`.`gveg`].