In [1]:
!pip install pyspark

from google.colab import drive
from pyspark.sql.functions import col
from pyspark.sql.functions import size
from pyspark.sql import SparkSession
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType
from pyspark.sql import Window

import pyspark
import pyspark.sql.functions as F
import pyspark.sql.types as T

Collecting pyspark
  Downloading pyspark-3.5.0.tar.gz (316.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m316.9/316.9 MB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.0-py2.py3-none-any.whl size=317425345 sha256=fedb60dbe8acb485f86e6b564e4030e0fc8d601a1d3213cfb6ea21a345da1432
  Stored in directory: /root/.cache/pip/wheels/41/4e/10/c2cf2467f71c678cfc8a6b9ac9241e5e44a01940da8fbb17fc
Successfully built pyspark
Installing collected packages: pyspark
Successfully installed pyspark-3.5.0


In [2]:
# Create the Spark session for this notebook (an already running session is reused)
spark = (
    SparkSession.builder
    .appName('SparkWindows')
    .getOrCreate()
)

In [3]:
# Mount Google Drive so the dataset stored there can be read
drive.mount('/content/drive')

columns = ['lon', 'lat', 'Date', 'Rainf', 'Evap', 'AvgSurfT', 'Albedo','SoilT_10_40cm', 'GVEG', 'PotEvap', 'RootMoist', 'SoilM_100_200cm']

# Build an explicit schema: "Date" is read as an integer, every other column as a float;
# all fields are nullable
schema = StructType()
for column_name in columns:
  column_type = IntegerType() if column_name == "Date" else FloatType()
  schema = schema.add(column_name, column_type, True)

# Read the CSV (first line is the header) using the schema above instead of inferring types
nasa = (
    spark.read
    .format('csv')
    .option("header", True)
    .schema(schema)
    .load('/content/drive/MyDrive/BigMess/NASA/NASA.csv')
)
nasa.show(5)

Mounted at /content/drive
+---------+-------+------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+
|      lon|    lat|  Date|Rainf|     Evap| AvgSurfT|   Albedo|SoilT_10_40cm|      GVEG|  PotEvap| RootMoist|SoilM_100_200cm|
+---------+-------+------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+
|-112.0625|25.0625|200001|  0.0|   4.3807| 288.0707| 41.47715|    289.00714|0.19712792|139.13737|  243.2525|      108.76931|
|-111.9375|25.0625|200001|  0.0|4.6673994|287.39276|41.509407|     288.8017|0.19860405|162.25638| 220.77466|       90.67495|
|-111.8125|25.0625|200001|  0.0|5.8487973| 287.6554|41.505375|    289.55984|0.17118543|121.55404| 103.95005|      161.94794|
|-111.6875|25.0625|200001|  0.0|6.4366016| 287.5386|41.501343|    289.61142|0.17118543|127.63407|106.032845|      163.44402|
|-111.5625|25.0625|200001|  0.0|3.4506986| 287.2394|41.509407|     289.2371| 0.1429876|179.37668| 1

In [4]:
# Split the integer Date column into Year (first 4 digits) and Month (next 2 digits).
# `nasa` is overwritten here and Date is dropped, so the guard keeps the cell idempotent:
# re-running it after Date is gone is a no-op instead of a failing SQL query.
# Note: the temp view "nasa" is registered from the frame that still contains Date.
if "Date" in nasa.columns:
  nasa.createOrReplaceTempView("nasa")
  nasa = spark.sql("""
          SELECT
          CAST(SUBSTRING(CAST(Date AS STRING), 1, 4) AS INT) AS Year,
          CAST(SUBSTRING(CAST(Date AS STRING), 5, 2) AS INT) AS Month,
          n.*
          FROM nasa n
          """).drop("Date")

nasa.show(5)

+----+-----+---------+-------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+
|Year|Month|      lon|    lat|Rainf|     Evap| AvgSurfT|   Albedo|SoilT_10_40cm|      GVEG|  PotEvap| RootMoist|SoilM_100_200cm|
+----+-----+---------+-------+-----+---------+---------+---------+-------------+----------+---------+----------+---------------+
|2000|    1|-112.0625|25.0625|  0.0|   4.3807| 288.0707| 41.47715|    289.00714|0.19712792|139.13737|  243.2525|      108.76931|
|2000|    1|-111.9375|25.0625|  0.0|4.6673994|287.39276|41.509407|     288.8017|0.19860405|162.25638| 220.77466|       90.67495|
|2000|    1|-111.8125|25.0625|  0.0|5.8487973| 287.6554|41.505375|    289.55984|0.17118543|121.55404| 103.95005|      161.94794|
|2000|    1|-111.6875|25.0625|  0.0|6.4366016| 287.5386|41.501343|    289.61142|0.17118543|127.63407|106.032845|      163.44402|
|2000|    1|-111.5625|25.0625|  0.0|3.4506986| 287.2394|41.509407|     289.2371| 0.1429876|179.37

In [5]:
# Time-based feature engineering with Spark window functions
def overYearStats(column: str, n: int, df: "pyspark.sql.DataFrame | None" = None) -> "pyspark.sql.DataFrame":
  """
    Add the rolling mean and approximate median of one variable, computed per (lon, lat) pair.

    For every row the window holds up to n preceding rows, the row itself and up to n following rows
    of the same coordinate pair, ordered by (Year, Month). The frame is row-based (rowsBetween), so it
    equals "n months back and n months ahead" only if the series has exactly one row per month
    with no gaps.

    :param column: name of the column to aggregate
    :param n: number of rows (months) before and after the current row to include; must be >= 0
    :param df: DataFrame to process; it must contain lon, lat, Year, Month and `column`.
               When omitted, the notebook-level `nasa` DataFrame is used.
    :return: the input DataFrame with two extra columns, "average_<column>" and "median_<column>"
    :raises ValueError: if n is negative
  """
  if n < 0:
    raise ValueError("n must be non-negative, got " + str(n))

  # Fall back to the global frame so existing calls overYearStats(column, n) keep working
  source = nasa if df is None else df

  windowSpec = Window.partitionBy("lon", "lat").orderBy("Year", "Month").rowsBetween(-n, n)

  nasa_window = (source.withColumn("average_" + column, F.avg(F.col(column)).over(windowSpec))
                # percentile_approx at 0.5 is the approximate median; passing a Column object avoids
                # splicing the column name into a SQL expression string
                .withColumn("median_" + column, F.percentile_approx(F.col(column), 0.5).over(windowSpec))
                )
  return nasa_window

In [6]:
# Rolling mean/median of GVEG over a window of 5 rows before and 5 rows after each row
result = overYearStats(column="GVEG", n=5)

In [7]:
# Preview the first rows, including the new average_GVEG and median_GVEG columns
result.show(n=5)

+----+-----+---------+-------+----------+---------+---------+---------+-------------+----------+---------+---------+---------------+------------------+-----------+
|Year|Month|      lon|    lat|     Rainf|     Evap| AvgSurfT|   Albedo|SoilT_10_40cm|      GVEG|  PotEvap|RootMoist|SoilM_100_200cm|      average_GVEG|median_GVEG|
+----+-----+---------+-------+----------+---------+---------+---------+-------------+----------+---------+---------+---------------+------------------+-----------+
|1979|    1|-124.5625|47.9375| 102.24292|10.942404|274.84088| 19.94993|    277.61047|0.51115257|37.040825| 632.6485|      323.77112| 0.662685622771581| 0.56353873|
|1979|    2|-124.5625|47.9375| 512.05865|30.025711|277.54193| 19.33631|     277.1021|0.52497137|48.730946| 684.1061|      347.15833|0.7008754355566842| 0.64292705|
|1979|    3|-124.5625|47.9375|  186.3494|30.520498| 279.1123|18.885754|     278.0162|0.56353873|109.74138| 645.7304|       332.0991|0.7268796861171722| 0.64292705|
|1979|    4|-124

In [20]:
# The rolling statistics use a row-based frame (rowsBetween), so we need to check that every
# coordinate pair has records for all 12 months of every year. First we add two helper columns,
# "AllYears" and "AllMonths", holding the distinct years and months seen for the given pair.
windowSpec = Window.partitionBy('lon', 'lat')

# collect_set gathers only distinct values (no intermediate list with duplicates to de-duplicate);
# sort_array makes the element order deterministic
test = (result.withColumn('AllYears', F.sort_array(F.collect_set(F.col('Year')).over(windowSpec)))
       .withColumn('AllMonths', F.sort_array(F.collect_set(F.col('Month')).over(windowSpec)))
       )

test.show()

+----+-----+---------+-------+----------+---------+---------+---------+-------------+----------+---------+---------+---------------+------------------+-----------+--------------------+--------------------+
|Year|Month|      lon|    lat|     Rainf|     Evap| AvgSurfT|   Albedo|SoilT_10_40cm|      GVEG|  PotEvap|RootMoist|SoilM_100_200cm|      average_GVEG|median_GVEG|            AllYears|           AllMonths|
+----+-----+---------+-------+----------+---------+---------+---------+-------------+----------+---------+---------+---------------+------------------+-----------+--------------------+--------------------+
|1979|    1|-124.5625|47.9375| 102.24292|10.942404|274.84088| 19.94993|    277.61047|0.51115257|37.040825| 632.6485|      323.77112| 0.662685622771581| 0.56353873|[1979, 1980, 1981...|[1, 2, 3, 4, 5, 6...|
|1979|    2|-124.5625|47.9375| 512.05865|30.025711|277.54193| 19.33631|     277.1021|0.52497137|48.730946| 684.1061|      347.15833|0.7008754355566842| 0.64292705|[1979, 1980, 

In [25]:
# Number of distinct years / months each coordinate pair is expected to have
EXPECTED_YEAR_COUNT = 45
EXPECTED_MONTH_COUNT = 12

# Flag each row: does its coordinate pair have all 45 distinct years / all 12 distinct months?
# NOTE(review): distinct-year and distinct-month counts alone do not prove that every
# (Year, Month) combination is present for a pair - consider also checking rows per pair.
years_complete = F.size(F.col("AllYears")) == EXPECTED_YEAR_COUNT
months_complete = F.size(F.col("AllMonths")) == EXPECTED_MONTH_COUNT
test_checked_years = test.withColumn("are_years_valid", years_complete)
test_checked_months = test.withColumn("are_months_valid", months_complete)

# Count the rows whose "AllYears" / "AllMonths" array does not have the expected size
invalid_years_count = test_checked_years.where(~F.col("are_years_valid")).count()
invalid_months_count = test_checked_months.where(~F.col("are_months_valid")).count()

In [29]:
# Check that all 45 years and all 12 months are present everywhere (both counts should be 0)
print(f"{invalid_years_count} and {invalid_months_count}")

0 and 0
