In [1]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q dlcdn.apache.org/spark/spark-3.5.0/spark-3.5.0-bin-hadoop3.tgz
!tar xf spark-3.5.0-bin-hadoop3.tgz
!pip install -q findspark

In [2]:
import os
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.5.0-bin-hadoop3"

import findspark
findspark.init()

In [248]:
from typing import List, Tuple
import pandas as pd
import numpy as np
import random
from sklearn import preprocessing
from sklearn.mixture import GaussianMixture
from sklearn.metrics import accuracy_score, precision_score, jaccard_score

from pyspark.sql import SparkSession, DataFrame as SparkDataFrame, functions as F
from pyspark.sql.types import IntegerType, FloatType, StringType, StructType, StructField, Row

from google.colab import drive

In [4]:
spark = (
         SparkSession.builder
        .master("local")
        .appName("Colab")
        .config('spark.ui.port', '4050')
        .getOrCreate()
)
conf = spark.sparkContext._conf.setAll([('spark.executor.memory', '100g'), ('spark.driver.memory','64g')])
spark.conf.set("park.driver.maxResultSize", "80g")

spark.conf.set('spark.sql.execution.arrow.enabled', 'true')

In [5]:
drive.mount('/content/drive')

Mounted at /content/drive


In [6]:
columns = ['lon', 'lat', 'Date', 'Rainf', 'Evap', 'AvgSurfT', 'Albedo','SoilT_40_100cm', 'GVEG', 'PotEvap', 'RootMoist', 'SoilM_100_200cm']

# Utworzenie schematu określającego typ zmiennych
schema = StructType()
for i in columns:
  if i == "Date":
    schema = schema.add(i, IntegerType(), True)
  else:
    schema = schema.add(i, FloatType(), True)

In [7]:
%%time
# Wczytanie zbioru Nasa w sparku

nasa = spark.read.format('csv').option("header", True).schema(schema).load('/content/drive/MyDrive/BigMess/NASA/NASA.csv')

nasa = (
    nasa
    .withColumn('Year', (F.col('Date') / 100).cast('int'))
    .withColumn('Month', F.col('Date') % 100)
    .drop('Date')
)
nasa.show(5)

nasa.createOrReplaceTempView("nasa")

+---------+-------+-----+---------+---------+---------+--------------+----------+---------+----------+---------------+----+-----+
|      lon|    lat|Rainf|     Evap| AvgSurfT|   Albedo|SoilT_40_100cm|      GVEG|  PotEvap| RootMoist|SoilM_100_200cm|Year|Month|
+---------+-------+-----+---------+---------+---------+--------------+----------+---------+----------+---------------+----+-----+
|-112.0625|25.0625|  0.0|   4.3807| 288.0707| 41.47715|     289.00714|0.19712792|139.13737|  243.2525|      108.76931|2000|    1|
|-111.9375|25.0625|  0.0|4.6673994|287.39276|41.509407|      288.8017|0.19860405|162.25638| 220.77466|       90.67495|2000|    1|
|-111.8125|25.0625|  0.0|5.8487973| 287.6554|41.505375|     289.55984|0.17118543|121.55404| 103.95005|      161.94794|2000|    1|
|-111.6875|25.0625|  0.0|6.4366016| 287.5386|41.501343|     289.61142|0.17118543|127.63407|106.032845|      163.44402|2000|    1|
|-111.5625|25.0625|  0.0|3.4506986| 287.2394|41.509407|      289.2371| 0.1429876|179.37668

## **Implementacja Expectation-Maximization**

Na razie (do celów testowych) ograniczymy się do do jednego, wybranego roku (2020) i współrzędnych uprzednio zaanotowanych.

In [8]:
nasa2020 = spark.sql(''' SELECT * FROM nasa''').where(nasa.Year == 2020).drop('Year')
nasa2020.show(5)

+---------+-------+------+--------+--------+--------+--------------+---------+--------+---------+---------------+-----+
|      lon|    lat| Rainf|    Evap|AvgSurfT|  Albedo|SoilT_40_100cm|     GVEG| PotEvap|RootMoist|SoilM_100_200cm|Month|
+---------+-------+------+--------+--------+--------+--------------+---------+--------+---------+---------------+-----+
|-112.0625|25.0625|2.4016|19.38381|288.9403|41.47715|      290.1312|0.1971279|137.2037| 401.1353|       199.6243|    1|
|-111.9375|25.0625|2.0352|20.49631|288.4253|41.50941|      289.9881| 0.198604|152.8765| 380.2956|       183.9548|    1|
|-111.8125|25.0625|2.0704| 20.8319|288.9668|41.50537|      290.5693|0.1711854|110.4835| 201.3415|       207.5636|    1|
|-111.6875|25.0625|2.1024|21.48582|288.8314|41.50134|      290.4967|0.1711854|  114.73| 200.7157|       206.5126|    1|
|-111.5625|25.0625|3.4526|18.24449|288.4534|41.50941|       290.131|0.1429876|161.2883|  213.642|       239.1196|    1|
+---------+-------+------+--------+-----

In [9]:
NASA_sample_annotated = pd.read_csv('/content/drive/MyDrive/BigMess/NASA/NASA_an.csv',sep=';')
NASA_sample_annotated = spark.createDataFrame(NASA_sample_annotated).withColumnRenamed("lat", "lat_sam").withColumnRenamed("lon", "lon_sam")
NASA_sample_annotated.show(5)

+---------+-------+--------+----+
|  lon_sam|lat_sam|pustynia|step|
+---------+-------+--------+----+
| -98.4375|49.0625|       0|   0|
| -84.4375|51.9375|       0|   0|
|-100.5625|47.0625|       0|   0|
|-104.6875|48.8125|       0|   0|
| -95.1875|36.5625|       0|   0|
+---------+-------+--------+----+
only showing top 5 rows



In [10]:
 NASA_2020_an = nasa2020.join( NASA_sample_annotated,
                [nasa2020.lon==NASA_sample_annotated.lon_sam , nasa2020.lat==NASA_sample_annotated.lat_sam],
                "inner").drop('lat_sam').drop('lon_sam')

 NASA_2020_an.show(5)

+---------+-------+--------+--------+--------+--------+--------------+---------+--------+---------+---------------+-----+--------+----+
|      lon|    lat|   Rainf|    Evap|AvgSurfT|  Albedo|SoilT_40_100cm|     GVEG| PotEvap|RootMoist|SoilM_100_200cm|Month|pustynia|step|
+---------+-------+--------+--------+--------+--------+--------------+---------+--------+---------+---------------+-----+--------+----+
|-112.0625|25.4375|  1.5544| 19.3186|288.5265|41.51882|      289.5916|0.1682641|132.3863| 425.1631|       217.5735|    1|       1|   0|
|-109.0625|25.6875|  0.5468|33.27829|289.0874|30.31183|      290.3853|0.4438815|120.8952| 184.9112|       210.0811|    1|       1|   0|
|-102.1875|26.3125|  7.6463|11.87471|280.0134|41.64651|      281.8206|0.1470791|128.0688| 193.7183|       204.0133|    1|       1|   0|
| -98.6875|26.4375| 10.2024|10.95381|290.7727|30.52419|      291.5844|0.1398843|121.1354| 116.3283|       187.1595|    1|       1|   0|
|-102.9375|26.5625|5.663199| 12.9635|279.7965|41

In [13]:
NASA_2020_set = NASA_2020_an.toPandas()

In [14]:
NASA_2020_set.head(5)

Unnamed: 0,lon,lat,Rainf,Evap,AvgSurfT,Albedo,SoilT_40_100cm,GVEG,PotEvap,RootMoist,SoilM_100_200cm,Month,pustynia,step
0,-112.0625,25.4375,1.5544,19.3186,288.526489,41.518822,289.591614,0.168264,132.386307,425.163086,217.573502,1,1,0
1,-109.0625,25.6875,0.5468,33.27829,289.087402,30.311831,290.385315,0.443882,120.895203,184.911194,210.0811,1,1,0
2,-102.1875,26.3125,7.6463,11.87471,280.013397,41.646511,281.820587,0.147079,128.068802,193.718307,204.013306,1,1,0
3,-98.6875,26.4375,10.2024,10.95381,290.772705,30.524191,291.584412,0.139884,121.135399,116.3283,187.1595,1,1,0
4,-102.9375,26.5625,5.663199,12.9635,279.796509,41.64785,281.563293,0.129287,131.679596,382.547607,191.083099,1,0,1


In [271]:
NASA2020_means = (NASA_2020_set[['lon', 'lat', 'Evap','PotEvap','RootMoist','Rainf','SoilM_100_200cm']].groupby(
    by=['lon', 'lat']).mean()).reset_index()

#Roczna suma opadów:
NASA2020_sum = (NASA_2020_set[['lon', 'lat', 'Rainf']].groupby(by=['lon', 'lat']).sum()).rename(columns={"Rainf": "Annual Rainfall"}).reset_index()

#Kolumny ze srednimi miesiecznymi indeksu zielonej roslinnosci:
GVEG_columns = pd.DataFrame({'lon':[], 'lat':[], 'GVEG1':[], 'GVEG2':[], 'GVEG3':[], 'GVEG4':[], 'GVEG5':[], 'GVEG6':[], 'GVEG7':[],
                             'GVEG8':[], 'GVEG9':[], 'GVEG10': [], 'GVEG11':[], 'GVEG12':[] })

coordinates = NASA_2020_set[['lon', 'lat']]
coordinates = coordinates.drop_duplicates()
NASA2020_GVEG = NASA_2020_set[['lon', 'lat', 'GVEG', 'Month']]

for i in coordinates.index:
  data = NASA2020_GVEG[(NASA2020_GVEG['lon']==coordinates.at[i,'lon'])&(NASA2020_GVEG['lat']==coordinates.at[i, 'lat'])]
  data = data.sort_values(by=['Month'])
  GVEG = data['GVEG'].tolist()

  row = {'lon': coordinates.at[i, 'lon'], 'lat': coordinates.at[i, 'lat'], 'GVEG1': GVEG[0], 'GVEG2': GVEG[1], 'GVEG3': GVEG[2],
         'GVEG4': GVEG[3], 'GVEG5': GVEG[4], 'GVEG6': GVEG[5], 'GVEG7': GVEG[6], 'GVEG8': GVEG[7], 'GVEG9': GVEG[8], 'GVEG10': GVEG[9],
         'GVEG11': GVEG[10], 'GVEG12': GVEG[11]}

  GVEG_columns.loc[len(GVEG_columns)] = row

NASA2020 = NASA2020_means.merge(NASA2020_sum, how='inner', on=['lon', 'lat']).merge(GVEG_columns, how='inner', on=['lon', 'lat'])

#Kolumny ze srednimi miesiecznymi Root Moist:
#RootMoist_columns = pd.DataFrame({'lon':[], 'lat':[], 'RootMoist1':[], 'RootMoist2':[], 'RootMoist3':[], 'RootMoist4':[], 'RootMoist5':[], 'RootMoist6':[],
#                                'RootMoist7':[], 'RootMoist8':[], 'RootMoist9':[], 'RootMoist10': [], 'RootMoist11':[], 'RootMoist12':[] })
#NASA2020_RootMoist = NASA_2020_set[['lon', 'lat', 'RootMoist', 'Month']]
#for i in coordinates.index:
# data = NASA2020_RootMoist[(NASA2020_RootMoist['lon']==coordinates.at[i,'lon'])&(NASA2020_RootMoist['lat']==coordinates.at[i, 'lat'])]
# data = data.sort_values(by=['Month'])
# RootMoist = data['RootMoist'].tolist()

#  row = {'lon': coordinates.at[i, 'lon'], 'lat': coordinates.at[i, 'lat'], 'RootMoist1': RootMoist[0], 'RootMoist2': RootMoist[1], 'RootMoist3': RootMoist[2],
#         'RootMoist4': RootMoist[3], 'RootMoist5': RootMoist[4], 'RootMoist6': RootMoist[5], 'RootMoist7': RootMoist[6], 'RootMoist8': RootMoist[7], 'RootMoist9': RootMoist[8],
#         'RootMoist10': RootMoist[9], 'RootMoist11': RootMoist[10], 'RootMoist12': RootMoist[11]}

#  RootMoist_columns.loc[len(RootMoist_columns)] = row

#NASA2020 = NASA2020.merge(RootMoist_columns, how='inner', on=['lon', 'lat'])

#Mediany dla wybranych cech:
NASA2020_median = (NASA_2020_set[['lon', 'lat', 'PotEvap', 'Evap', 'SoilM_100_200cm', 'AvgSurfT', 'SoilT_40_100cm' ]]).rename(
    columns={'PotEvap': "PotEvap_Median", 'Evap': 'Evap_Median', 'AvgSurfT': 'AvgSurfTmedian', 'SoilT_40_100cm': "SoilT40_100_Median",
            'SoilM_100_200cm': 'SoilM_100_200cm_Median'}).groupby(by=['lon', 'lat']).median().reset_index()

NASA2020 = NASA2020.merge(NASA2020_median, how='inner', on=['lon', 'lat'])

#tabela z etykietami:
labels = NASA_2020_set[['lon', 'lat', 'pustynia', 'step']].drop_duplicates()

In [174]:
def add_column(feature: str, month: int, dataset: pd.DataFrame, data: pd.DataFrame) -> pd.DataFrame:
  coordinates = dataset[['lon', 'lat']]
  coordinates = coordinates.drop_duplicates()
  feature_df = dataset[['lon', 'lat', feature, 'Month']]
  feature_df_m = feature_df[feature_df['Month'] == month]
  feature_df_m = feature_df_m.drop(columns=['Month']).rename(columns = {feature: feature+str(month)})

  return data.merge(feature_df_m, how='inner', on=['lon', 'lat'])


In [175]:
NASA2020 = add_column('PotEvap', 5, NASA_2020_set, NASA2020)
NASA2020 = add_column('PotEvap', 6, NASA_2020_set, NASA2020)
NASA2020 = add_column('PotEvap', 7, NASA_2020_set, NASA2020)
NASA2020 = add_column('PotEvap', 8, NASA_2020_set, NASA2020)
NASA2020 = add_column('PotEvap', 9, NASA_2020_set, NASA2020)

In [176]:
NASA2020 = add_column('Evap', 5, NASA_2020_set, NASA2020)
NASA2020 = add_column('Evap', 6, NASA_2020_set, NASA2020)
NASA2020 = add_column('Evap', 7, NASA_2020_set, NASA2020)
NASA2020 = add_column('Evap', 8, NASA_2020_set, NASA2020)
NASA2020 = add_column('Evap', 9, NASA_2020_set, NASA2020)

In [177]:
NASA2020 = add_column('Rainf', 5, NASA_2020_set, NASA2020)
NASA2020 = add_column('Rainf', 6, NASA_2020_set, NASA2020)
NASA2020 = add_column('Rainf', 7, NASA_2020_set, NASA2020)
NASA2020 = add_column('Rainf', 8, NASA_2020_set, NASA2020)
NASA2020 = add_column('Rainf', 9, NASA_2020_set, NASA2020)

In [178]:
NASA2020 = add_column('Albedo', 5, NASA_2020_set, NASA2020)
NASA2020 = add_column('Albedo', 6, NASA_2020_set, NASA2020)
NASA2020 = add_column('Albedo', 7, NASA_2020_set, NASA2020)
NASA2020 = add_column('Albedo', 8, NASA_2020_set, NASA2020)
NASA2020 = add_column('Albedo', 9, NASA_2020_set, NASA2020)

In [179]:
NASA2020 = add_column('RootMoist', 4, NASA_2020_set, NASA2020)
NASA2020 = add_column('RootMoist', 5, NASA_2020_set, NASA2020)
NASA2020 = add_column('RootMoist', 6, NASA_2020_set, NASA2020)
NASA2020 = add_column('RootMoist', 7, NASA_2020_set, NASA2020)
NASA2020 = add_column('RootMoist', 8, NASA_2020_set, NASA2020)
NASA2020 = add_column('RootMoist', 9, NASA_2020_set, NASA2020)

In [263]:
NASA2020.head(5)


Unnamed: 0,lon,lat,Evap,PotEvap,RootMoist,Rainf,SoilM_100_200cm,Annual Rainfall,GVEG1,GVEG2,...,Albedo6,Albedo7,Albedo8,Albedo9,RootMoist4,RootMoist5,RootMoist6,RootMoist7,RootMoist8,RootMoist9
0,-124.1875,48.9375,26.427094,120.993134,483.842133,73.564392,243.687424,882.772705,0.466769,0.526758,...,18.16806,19.440861,19.946239,19.47361,472.967712,465.355103,452.168793,441.243286,437.322296,434.976593
1,-124.1875,49.1875,25.858953,106.414398,487.425201,105.696472,247.507126,1268.357666,0.449689,0.52327,...,18.106951,19.41398,19.911289,19.45278,480.244598,482.193604,462.198212,444.509186,415.420013,414.631805
2,-123.3125,43.3125,48.146252,201.660339,527.861145,58.874897,252.254684,706.498779,0.581185,0.620959,...,18.23472,19.55645,19.88172,19.272221,640.795776,615.768982,596.975586,516.66217,433.145691,385.575806
3,-123.1875,38.9375,40.840504,343.695435,351.876434,36.06641,173.429977,432.796906,0.503812,0.59527,...,26.955549,27.077959,27.20027,27.243059,475.458191,434.617889,373.700012,298.016602,243.927704,210.176697
4,-123.1875,44.6875,39.657574,125.759712,275.439362,66.961472,317.462158,803.537659,0.404609,0.49909,...,18.21944,19.52957,19.877689,19.30278,325.066895,317.440308,295.398102,235.927597,181.294601,163.480103


In [181]:
NASA2020features = NASA2020.drop(columns=['lon', 'lat'])

#standaryzujemy dane:
scaler = preprocessing.StandardScaler()
scaler.fit(NASA2020features)
standarized = scaler.transform(NASA2020features)

In [182]:
st_NASA2020 = pd.DataFrame(standarized, columns=NASA2020features.columns)

In [266]:
gm = GaussianMixture(n_components = 2, n_init = 200, max_iter=100, init_params= 'random_from_data', covariance_type='spherical')
gm_result = gm.fit_predict(st_NASA2020)

In [272]:
labels_ = labels['pustynia'].tolist()

In [273]:
accuracy_score(gm_result, labels_)

0.63

In [252]:
precision_score(gm_result, labels_)

0.5362318840579711

In [253]:
jaccard_score(gm_result, labels_)

0.16666666666666666

In [255]:
stNASA2020_ver2 = st_NASA2020.drop(columns = ['GVEG1', 'GVEG2', 'GVEG3', 'GVEG11', 'GVEG12'])

In [256]:
gm = GaussianMixture(n_components = 2, n_init = 200, max_iter=100, init_params= 'random_from_data', covariance_type='spherical')
gm_result = gm.fit_predict(stNASA2020_ver2)

print(accuracy_score(gm_result, labels_))
print(precision_score(gm_result, labels_))
print(jaccard_score(gm_result, labels_))


0.622
0.5507246376811594
0.16740088105726872


### **Dane z jednego miesiaca:**

A teraz sprawdzimy, jakie beda wyniki grupowania na danych z jednego miesiaca (spodziewam sie, ze jakosc podzialu bedzie zalezala od wyboru miesiaca, dlatego przetestujemy algorytm na wszystkich miesiacach po kolei)

In [40]:
NASA_2020_set

Unnamed: 0,lon,lat,Rainf,Evap,AvgSurfT,Albedo,SoilT_40_100cm,GVEG,PotEvap,RootMoist,SoilM_100_200cm,Month,pustynia,step
0,-112.0625,25.4375,1.554400,19.318600,288.526489,41.518822,289.591614,0.168264,132.386307,425.163086,217.573502,1,1,0
1,-109.0625,25.6875,0.546800,33.278290,289.087402,30.311831,290.385315,0.443882,120.895203,184.911194,210.081100,1,1,0
2,-102.1875,26.3125,7.646300,11.874710,280.013397,41.646511,281.820587,0.147079,128.068802,193.718307,204.013306,1,1,0
3,-98.6875,26.4375,10.202400,10.953810,290.772705,30.524191,291.584412,0.139884,121.135399,116.328300,187.159500,1,1,0
4,-102.9375,26.5625,5.663199,12.963500,279.796509,41.647850,281.563293,0.129287,131.679596,382.547607,191.083099,1,0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5995,-74.1875,52.6875,0.000000,5.138396,255.607498,68.297043,273.169586,0.000000,5.428224,433.204498,196.483597,12,0,0
5996,-72.6875,52.6875,0.504600,5.461201,255.323196,67.434143,273.136993,0.000000,5.769086,499.828094,238.111801,12,0,0
5997,-99.5625,52.9375,0.000000,2.939899,255.941101,59.775539,271.222809,0.055678,3.315863,521.949524,258.321899,12,0,0
5998,-78.1875,52.9375,0.000000,6.205701,256.984192,64.500000,273.234314,0.000000,6.558333,569.775818,288.056488,12,0,0


In [261]:
def gmm_monthly_data(month: int) -> Tuple[float, float, float, List[int]]:

  NASA_M_2020 = NASA_2020_set[NASA_2020_set['Month']==month]
  NASA_month = NASA_M_2020.drop(columns=['lon', 'lat', 'pustynia', 'step', 'Month', 'SoilT_40_100cm', 'SoilM_100_200cm'])
  scaler = preprocessing.StandardScaler()
  scaler.fit(NASA_month)
  standarized = scaler.transform(NASA_month)
  st_NASA_month = pd.DataFrame(standarized, columns=NASA_month.columns)
  gm = GaussianMixture(n_components = 2, n_init = 100, max_iter=100, init_params= 'random_from_data', covariance_type='spherical')
  gm_result = gm.fit_predict(st_NASA_month)
  label = NASA_M_2020['pustynia'].tolist()
  acc = accuracy_score(gm_result, label)

  if acc <0.5:
    acc = 1 - acc
    gm_result = gm_result.tolist()
    for i in range(len(gm_result)):
      gm_result[i] = 1 - gm_result[i]

  pre = precision_score(gm_result, label)
  jac = jaccard_score(gm_result, label)
  return (acc, pre, jac, gm_result)

In [262]:
for i in range(12):
  acc, pre, jac, labels = gmm_monthly_data(i+1)
  print("Accuracy dla danych z miesiąca ", i+1, " wynosi ", acc, " , Precision: ", pre, ", Jaccard score: ", jac )


Accuracy dla danych z miesiąca  1  wynosi  0.52  , Precision:  0.7391304347826086 , Jaccard score:  0.17525773195876287
Accuracy dla danych z miesiąca  2  wynosi  0.52  , Precision:  0.8115942028985508 , Jaccard score:  0.1891891891891892
Accuracy dla danych z miesiąca  3  wynosi  0.518  , Precision:  0.14492753623188406 , Jaccard score:  0.0398406374501992
Accuracy dla danych z miesiąca  4  wynosi  0.662  , Precision:  0.0 , Jaccard score:  0.0
Accuracy dla danych z miesiąca  5  wynosi  0.888  , Precision:  0.7391304347826086 , Jaccard score:  0.4766355140186916
Accuracy dla danych z miesiąca  6  wynosi  0.868  , Precision:  0.8985507246376812 , Jaccard score:  0.484375
Accuracy dla danych z miesiąca  7  wynosi  0.854  , Precision:  0.9855072463768116 , Jaccard score:  0.48226950354609927
Accuracy dla danych z miesiąca  8  wynosi  0.762  , Precision:  1.0 , Jaccard score:  0.3670212765957447
Accuracy dla danych z miesiąca  9  wynosi  0.72  , Precision:  1.0 , Jaccard score:  0.3301435

Przetestujemy jeszcze taki tam głupi pomysł: majority voting dla wyników klastrowania dla wybranych miesięcy (glownie miesiecy, w ktorych ma miejsce wegetacja na polkuli polnocnej ):

In [264]:
labels1 = np.zeros(500)
months = [5,6,7,8,9,11]
for i in range(6):
  acc, pre, jac, labels2 = gmm_monthly_data(months[i])
  for j in range(len(labels1)):
     labels1[j] = labels1[j] + labels2[j]

for j in range(len(labels1)):
  if labels1[j]>=4:
     labels1[j] = 1
  else:
     labels1[j] = 0

accuracy_score(labels1, labels_)

0.878

In [247]:
precision_score(labels1, labels_)

0.8985507246376812

In [249]:
jaccard_score(labels1, labels_)

0.5040650406504065