In [1]:
import findspark
findspark.init()
findspark.find()
import os
os.environ['HADOOP_CONF_DIR'] = '/etc/hadoop/conf'
os.environ['YARN_CONF_DIR'] = '/etc/hadoop/conf'

In [106]:
from datetime import date, timedelta, datetime
import pyspark.sql.functions as F
from pyspark.sql.window import Window 

In [3]:
from pyspark.sql import SparkSession
spark = SparkSession \
.builder \
.master("yarn")\
.config("spark.driver.cores", 1) \
.config("spark.driver.memory", "2g") \
.config("spark.executor.memory", "2g") \
.appName("Project7") \
.getOrCreate()                               

SLF4J: Class path contains multiple SLF4J bindings.
SLF4J: Found binding in [jar:file:/usr/lib/spark/jars/slf4j-log4j12-1.7.30.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: Found binding in [jar:file:/usr/lib/hadoop/lib/slf4j-log4j12-1.7.25.jar!/org/slf4j/impl/StaticLoggerBinder.class]
SLF4J: See http://www.slf4j.org/codes.html#multiple_bindings for an explanation.
SLF4J: Actual binding is of type [org.slf4j.impl.Log4jLoggerFactory]
2023-01-24 19:26:26,709 WARN util.Utils: Your hostname, fhmd0k22gk1qqq6a221q resolves to a loopback address: 127.0.1.1; using 172.16.0.37 instead (on interface eth0)
2023-01-24 19:26:26,710 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-01-24 19:26:28,940 WARN util.Utils: spark.executor.instances less than spark.dynamicAllocation.minExecutors is invalid, ignoring its setting, please update 

# Функции

In [104]:
# Выбирает все даты, попадающие в заданный диапазон.
# Даты передаются в формате "2022-01-01"
def select_dates(start_date: str, end_date: str) -> list:
    try:
        start_date = datetime.fromisoformat(start_date)
        end_date = datetime.fromisoformat(end_date)
    except:
        print('Неверный формат даты!') # ПИСАТЬ В ЛОГ!!!!!
    # Список дат в формате ['/user/master/data/events/date=2020-10-02', ...]
    paths_list = []
    # Количество дней между датами
    delta = end_date - start_date
    
    if delta.days < 0:
        print ('Указан слишком маленький диапазон!')  # ПИСАТЬ В ЛОГ!!!!!
    for i in range(delta.days + 1):
        paths_list.append('/user/master/data/geo/events/date=' + (start_date + timedelta(i)).__str__()[:10])
    return paths_list

calculate_dist = F.lit(2*6371) * F.asin(
        F.sqrt(
            F.pow(F.sin((F.radians(F.col("lat")) - F.radians(F.col("ltt"))) / 2), 2) +
            F.cos(F.radians(F.col("lat"))) * F.radians(F.cos(F.col("ltt"))) *
            F.pow(F.sin((F.radians(F.col("lon")) - F.radians(F.col("lng"))) / 2), 2)
        )
    )


In [5]:
select_dates('2022-01-01', '2022-01-03')

['/user/master/data/geo/events/date=2022-01-01',
 '/user/master/data/geo/events/date=2022-01-02',
 '/user/master/data/geo/events/date=2022-01-03']

## Загрузка данных

### Действия

In [5]:
#  Добавить проверку на количество записей в лог
try:
    activities = spark.read.parquet(*select_dates('2022-01-01', '2022-01-03'))
except:
    print('Нет данных за указанный диапазон!')

                                                                                

In [6]:
activities.show(3, truncate=False)

[Stage 1:>                                                          (0 + 1) / 1]

+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+------------+-------------------+------------------+
|event                                                                                                                                                                                                                                                                                                   |event_type  |lat                |lon               |
+--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+

                                                                                

In [7]:
activities.schema

StructType(List(StructField(event,StructType(List(StructField(admins,ArrayType(LongType,true),true),StructField(channel_id,LongType,true),StructField(datetime,StringType,true),StructField(media,StructType(List(StructField(media_type,StringType,true),StructField(src,StringType,true))),true),StructField(message,StringType,true),StructField(message_channel_to,LongType,true),StructField(message_from,LongType,true),StructField(message_group,LongType,true),StructField(message_id,LongType,true),StructField(message_to,LongType,true),StructField(message_ts,StringType,true),StructField(reaction_from,StringType,true),StructField(reaction_type,StringType,true),StructField(subscription_channel,LongType,true),StructField(subscription_user,StringType,true),StructField(tags,ArrayType(StringType,true),true),StructField(user,StringType,true))),true),StructField(event_type,StringType,true),StructField(lat,DoubleType,true),StructField(lon,DoubleType,true)))

### Города

In [8]:
cities = spark.read.csv('/user/flaviusoct/data/coord', sep=';', header=True).withColumnRenamed('lat', 'ltt')

In [9]:
cities.show(3)

+---+---------+--------+--------+
| id|     city|     ltt|     lng|
+---+---------+--------+--------+
|  1|   Sydney| -33,865|151,2094|
|  2|Melbourne|-37,8136|144,9631|
|  3| Brisbane|-27,4678|153,0281|
+---+---------+--------+--------+
only showing top 3 rows



## Обработка данных

In [89]:
# Удаляем строки с пропущенными значениями координат

activities_new = activities.where("lat IS NOT NULL AND lon IS NOT NULL")

In [96]:
# Делаем одну колонку со временем
# и добавляем колонку с уникальным id
activities_new = activities_new \
.withColumn('datetime', F.coalesce(F.col('event.datetime'), F.col('event.message_ts')).cast('timestamp')) \
# Есть ли смысл????
.orderBy(F.asc('datetime')) \
# В этом точно есть, ибо группировать ниже надо по какому-то общему идентификатору
# в сообщениях и иных действиях. По умолчанию его нет.
.withColumn('activity_id', F.monotonically_increasing_id())

In [97]:
activities_new.show(3, truncate=False)

+-------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------------+------------------+--------------------------+-----------+
|event                                                                                                                                      |event_type|lat                |lon               |datetime                  |activity_id|
+-------------------------------------------------------------------------------------------------------------------------------------------+----------+-------------------+------------------+--------------------------+-----------+
|[,,,, do you need to add something to samba to enable windows sharing,, 30040,, 385953, 149488, 2021-01-01 00:00:24.248710480,,,,,,]       |message   |-34.052886090129995|150.89616994465248|2021-01-01 00:00:24.24871 |0          |
|[,,,, May someone tell me how I can get the trash icon to use another icon?

In [99]:
cities_new = cities \
.withColumn("ltt", F.regexp_replace(F.col("ltt"), pattern=',', replacement='.').cast("double")) \
.withColumn("lng", F.regexp_replace(F.col("lng"), pattern=',', replacement='.').cast("double"))

## Операции

In [107]:
activities_new = activities_new.crossJoin(cities_new) \
        .withColumn('distance', calculate_dist) \
        .withColumn("distance_rank",
                    F.row_number().over(Window.partitionBy("activity_id").orderBy("distance"))) \
        .where("distance_rank == 1")

In [108]:
activities_new.show()

+--------------------+----------+-------------------+------------------+--------------------+------------+---+-----------+--------+--------+------------------+-------------+
|               event|event_type|                lat|               lon|            datetime| activity_id| id|       city|     ltt|     lng|          distance|distance_rank|
+--------------------+----------+-------------------+------------------+--------------------+------------+---+-----------+--------+--------+------------------+-------------+
|[,,,, Hello! I wo...|   message| -26.60923828282186|153.15786533201094|2021-01-01 00:05:...|          26|  3|   Brisbane|-27.4678|153.0281| 95.45593089880165|            1|
|[,,,, try to 'for...|   message| -31.77315708819572|116.21798444130923|2021-01-01 00:05:...|          29|  4|      Perth|-31.9522|115.8589|20.412973536009655|            1|
|[,,,,  you mean t...|   message| -32.32697702659191|151.97297941789648|2021-01-01 01:13:...|  8589934658| 23|   Maitland|-32.7167

In [6]:
!hdfs dfs -ls /user/master/data/geo/events/date=2022-01-01

Found 303 items
-r-xr-xr-x   1 ubuntu hadoop     258489 2022-12-06 12:50 /user/master/data/geo/events/date=2022-01-01/part-00006-82f846d5-74a4-44e0-8fa9-de643f825932.c000.snappy.parquet
-r-xr-xr-x   1 ubuntu hadoop      13069 2022-12-06 12:50 /user/master/data/geo/events/date=2022-01-01/part-00937-82f846d5-74a4-44e0-8fa9-de643f825932.c000.snappy.parquet
-r-xr-xr-x   1 ubuntu hadoop      13014 2022-12-06 12:50 /user/master/data/geo/events/date=2022-01-01/part-00939-82f846d5-74a4-44e0-8fa9-de643f825932.c000.snappy.parquet
-r-xr-xr-x   1 ubuntu hadoop      12868 2022-12-06 12:50 /user/master/data/geo/events/date=2022-01-01/part-00947-82f846d5-74a4-44e0-8fa9-de643f825932.c000.snappy.parquet
-r-xr-xr-x   1 ubuntu hadoop      12848 2022-12-06 12:50 /user/master/data/geo/events/date=2022-01-01/part-00949-82f846d5-74a4-44e0-8fa9-de643f825932.c000.snappy.parquet
-r-xr-xr-x   1 ubuntu hadoop      18737 2022-12-06 12:50 /user/master/data/geo/events/date=2022-01-01/part-00960-82f846d5-74a4-44e0-8f