In [None]:
!apt-get install openjdk-8-jdk-headless -qq > /dev/nullb
!wget -q http://archive.apache.org/dist/spark/spark-3.1.1/spark-3.1.1-bin-hadoop3.2.tgz
!tar xf spark-3.1.1-bin-hadoop3.2.tgz
!pip install -q findspark

In [None]:
import os

# Point Java/Spark lookups at the JDK and at the Spark distribution that the
# install cell unpacked.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-8-openjdk-amd64"
os.environ["SPARK_HOME"] = "/content/spark-3.1.1-bin-hadoop3.2"
import findspark

# NOTE(review): statement order matters here - the environment variables are
# set first, then findspark.init() runs, and only then is pyspark imported
# (presumably init() is what makes pyspark importable from SPARK_HOME; see the
# findspark documentation).
findspark.init()
from pyspark.sql import SparkSession

# Local-mode Spark session ("local[*]" master); reuses an existing session if
# one is already running.
spark = SparkSession.builder.master("local[*]").getOrCreate()
# Turn on eager evaluation so DataFrames are rendered when displayed in the notebook.
spark.conf.set("spark.sql.repl.eagerEval.enabled", True)

In [None]:
# Third-party packages installed at runtime; country_list and countryinfo are
# imported below. Versions are not pinned.
!pip install pdoc3
!pip install country_list
!pip install countryinfo

import random
import uuid
import string
import pyspark.sql.types as T
import pyspark.sql.functions as F
import countryinfo
import country_list
import hashlib
from  datetime import date
from  datetime import datetime
from  datetime import timedelta
import logging
from google.colab import drive

# Mount Google Drive; generated data is written under /content/gdrive.
drive.mount("/content/gdrive")

# Configure the root logger to write to a local file, then grab a handle to it.
# Each record carries: time - level - function name: line number - message.
logging.basicConfig(
    filename="mylog.log",
    format="%(asctime)s - %(levelname)s - %(funcName)s: %(lineno)d - %(message)s",
    datefmt="%H:%M:%S",
)
logger = logging.getLogger()

In [None]:
# English-language country entries used as the pool for random geo data.
# NOTE(review): entries are indexed as entry[1] for the country name further
# down, so they are presumably (code, name) pairs - confirm against country_list.
countries = list(country_list.countries_for_language("en"))

In [None]:
def generation_value(count: int = 10):
    """Generate a random string of decimal digits.

    Parameters:
        count: int
              Number of digits in the generated string.

    Returns:
        string:
              The generated digits joined into one string
              (leading zeros are possible).
    """
    # Draw `count` digits independently (with replacement), then glue them.
    picked_digits = random.choices(string.digits, k=count)
    return "".join(picked_digits)

In [None]:
# Print the raw docstring of generation_value to check how it reads.
print(generation_value.__doc__)

Генерация числа.

    Parameters:
        count: int
              Количество цифр в числе.

    Returns:
        string:
              Сгенерированное число.
    


In [None]:
# IPython's %pdoc magic, invoked here without the % prefix (relies on
# automagic being enabled); it prints the docstring of the given object.
pdoc generation_value

# Synthetic event data generation

In [None]:
def generate_rows_table(count: int = 1,
                        timestampcolumn: date = None):
    """Generate rows of synthetic event data.

    Parameters:
        count: int
              Number of rows to generate.
        timestampcolumn: date
              Value stored in the 'timestampcolumn' field of every row.
              When omitted (None), today's date is taken at call time.

    Returns:
        list_cookies: list
              A list of rows. Each row is a list with the items, in order:
              inn, raw_cookie (list of {"key", "value"} dicts), event_type,
              event_action, data_value, geocountry, city, user_os,
              systemlanguage, geoaltitude, meta_platform, screensize,
              timestampcolumn.
    """
    # Resolve the default per call. A `date.today()` default placed in the
    # signature is evaluated only once, when the function is defined, and
    # silently goes stale in a long-lived kernel.
    if timestampcolumn is None:
        timestampcolumn = date.today()

    list_cookies = []
    for _ in range(count):
        # 12-digit identifier; also the input of the data_value hash below.
        inn = generation_value(12)

        # Cookie set of the row. Every value is a plain string
        # (the _ym_cookie_c value used to be a one-element set because of a
        # `{...}` literal that was missing its f-string prefix).
        raw_cookie = [
            {
                "key": "_sa_cookie_a",
                "value": f"SA1.{uuid.uuid4()}.{generation_value(10)}"
            },
            {
                "key": "_fa_cookie_a",
                "value": f"ads2.{generation_value(10)}.{generation_value(10)}"
            },
            {
                "key": "_ym_cookie_c",
                "value": generation_value(20)
            },
            {
                "key": "_fbp",
                "value": f"fb.{random.choice(string.digits)}."
                         f"{generation_value(13)}."
                         f"{generation_value(9)}"
            },
            {
                "key": "org_uid",
                "value": generation_value(7)
            },
            {
                "key": "user_uid",
                "value": generation_value(7)
            },
            {
                # "79" followed by 9 random digits (11 digits in total).
                "key": "user_phone",
                "value": f"79{generation_value(9)}"
            },
            {
                # 10 random ASCII letters/digits as the mailbox name.
                "key": "user_mail",
                "value": "".join(random.choices(
                    string.ascii_letters + string.digits, k=10)) + "@user.io"
            },
        ]

        event_type = random.choice(["SUBMIT", "REGISTER", "SUBMIT_MD5"])
        event_action = random.choice(["pageview", "event", "login-check-otp"])

        # data_value is a hex digest of inn whose algorithm depends on the
        # event type; REGISTER events carry no data_value.
        if event_type == "SUBMIT":
            data_value = hashlib.sha256(inn.encode("utf-8")).hexdigest()
        elif event_type == "SUBMIT_MD5":
            data_value = hashlib.md5(inn.encode("utf-8")).hexdigest()
        else:
            data_value = None

        # Country, capital city and "lat,lng" string. A lookup that raises
        # KeyError is logged and a new country is drawn (presumably
        # countryinfo does not know every name in `countries`).
        while True:
            geocountry = random.choice(countries)
            try:
                country = countryinfo.CountryInfo(geocountry[1])
                city = country.capital()
                geoaltitude = ",".join(map(str, country.latlng()))
                break
            except KeyError as e:
                logger.error("В списке 'country' отсутствует значение %s", e)

        # NOTE(review): "MOBAIL" looks like a misspelling of "MOBILE"; it is
        # kept as is because already generated data may contain this value.
        meta_platform = random.choice(["WEB", "MOBAIL"])

        # The operating system is drawn from a platform-specific pool.
        if meta_platform == "WEB":
            user_os = random.choice(["Mac", "Windows", "Ubuntu"])
        else:
            user_os = random.choice(["IOS", "Android", "HarmonyOS", "BlackBerryOS"])

        systemlanguage = random.choice(["RU", "ENG"])

        # Screen size is a fixed value for every row.
        screensize = "1920x1080"

        # Assemble the row.
        list_cookies.append([
            inn,
            raw_cookie,
            event_type,
            event_action,
            data_value,
            geocountry[1],
            city,
            user_os,
            systemlanguage,
            geoaltitude,
            meta_platform,
            screensize,
            timestampcolumn,
        ])
    return list_cookies

In [None]:
# Data schema: every column is a nullable string, except raw_cookie
# (an array of string-to-string maps) and timestampcolumn (a date).
schema = T.StructType(
    [
        T.StructField("inn", T.StringType(), True),
        T.StructField(
            "raw_cookie",
            T.ArrayType(T.MapType(T.StringType(), T.StringType())),
        ),
    ]
    + [
        T.StructField(column_name, T.StringType(), True)
        for column_name in (
            "event_type",
            "event_action",
            "data_value",
            "geocountry",
            "city",
            "user_os",
            "systemlanguage",
            "geoaltitude",
            "meta_platform",
            "screensize",
        )
    ]
    + [T.StructField("timestampcolumn", T.DateType(), True)]
)

In [None]:
# Generate date-named directories of JSON files (one directory per day acts
# as a partition) for 10 consecutive days, starting from today.
current_date = date.today()
folder = "/content/gdrive/MyDrive/data/json/"
os.makedirs(folder, exist_ok=True)

for _day in range(10):
    day_dir = f"{folder}{current_date.strftime('%Y_%m_%d')}"
    # Between 5 and 9 batches of 30 rows per day; coalesce(1) so that each
    # batch is written from a single partition.
    for _ in range(random.randrange(5, 10)):
        df = spark.createDataFrame(generate_rows_table(30, current_date),
                                   schema=schema)
        df.coalesce(1).write.mode("append").json(day_dir)

    # Move on to the next day; after the loop current_date is 10 days ahead.
    current_date += timedelta(1)

In [None]:
# Destructive cleanup: recursively deletes the generated JSON directory on the
# mounted Drive.
! rm -r /content/gdrive/MyDrive/data/json/

In [None]:
# Write one batch of 2000 rows directly into the base folder (no per-day
# sub-directory). NOTE: relies on `folder` and on the value of `current_date`
# left behind by the generation-loop cell.
df = spark.createDataFrame(
    generate_rows_table(2000, current_date),
    schema=schema,
)
df.write.mode("append").json(folder)