In [45]:
import pathlib
import os
import zipfile

import pandas as pd
from pyspark import SparkContext, SparkConf, SparkFiles
from pyspark.sql import SparkSession, Window
from pyspark.sql.functions import desc, lead, row_number, max, min, upper, year
from pyspark.sql.types import StructField, StructType, ByteType, DateType, FloatType, IntegerType, ShortType, StringType
import tqdm

In [6]:
pd.set_option("display.max_columns", None)

# Spark Configuration

In [2]:
conf = SparkConf()
conf.setMaster("local[*]").setAppName("Dataset Maker")

<pyspark.conf.SparkConf at 0x7fca336b99c0>

In [3]:
sc = SparkContext(conf=conf)

23/09/11 16:19:18 WARN Utils: Your hostname, asus-notebook resolves to a loopback address: 127.0.1.1; using 192.168.1.186 instead (on interface wlp3s0)
23/09/11 16:19:18 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
23/09/11 16:19:19 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [4]:
sc.uiWebUrl

'http://192.168.1.186:4040'

In [5]:
session = SparkSession(sc)

# Dataset Based on Small&Meduim Business Data

In [7]:
rsmp_schema = StructType([
    StructField("kind", ByteType(), False),
    StructField("category", ByteType(), False),
    StructField("reestr_date", DateType(), False),
    StructField("date", DateType(), False),
    StructField("ind_tin", StringType(), True),
    StructField("ind_number", StringType(), True),
    StructField("first_name", StringType(), True),
    StructField("last_name", StringType(), True),
    StructField("patronymic", StringType(), True),
    StructField("org_name", StringType(), True),
    StructField("org_short_name", StringType(), True),
    StructField("org_tin", StringType(), True),
    StructField("org_number", StringType(), True),
    StructField("region_code", ByteType(), True),
    StructField("region_name", StringType(), True),
    StructField("region_type", StringType(), True),
    StructField("district_name", StringType(), True),
    StructField("district_type", StringType(), True),
    StructField("city_name", StringType(), True),    
    StructField("city_type", StringType(), True),
    StructField("settlement_name", StringType(), True),
    StructField("settlement_type", StringType(), True),
    StructField("activity_code_main", StringType(), False),
    StructField("activity_codes_additional", StringType(), True),
    StructField("total", ShortType(), True), 
    StructField("file_id", StringType(), True), 
])
rsmp_path = pathlib.Path("rsmp/csv")
rsmp_csv_files = [str(fn) for fn in rsmp_path.glob("*.csv")]

In [38]:
rsmp = session.read.options(header=True, dateFormat="dd.MM.yyyy", escape='"').schema(rsmp_schema).csv(rsmp_csv_files)
rsmp.printSchema()



root
 |-- kind: byte (nullable = true)
 |-- category: byte (nullable = true)
 |-- reestr_date: date (nullable = true)
 |-- date: date (nullable = true)
 |-- ind_tin: string (nullable = true)
 |-- ind_number: string (nullable = true)
 |-- first_name: string (nullable = true)
 |-- last_name: string (nullable = true)
 |-- patronymic: string (nullable = true)
 |-- org_name: string (nullable = true)
 |-- org_short_name: string (nullable = true)
 |-- org_tin: string (nullable = true)
 |-- org_number: string (nullable = true)
 |-- region_code: byte (nullable = true)
 |-- region_name: string (nullable = true)
 |-- region_type: string (nullable = true)
 |-- district_name: string (nullable = true)
 |-- district_type: string (nullable = true)
 |-- city_name: string (nullable = true)
 |-- city_type: string (nullable = true)
 |-- settlement_name: string (nullable = true)
 |-- settlement_type: string (nullable = true)
 |-- activity_code_main: string (nullable = true)
 |-- activity_codes_additional: 

                                                                                

In [84]:
reestr_org_tins = (
    rsmp
    .filter("kind = 1")
    .dropDuplicates(["org_tin"])
    .select("org_tin")
).cache()

In [55]:
reestr_org_tins.show(5)



+----------+
|   org_tin|
+----------+
|7704331019|
|7604259432|
|2130011462|
|7325150259|
|9104006275|
+----------+
only showing top 5 rows



                                                                                

In [85]:
reestr_org_tins.count()

                                                                                

72591

In [33]:
with open("rsmp/csv/data-01102019-structure-08012016.csv") as f:
    print(f.read(1000))

kind,category,reestr_date,date,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,activity_codes_additional,total,file_id
1,1,10.02.2017,10.01.2019,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""Г.Е.Н.И.Я.""","ООО ""Г.Е.Н.И.Я.""",8602274490,,86,ХАНТЫ-МАНСИЙСКИЙ АВТОНОМНЫЙ ОКРУГ - ЮГРА,АВТОНОМНЫЙ ОКРУГ,,,СУРГУТ,ГОРОД,,,69.10,"69.20.3,",900,VO_RRMSPSV_0000_9965_20190110_00019b6b-66db-47a3-9dc1-531f91eed871
1,1,10.10.2016,10.01.2019,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЮРИДИЧЕСКИЙ КОНСАЛТИНГ""","ООО ""ЮРКОНСАЛТИНГ""",8602272084,,86,ХАНТЫ-МАНСИЙСКИЙ АВТОНОМНЫЙ ОКРУГ - ЮГРА,АВТОНОМНЫЙ ОКРУГ,,,СУРГУТ,ГОРОД,,,69.10,"73.20, 73.11, 68.10, 81.21, 68.31, 69.20, 78.10, 68.20,",900,VO_RRMSPSV_0000_9965_20190110_00019b6b-66db-47a3-9dc1-531f91eed871
1,1,01.08.2016,10.01.2019,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТ

In [43]:
rsmp.filter("org_tin = 0276115295").orderBy("date").toPandas()

                                                                                

Unnamed: 0,kind,category,reestr_date,date,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,activity_codes_additional,total,file_id
0,1,1,2016-08-01,2016-12-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.10,"68.31.3, 68.31.4, 68.31.1, 68.31.2, 69.20,",901,VO_RRMSPSV_0000_9965_20161210_eafd6d55-a976-4f...
1,1,1,2016-08-01,2017-01-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.10,"68.31.1, 68.31.4, 68.31.2, 69.20, 68.31.3,",901,VO_RRMSPSV_0000_9965_20170110_7caa5f9d-3818-41...
2,1,1,2016-08-01,2017-02-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.10,"68.31.2, 69.20, 68.31.3, 68.31.4, 68.31.1,",900,VO_RRMSPSV_0000_9965_20170210_e90d6814-9c94-46...
3,1,1,2016-08-01,2017-03-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.10,"69.20, 68.31.3, 68.31.1, 68.31.2, 68.31.4,",901,VO_RRMSPSV_0000_9965_20170310_29af6a36-a956-47...
4,1,1,2016-08-01,2017-04-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.10,"68.31.2, 68.31.4, 69.20, 68.31.1, 68.31.3,",900,VO_RRMSPSV_0000_9965_20170410_e2be2ec0-0825-42...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
75,1,1,2016-08-01,2023-03-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,1080276003026,2,БАШКОРТОСТАН,РЕСП.,,,Уфа,Г.,,,69.10,"68.31.1, 68.31.2, 68.31.3, 68.31.4, 69.20,",900,VO_RRMSPSV_0000_9965_20230310_0c0ddf02-6d4c-4c...
76,1,1,2016-08-01,2023-04-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,1080276003026,2,БАШКОРТОСТАН,РЕСП.,,,Уфа,Г.,,,69.10,"68.31.1, 68.31.2, 68.31.3, 68.31.4, 69.20,",900,VO_RRMSPSV_0000_9965_20230411_1955b9df-34c0-4d...
77,1,1,2016-08-01,2023-05-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,1080276003026,2,БАШКОРТОСТАН,РЕСП.,,,Уфа,Г.,,,69.10,"68.31.4, 68.31.2, 69.20, 68.31.3, 68.31.1,",901,VO_RRMSPSV_0000_9965_20230511_131d450c-2c9b-4e...
78,1,1,2016-08-01,2023-06-10,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",0276115295,1080276003026,2,БАШКОРТОСТАН,РЕСП.,,,Уфа,Г.,,,69.10,"68.31.4, 68.31.2, 69.20, 68.31.3, 68.31.1,",901,VO_RRMSPSV_0000_9965_20230611_deaeaa5b-3333-48...


In [59]:
cols_to_check_for_duplicates = [
    "kind", "category",
    "first_name", "last_name", "patronymic",
    "org_name", "org_short_name",
    "region_code", "region_name",
    "district_name", "city_name", "settlement_name",
    "activity_code_main"
]
w_for_start_date = Window.partitionBy(cols_to_check_for_duplicates).orderBy("date")
w_for_end_date = Window.partitionBy("ind_tin", "org_tin").orderBy("date")

rsmp_table = (
    rsmp
    .withColumn("row_number", row_number().over(w_for_start_date))
    .filter("row_number = 1")
    .withColumn("end_date", lead("date").over(w_for_end_date))
    .withColumnRenamed("date", "start_date")
    .select(
        "kind",
        "category",
        "ind_tin",
        "ind_number",
        upper("first_name").alias("first_name"),
        upper("last_name").alias("last_name"),
        upper("patronymic").alias("patronymic"),
        upper("org_name").alias("org_name"),
        upper("org_short_name").alias("org_short_name"),
        "org_tin", 
        "org_number",
        "region_code",
        upper("region_name").alias("region_name"),
        upper("region_type").alias("region_type"),
        upper("district_name").alias("district_name"),
        upper("district_type").alias("district_type"),
        upper("city_name").alias("city_name"),
        upper("city_type").alias("city_type"),
        upper("settlement_name").alias("settlement_name"),
        upper("settlement_type").alias("settlement_type"),
        "activity_code_main",
        "start_date",
        "end_date",
    )
    .cache()
)

In [61]:
rsmp_table.count()

                                                                                

172095

In [60]:
rsmp_table.limit(10).toPandas()

                                                                                

Unnamed: 0,kind,category,ind_tin,ind_number,first_name,last_name,patronymic,org_name,org_short_name,org_tin,org_number,region_code,region_name,region_type,district_name,district_type,city_name,city_type,settlement_name,settlement_type,activity_code_main,start_date,end_date
0,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АВАН...","ООО ""АВАНТАЖ""",273080245,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2017-04-10,
1,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2022-03-10
2,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""АКЦЕНТ""","ООО ""АКЦЕНТ""",276115295,1080276003026.0,2,БАШКОРТОСТАН,РЕСП.,,,УФА,Г.,,,69.1,2022-03-10,
3,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ИМПЕ...","ООО ""ИМПЕРИЯ НЕДВИЖИМОСТИ""",276909812,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-08-10,
4,1,1,,,,,,ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ЮРИДИ...,"ООО ЮКЦ ""АКТИС""",276918454,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-12-10,2018-07-10
5,1,1,,,,,,ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ЮРИДИ...,"ООО ЮК ""БЕЛЫЙ ДОМ""",276918454,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2018-07-10,
6,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЦЕНТ...","ООО ""ЦЕНТР КАДРОВОГО КОНСАЛТИНГА ""ВАШИ КАДРЫ""",277070791,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2019-08-10,
7,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЦЕНТ...","ООО ""ЦЕНТР ЮРИДИЧЕСКОЙ ПОМОЩИ""",277107233,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-08-10,2020-09-10
8,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""ЦЕНТ...","ООО ""ЦЕНТР ЮРИДИЧЕСКОЙ ПОМОЩИ""",277107233,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,УФИМСКИЙ,РАЙОН,,,БУЛГАКОВО,СЕЛО,69.1,2020-09-10,
9,1,1,,,,,,"ОБЩЕСТВО С ОГРАНИЧЕННОЙ ОТВЕТСТВЕННОСТЬЮ ""СКОР...","ООО ""СКОРАЯ ЮРИДИЧЕСКАЯ ПОМОЩЬ""",277126959,,2,БАШКОРТОСТАН,РЕСПУБЛИКА,,,УФА,ГОРОД,,,69.1,2016-08-10,


In [62]:
rsmp_table.toPandas().to_csv("rsmp/csv/data.csv", index=False, na_rep="NA", float_format="%.0f")

                                                                                

# Number of Employees

In [6]:
staff_schema = StructType([
    StructField("org_tin", StringType(), False),
    StructField("employees_count", IntegerType(), True),
    StructField("data_date", DateType(), True),
    StructField("doc_date", DateType(), True),
    StructField("file_id", StringType(), True),
])
staff_path = pathlib.Path("sshr/csv")
staff_csv_files = [str(fn) for fn in staff_path.glob("data-*.csv")]

In [7]:
staff = session.read.options(header=True, dateFormat="dd.MM.yyyy").schema(staff_schema).csv(staff_csv_files)
staff.printSchema()

root
 |-- org_tin: string (nullable = true)
 |-- employees_count: integer (nullable = true)
 |-- data_date: date (nullable = true)
 |-- doc_date: date (nullable = true)
 |-- file_id: string (nullable = true)



In [36]:
w_rn = Window.partitionBy("org_tin", "data_date").orderBy(desc("doc_date"))
w_agg = Window.partitionBy("org_tin", "data_date").rangeBetween(Window.unboundedPreceding, Window.unboundedFollowing)
(staff
 .withColumn("row_number", row_number().over(w_rn))
 .withColumn("max", max("employees_count").over(w_agg))
 .withColumn("min", min("employees_count").over(w_agg))
 .filter("max != min")
 .dropDuplicates(["org_tin", "data_date"])
 .count()
)

                                                                                

24230

In [37]:
staff.dropDuplicates(["org_tin", "data_date"]).count()

                                                                                

11145413

In [65]:
w_rn = Window.partitionBy("org_tin", "data_date").orderBy(desc("doc_date"))

staff_table = (
    staff
    .join(reestr_org_tins, on="org_tin", how="leftsemi")
    .withColumn("row_number", row_number().over(w_rn))
    .filter("row_number = 1")
    .dropDuplicates(["org_tin", "data_date"])
    .select("org_tin", year("data_date").alias("year"), "employees_count")
    .orderBy("org_tin", "year")
    .cache()
)

In [66]:
staff_table.show(10)



+----------+----+---------------+
|   org_tin|year|employees_count|
+----------+----+---------------+
|0101013292|2021|              1|
|0104015040|2018|              2|
|0104015040|2019|              2|
|0104015040|2020|              2|
|0104015040|2021|              2|
|0104015040|2022|              2|
|0105006257|2018|             24|
|0105006257|2020|              1|
|0105036692|2018|              2|
|0105036692|2019|              2|
+----------+----+---------------+
only showing top 10 rows



                                                                                

In [67]:
staff_table.count()

                                                                                

174487

In [69]:
staff_table.toPandas().to_csv("sshr/csv/data.csv", index=False, na_rep="NA", float_format="%.0f")

                                                                                

In [71]:
revexp_schema = StructType([
    StructField("org_tin", StringType(), False),
    StructField("revenue", FloatType(), True),
    StructField("expediture", FloatType(), True),
    StructField("data_date", DateType(), True),
    StructField("doc_date", DateType(), True),
    StructField("file_id", StringType(), True),
])
revexp_path = pathlib.Path("revexp/csv")
revexp_csv_files = [str(fn) for fn in revexp_path.glob("data-*.csv")]

revexp = session.read.options(header=True, dateFormat="dd.MM.yyyy").schema(revexp_schema).csv(revexp_csv_files)
revexp.printSchema()

root
 |-- org_tin: string (nullable = true)
 |-- revenue: float (nullable = true)
 |-- expediture: float (nullable = true)
 |-- data_date: date (nullable = true)
 |-- doc_date: date (nullable = true)
 |-- file_id: string (nullable = true)



In [72]:
revexp.show(5)

+----------+---------+----------+----------+----------+--------------------+
|   org_tin|  revenue|expediture| data_date|  doc_date|             file_id|
+----------+---------+----------+----------+----------+--------------------+
|4632100208|1102000.0|  949000.0|2018-12-31|2019-10-15|VO_OTKRDAN5_9965_...|
|3437013648|  32000.0|   12000.0|2018-12-31|2019-10-15|VO_OTKRDAN5_9965_...|
|3662152924| 339000.0|  338000.0|2018-12-31|2019-10-15|VO_OTKRDAN5_9965_...|
|6454111471|  9.712E7|  9.5442E7|2018-12-31|2019-10-15|VO_OTKRDAN5_9965_...|
|6673090640|      0.0| 1150000.0|2018-12-31|2019-10-15|VO_OTKRDAN5_9965_...|
+----------+---------+----------+----------+----------+--------------------+
only showing top 5 rows



[Stage 98:>                                                         (0 + 1) / 1]                                                                                

In [73]:
w_rn = Window.partitionBy("org_tin", "data_date").orderBy(desc("doc_date"))

revexp_table = (
    revexp
    .join(reestr_org_tins, on="org_tin", how="leftsemi")
    .withColumn("row_number", row_number().over(w_rn))
    .filter("row_number = 1")
    .dropDuplicates(["org_tin", "data_date"])
    .select("org_tin", year("data_date").alias("year"), "revenue", "expediture")
    .orderBy("org_tin", "year")
    .cache()
)

In [74]:
revexp_table.count()

                                                                                

163074

In [76]:
revexp_table.show(10)

+----------+----+---------+----------+
|   org_tin|year|  revenue|expediture|
+----------+----+---------+----------+
|0104015040|2018| 864000.0|  719000.0|
|0104015040|2019| 424000.0|  657000.0|
|0104015040|2020|      0.0|       0.0|
|0104015040|2021|      0.0|  171000.0|
|0104015040|2022|      0.0|  136000.0|
|0105006257|2018|3560000.0| 3215000.0|
|0105006257|2019|1260000.0| 1260000.0|
|0105006257|2020|      0.0|       0.0|
|0105036692|2018| 276000.0|  264000.0|
|0105036692|2019| 300000.0|  271000.0|
+----------+----+---------+----------+
only showing top 10 rows



In [77]:
revexp_table.toPandas().to_csv("revexp/csv/data.csv", index=False, na_rep="NA", float_format="%.0f")

                                                                                