In [1]:
import pyspark
from pyspark.sql import SparkSession
from pyspark.sql import types
import pandas as pd
# Create the Spark session for this notebook, running against a local master.
# getOrCreate() hands back an already-running session instead of starting a
# second one, so re-running this cell is harmless.
spark = (
    SparkSession.builder
    .master("local[*]")
    .appName('test')
    .getOrCreate()
)

In [2]:
# Load the results file: the first line is treated as the header row and
# Spark infers the column types from the data.
df_xoso = spark.read.csv(
    "xoso_2024-10-10.csv",
    header=True,
    inferSchema=True,
)

In [3]:
# Preview the first 10 rows of the raw data (long cell values are truncated in the output).
df_xoso.show(10)

+----------+--------------------+
|    cob_dt|         list_result|
+----------+--------------------+
|01-10-2023|03,06,07,08,15,23...|
|02-10-2023|00,02,09,11,27,32...|
|03-10-2023|00,05,15,18,19,23...|
|04-10-2023|00,07,11,16,27,28...|
|05-10-2023|01,02,03,07,10,14...|
|06-10-2023|02,07,08,09,16,17...|
|07-10-2023|06,10,11,12,13,18...|
|08-10-2023|00,02,05,14,18,20...|
|09-10-2023|03,04,11,17,19,25...|
|10-10-2023|04,13,18,19,21,22...|
+----------+--------------------+
only showing top 10 rows



In [5]:
from pyspark.sql import functions as F

In [6]:
# Replace the comma-separated string in list_result with an array of its
# pieces; the column keeps its name, only its type changes.
df_split = df_xoso.withColumn("list_result", F.split("list_result", ","))

In [7]:
# Preview the split result; with no argument show() prints the first 20 rows.
df_split.show()

+----------+--------------------+
|    cob_dt|         list_result|
+----------+--------------------+
|01-10-2023|[03, 06, 07, 08, ...|
|02-10-2023|[00, 02, 09, 11, ...|
|03-10-2023|[00, 05, 15, 18, ...|
|04-10-2023|[00, 07, 11, 16, ...|
|05-10-2023|[01, 02, 03, 07, ...|
|06-10-2023|[02, 07, 08, 09, ...|
|07-10-2023|[06, 10, 11, 12, ...|
|08-10-2023|[00, 02, 05, 14, ...|
|09-10-2023|[03, 04, 11, 17, ...|
|10-10-2023|[04, 13, 18, 19, ...|
|11-10-2023|[01, 06, 11, 19, ...|
|12-10-2023|[02, 03, 12, 20, ...|
|13-10-2023|[08, 09, 17, 19, ...|
|14-10-2023|[00, 03, 21, 24, ...|
|15-10-2023|[13, 24, 26, 29, ...|
|16-10-2023|[00, 06, 13, 15, ...|
|17-10-2023|[03, 06, 07, 10, ...|
|18-10-2023|[00, 04, 08, 09, ...|
|19-10-2023|[03, 04, 05, 06, ...|
|20-10-2023|[02, 03, 13, 15, ...|
+----------+--------------------+
only showing top 20 rows



In [9]:
# Schema of the data as loaded: both cob_dt and list_result come in as strings.
df_xoso.printSchema()

root
 |-- cob_dt: string (nullable = true)
 |-- list_result: string (nullable = true)



In [10]:
# Schema after the split: list_result is now an array of strings.
df_split.printSchema()

root
 |-- cob_dt: string (nullable = true)
 |-- list_result: array (nullable = true)
 |    |-- element: string (containsNull = false)



In [14]:
from pyspark.sql.functions import explode, explode_outer

In [15]:
# Flatten the array: one output row per (cob_dt, array element) pair.
# The exploded column is left with Spark's default name, `col`.
df2 = df_split.select("cob_dt", explode_outer("list_result"))

# Check the resulting schema and preview the first 10 exploded rows.
df2.printSchema()
df2.show(10)

root
 |-- cob_dt: string (nullable = true)
 |-- col: string (nullable = true)

+----------+---+
|    cob_dt|col|
+----------+---+
|01-10-2023| 03|
|01-10-2023| 06|
|01-10-2023| 07|
|01-10-2023| 08|
|01-10-2023| 15|
|01-10-2023| 23|
|01-10-2023| 25|
|01-10-2023| 29|
|01-10-2023| 32|
|01-10-2023| 33|
+----------+---+
only showing top 10 rows



In [16]:
# Empty template dict; the following cell fills in its keys.
dict_data = {}

In [17]:
# Fill the template: a "main_number" label plus one zeroed entry per
# two-digit number, keyed "number_00" .. "number_99".
dict_data["main_number"] = "00"
for i in range(100):
    # zfill(2) left-pads single digits with a zero, e.g. 7 -> "07".
    dict_data["number_" + str(i).zfill(2)] = 0
print(dict_data)

{'main_number': '00', 'number_00': 0, 'number_01': 0, 'number_02': 0, 'number_03': 0, 'number_04': 0, 'number_05': 0, 'number_06': 0, 'number_07': 0, 'number_08': 0, 'number_09': 0, 'number_10': 0, 'number_11': 0, 'number_12': 0, 'number_13': 0, 'number_14': 0, 'number_15': 0, 'number_16': 0, 'number_17': 0, 'number_18': 0, 'number_19': 0, 'number_20': 0, 'number_21': 0, 'number_22': 0, 'number_23': 0, 'number_24': 0, 'number_25': 0, 'number_26': 0, 'number_27': 0, 'number_28': 0, 'number_29': 0, 'number_30': 0, 'number_31': 0, 'number_32': 0, 'number_33': 0, 'number_34': 0, 'number_35': 0, 'number_36': 0, 'number_37': 0, 'number_38': 0, 'number_39': 0, 'number_40': 0, 'number_41': 0, 'number_42': 0, 'number_43': 0, 'number_44': 0, 'number_45': 0, 'number_46': 0, 'number_47': 0, 'number_48': 0, 'number_49': 0, 'number_50': 0, 'number_51': 0, 'number_52': 0, 'number_53': 0, 'number_54': 0, 'number_55': 0, 'number_56': 0, 'number_57': 0, 'number_58': 0, 'number_59': 0, 'number_60': 0, 'n

In [29]:
import copy 

In [31]:
# Build one row per two-digit number "00".."99". Each row starts as a copy of
# the dict_data template with its own "main_number" label.
#
# Every row, including the one for "00", is an independent copy. Appending
# dict_data itself as the first element would alias the template: any later
# write to list_data_xs[0] would also modify dict_data, and re-running this
# cell would then copy those modified values into every row.
list_data_xs = list()
for i in range(100):
    dict_copy = copy.deepcopy(dict_data)
    # zfill(2) left-pads single digits with a zero, e.g. 7 -> "07".
    dict_copy["main_number"] = str(i).zfill(2)
    list_data_xs.append(dict_copy)

In [28]:
# Set the template's main_number back to "00" and print the whole template.
# NOTE(review): this cell's execution count (28) is lower than that of the
# cell placed before it (31), so it was run out of order. Confirm it is still
# needed; on a fresh top-to-bottom run main_number is already "00" here.
dict_data['main_number'] = "00"
print(dict_data)

{'main_number': '00', 'number_00': 0, 'number_01': 0, 'number_02': 0, 'number_03': 0, 'number_04': 0, 'number_05': 0, 'number_06': 0, 'number_07': 0, 'number_08': 0, 'number_09': 0, 'number_10': 0, 'number_11': 0, 'number_12': 0, 'number_13': 0, 'number_14': 0, 'number_15': 0, 'number_16': 0, 'number_17': 0, 'number_18': 0, 'number_19': 0, 'number_20': 0, 'number_21': 0, 'number_22': 0, 'number_23': 0, 'number_24': 0, 'number_25': 0, 'number_26': 0, 'number_27': 0, 'number_28': 0, 'number_29': 0, 'number_30': 0, 'number_31': 0, 'number_32': 0, 'number_33': 0, 'number_34': 0, 'number_35': 0, 'number_36': 0, 'number_37': 0, 'number_38': 0, 'number_39': 0, 'number_40': 0, 'number_41': 0, 'number_42': 0, 'number_43': 0, 'number_44': 0, 'number_45': 0, 'number_46': 0, 'number_47': 0, 'number_48': 0, 'number_49': 0, 'number_50': 0, 'number_51': 0, 'number_52': 0, 'number_53': 0, 'number_54': 0, 'number_55': 0, 'number_56': 0, 'number_57': 0, 'number_58': 0, 'number_59': 0, 'number_60': 0, 'n

In [32]:
# Sanity check: the list should hold 100 rows, one per number "00".."99".
print(len(list_data_xs))

100


In [33]:
# Print each row's main_number label to confirm the rows run "00".."99" in order.
# Note: `i` is bound to a row dict here, not to an integer index.
for i in list_data_xs:
    print(i['main_number'])

00
01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
