In [1]:
# Metadata describing this dataset ("topic"). Every name defined in this cell
# is passed to insert_topic() in the next cell; topicName, fileName and
# sourceFileExt are also used later to build file and upload paths.
# NOTE(review): `id` shadows the Python builtin of the same name for the rest
# of the notebook session.
id = 3
topicName = 'class-actions-pl'
# Triple-quoted texts below are flattened to one line: every newline becomes a
# space and the leading/trailing whitespace is stripped.
title = """
Class actions in Poland 2010-2021
""".replace('\n',' ').strip()
titlePl = """
Pozwy zbiorowe w Polsce 2010-2021
""".replace('\n',' ').strip()
country = 'Poland'
countryPl = 'Polska'
startYear = 2010
endYear = 2021
# NOTE(review): because newlines are replaced by spaces, any text line that
# itself ends with a space produces a double space in the final description.
description="""
Class action lawsuits in civil, commercial and labor 
law cases in which claims were brought by 
a larger number of employees (2010-2021).
Data is collected and published by the Polish 
Ministry of Justice.
""".replace('\n',' ').strip()
descriptionPl="""
Pozwy zbiorowe w sprawach cywilnych, 
gospodarczych i sprawy z zakresu prawa pracy, 
w których z pozwem wystąpiła większa grupa pracowników (2010-2021) - Polska.
Dane opublikowane przez polskie Ministerstwo Sprawiedliwości.
""".replace('\n',' ').strip()
sourceName = "Ministry of Justice (PL)"
sourceNamePl ="Ministerstwo Sprawiedliwości (PL)"
sourceLink="https://isws.ms.gov.pl/pl/baza-statystyczna/opracowania-wieloletnie/"
# Base name shared by the processed CSV files and the upload destinations.
fileName= "class-actions-pl"
# Extension appended to the uploaded copy of the original source file.
sourceFileExt = 'xlsx'

In [2]:
from src.database.insert_topic import insert_topic

# Register the topic metadata defined in the first cell.
# All arguments are positional, so their order must match the signature of
# insert_topic (a project helper whose definition is not shown here).
insert_topic(
    id,
    topicName,
    title,
    titlePl,
    country,
    countryPl,
    startYear,
    endYear,
    description,
    descriptionPl,
    sourceName,
    sourceNamePl,
    sourceLink,
    fileName,
    sourceFileExt
)

In [3]:
import pandas as pd

# Location of the downloaded source workbook; also uploaded unchanged at the
# end of the notebook.
original_data_path = '../data/raw/class-actions-pl/pozwy-zbiorowe-2010-2021.xlsx'

# Read the sheet, ignoring its first 6 rows.
# NOTE(review): presumably those rows are title/header lines above the first
# table — confirm against the workbook.
df = pd.read_excel(original_data_path, 
                   skiprows=6)

# Preview the first rows of the raw frame.
df.head()

Unnamed: 0.1,Unnamed: 0,Unnamed: 1,Unnamed: 2,odrzucono,oddalono,zwrócono,Unnamed: 6
0,2010,21,.,.,.,.,.
1,2011,37,21,4,–,11,20
2,2012,35,20,6,1,10,33
3,2013,22,26,5,6,5,29
4,2014,41,19,9,2,7,51


In [4]:
# Replace the placeholder cells '.' and '–' with 0 so the extracted tables can
# later be cast to int.
# NOTE(review): presumably these placeholders mark missing / not-applicable
# values in the source; mapping them to 0 makes them indistinguishable from
# genuine zeros — confirm this is intended.
df = df.replace(['.', '–'], 0)

In [5]:
# Column position -> (database column name, English description,
# Polish description). Positions are assigned in listing order, from 0.
column_names_descriptions = dict(enumerate([
    ('year', 'year', 'rok'),
    ('filed', 'court cases filed', 'wpłynęło'),
    ('settled', 'court cases settled', 'załatwione'),
    ('rejected', 'court cases rejected', 'odrzucone'),
    ('denied', 'court cases denied', 'oddalono'),
    ('returned', 'court cases returned', 'zwrócono'),
    ('pending', 'court cases pending', 'oczekujące na rozstrzygnięcie'),
]))

In [6]:
# Database column names only (first element of each description triple),
# in column-position order.
new_column_names = [
    name
    for name, _description, _description_pl in column_names_descriptions.values()
]

In [7]:
# First table of the sheet: rows 0-11 of `df`, all columns, relabelled with
# the database column names.
# NOTE(review): presumably one row per year 2010-2021 for the "C" case type —
# confirm against the workbook.
df_c = df[:12].set_axis(new_column_names, axis=1)

# Validate explicitly: a bare `isna().any()` expression in the middle of a cell
# is neither displayed nor acted on. Missing values would make the integer
# cast below fail anyway, so stop here with a clearer message.
assert not df_c.isna().any().any(), "df_c contains missing values"

df_c = df_c.astype(int)

df_c

Unnamed: 0,year,filed,settled,rejected,denied,returned,pending
0,2010,21,0,0,0,0,0
1,2011,37,21,4,0,11,20
2,2012,35,20,6,1,10,33
3,2013,22,26,5,6,5,29
4,2014,41,19,9,2,7,51
5,2015,32,31,9,2,7,52
6,2016,30,23,5,2,10,59
7,2017,16,27,6,3,4,48
8,2018,22,18,1,4,0,52
9,2019,16,25,1,2,8,43


In [8]:
# Second table of the sheet: rows 18-29 of `df`, all columns, relabelled with
# the database column names.
# NOTE(review): presumably one row per year 2010-2021 for the "Gc" case type —
# confirm against the workbook.
df_gc = df[18:30].set_axis(new_column_names, axis=1)

# Validate explicitly: a bare `isna().any()` expression in the middle of a cell
# is neither displayed nor acted on. Missing values would make the integer
# cast below fail anyway, so stop here with a clearer message.
assert not df_gc.isna().any().any(), "df_gc contains missing values"

df_gc = df_gc.astype(int)

df_gc

Unnamed: 0,year,filed,settled,rejected,denied,returned,pending
18,2010,0,0,0,0,0,0
19,2011,1,0,0,0,0,0
20,2012,4,1,1,0,0,4
21,2013,0,2,1,0,0,2
22,2014,1,2,2,0,0,1
23,2015,1,0,0,0,0,1
24,2016,0,0,0,0,0,2
25,2017,1,1,0,0,0,2
26,2018,1,2,0,0,0,2
27,2019,0,0,0,0,0,2


In [9]:
# Rows 37-48 and column positions 2-4 of `df` hold this table; label the three
# columns year / filed / settled and cast every value to int in one chain.
df_pr = (
    df.iloc[37:49, 2:5]
    .set_axis(new_column_names[:3], axis=1)
    .astype(int)
)

df_pr

Unnamed: 0,year,filed,settled
37,2010,1883,1841
38,2011,2395,2057
39,2012,2531,1373
40,2013,1323,1163
41,2014,2322,1064
42,2015,1078,2749
43,2016,992,1155
44,2017,514,642
45,2018,78,99
46,2019,37,75


In [10]:
# Rows 50-61 and column positions 2-4 of `df` hold this table; label the three
# columns year / filed / settled and cast every value to int in one chain.
df_po = (
    df.iloc[50:62, 2:5]
    .set_axis(new_column_names[:3], axis=1)
    .astype(int)
)

df_po

Unnamed: 0,year,filed,settled
50,2010,59,62
51,2011,52,40
52,2012,14,13
53,2013,49,26
54,2014,10,11
55,2015,13,15
56,2016,19,23
57,2017,9,7
58,2018,6,2
59,2019,2,7


In [11]:
# Prefix shared by every database table created for this dataset; also the
# name passed to the description-table helpers further down.
dsName = 'classActionsPl'
# Table-name suffix -> frame extracted in the cells above.
# NOTE(review): the suffixes presumably follow Polish court register symbols
# (C, Gc, Pr, Po) — confirm.
case_types = {
    'C': df_c,
    'Gc': df_gc,
    'Pr': df_pr,
    'Po': df_po,
}

# Full table names (prefix + suffix). This is a set, so its display order is
# not tied to the order of `case_types`; it is not referenced again in the
# visible cells.
ds_names = {f'{dsName}{x}' for x in case_types}

ds_names

{'classActionsPlC', 'classActionsPlGc', 'classActionsPlPo', 'classActionsPlPr'}

In [12]:
from src.database.connect_db import connect_db

# Open the project database connection (connect_db is a project helper; its
# configuration is not shown here).
db = connect_db()

# Notebook-level cursor: createTable(), insertData() and the spot-check query
# below all use this global.
# NOTE(review): no db.commit() appears in the visible cells — confirm that
# connect_db enables autocommit, otherwise the inserts may not persist.
cursor = db.cursor()

In [13]:
from pandas import DataFrame


def createTable(ca_df:DataFrame, tableName:str):
    """Create the database table that will hold ``ca_df``, if it is missing.

    The table gets a ``year INT PRIMARY KEY`` column plus one
    ``INT NOT NULL`` column for every column of ``ca_df`` after the first
    (the first column is taken to be the year). The statement is executed on
    the notebook-level ``cursor``.

    Args:
        ca_df: Frame whose column labels (from the second one on) become the
            table's column names.
        tableName: Name of the table to create. It is interpolated directly
            into the SQL text, as are the column labels, so both must be
            trusted identifiers.

    Returns:
        None. Database errors are printed with their real cause instead of
        being raised, so the notebook keeps running.
    """
    column_defs = ["year INT PRIMARY KEY"]
    column_defs.extend(f"{column} INT NOT NULL" for column in ca_df.columns[1:])

    # IF NOT EXISTS makes a re-run a no-op, so an existing table is no longer
    # signalled through an exception that has to be guessed at.
    query = f"CREATE TABLE IF NOT EXISTS {tableName} ({', '.join(column_defs)})"

    try:
        cursor.execute(query)
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through; the message reports what actually went wrong.
        print(f'could not create table {tableName}: {exc}')
    

In [14]:
# Create one table per case type, named <dsName><suffix>
# (e.g. classActionsPlC).
for key, value in case_types.items():
    createTable(value, f'{dsName}{key}')


In [15]:
from pandas import DataFrame


def insertData(ca_df:DataFrame, tableName:str):
    """Insert every row of ``ca_df`` into the table ``tableName``.

    The frame's column labels are used verbatim as the SQL column list and
    each row is sent as one parameter tuple through ``cursor.executemany``
    (``cursor`` is the notebook-level database cursor).

    Args:
        ca_df: Frame whose column labels match the target table's columns.
        tableName: Name of the table to insert into. It is interpolated
            directly into the SQL text, so it must be a trusted identifier.

    Returns:
        None. Database errors are printed with their real cause instead of
        being raised, so the notebook keeps running.

    NOTE(review): no commit is issued here — confirm that the connection
    behind ``cursor`` autocommits, otherwise the rows may not persist.
    """
    # itertuples keeps each column's own dtype, whereas iterrows first coerces
    # every row to a single common dtype.
    data = list(ca_df.itertuples(index=False, name=None))
    if not data:
        # Nothing to send; skip the database round trip.
        return

    columns = ', '.join(ca_df.columns)
    placeholders = ', '.join(['%s'] * len(ca_df.columns))
    query = f"INSERT INTO {tableName} ({columns}) VALUES ({placeholders});"

    try:
        cursor.executemany(query, data)
    except Exception as exc:
        # `Exception` (not a bare except) lets KeyboardInterrupt/SystemExit
        # through; the message reports what actually went wrong (a duplicate
        # key on re-run, a lost connection, a missing table, ...).
        print(f'data not inserted into {tableName}: {exc}')
    

In [16]:
# Load each case-type frame into its matching <dsName><suffix> table.
for key, value in case_types.items():
    insertData(value, f'{dsName}{key}')

In [17]:
# Spot-check: read back the first five rows of the "C" table.
# (Plain string literal: the query has no placeholders to interpolate.)
cursor.execute("""
               SELECT * 
               FROM classActionsPlC
               LIMIT 5
               """)
cursor.fetchall()

((2010, 21, 0, 0, 0, 0, 0),
 (2011, 37, 21, 4, 0, 11, 20),
 (2012, 35, 20, 6, 1, 10, 33),
 (2013, 22, 26, 5, 6, 5, 29),
 (2014, 41, 19, 9, 2, 7, 51))

In [18]:
from src.database.create_ds_desc_table import create_ds_desc_table


# Create the column-description table for this dataset via the project helper
# (its definition, and the resulting table name, are not shown here).
create_ds_desc_table(dsName)

In [19]:
# Description triples (column name, English text, Polish text) as a plain
# list, in column-position order.
data = list(column_names_descriptions.values())

In [20]:
from src.database.insert_into_ds_desc import insert_into_ds_desc


# Store the description triples for this dataset via the project helper
# (its definition is not shown here).
insert_into_ds_desc(dsName, data)

In [21]:
# Tabular form of the description triples: one row per described column.
df_desc = pd.DataFrame(data, 
    columns=['column_name', 
    'description', 
    'descriptionPl'])

In [22]:
# Local path of the column-description CSV (written and uploaded below).
df_path_desc =f'../data/processed/{topicName}/{fileName}-desc.csv'

In [23]:
import os

# On a fresh checkout the output folder may not exist yet, and to_csv does not
# create missing directories; create it up front (no-op when already there).
processed_dir = f'../data/processed/{topicName}'
os.makedirs(processed_dir, exist_ok=True)

# One CSV per case type, written without the frame index.
for key, value in case_types.items():
    value.to_csv(f'{processed_dir}/{fileName}-{key}.csv', index=False)

In [24]:
# Write the column descriptions next to the data CSVs, without the frame index.
df_desc.to_csv(df_path_desc, index=False)

In [25]:
# Destination paths handed to upload_file() in the next cell.
# NOTE(review): `destination` receives the zip archive but carries no ".zip"
# extension, unlike the other two destinations — confirm that consumers
# expect this exact name.
destination = f"{topicName}/{fileName}"
destination_desc = f"{topicName}/{fileName}-desc.csv"
destination_original = f"{topicName}/{fileName}-source.{sourceFileExt}"

In [26]:
from src.utils.upload_file_gpc import upload_file
from src.utils.zip_folder import zip_folder

# Archive of the whole processed folder for this topic.
output_path = f'../data/processed/{topicName}.zip'

# Folder path derived from topicName (same value as before) so it cannot drift
# from the paths used when the CSVs were written.
zip_folder(f'../data/processed/{topicName}/', output_path)

# Upload the archive, the column descriptions and the untouched source file.
# NOTE(review): the recorded output shows destinations prefixed with "None/",
# which suggests a bucket/prefix setting inside upload_file was unset when this
# ran — confirm.
upload_file(output_path, destination)
upload_file(df_path_desc, destination_desc)
upload_file(original_data_path, destination_original)

File ../data/processed/class-actions-pl.zip uploaded to None/class-actions-pl/class-actions-pl.
File ../data/processed/class-actions-pl/class-actions-pl-desc.csv uploaded to None/class-actions-pl/class-actions-pl-desc.csv.
File ../data/raw/class-actions-pl/pozwy-zbiorowe-2010-2021.xlsx uploaded to None/class-actions-pl/class-actions-pl-source.xlsx.
