In [322]:
import pandas as pd
import ast
import re

In [323]:
df1= pd.read_csv('../data/fine_tuning/gdpr_compliance.csv')
df2= pd.read_csv('../data/fine_tuning/gdpr_violations.csv')

# Dataset 1: GDPR Compliance cleaning

In [324]:
df1.head()

Unnamed: 0,id,source,text_segment,tags
0,1,foodnetwork.com,We may provide our analysis and certain non-pe...,['Data Recipients']
1,2,msn.com,We use Secure Socket Layer (SSL) technology to...,['Safeguards Copy']
2,3,msn.com,We incorporate standard industry practices sui...,['Safeguards Copy']
3,4,msn.com,We limit access to personal data only to those...,['Safeguards Copy']
4,5,msn.com,"We store data on multiple service systems, in ...",['Safeguards Copy']


In [325]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10511 entries, 0 to 10510
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            10511 non-null  int64 
 1   source        10511 non-null  object
 2   text_segment  10511 non-null  object
 3   tags          10511 non-null  object
dtypes: int64(1), object(3)
memory usage: 328.6+ KB


In [326]:
df1= df1[['text_segment','tags']]

In [327]:
df1['tags'].describe()

count                      10511
unique                        99
top       ['Processing Purpose']
freq                        1692
Name: tags, dtype: object

In [328]:
df1['tags'].unique()

array(["['Data Recipients']", "['Safeguards Copy']",
       "['Processing Purpose']", "['Data Categories']",
       "['Source of Data']", "['Right to Erase']",
       "['Right to Restrict']", "['Right to Access']",
       "['Right to Object']", "['Withdraw Consent']",
       "['Right to Portability']", "['Profiling']",
       "['Processing Purpose', 'Data Categories']",
       "['Data Recipients', 'Processing Purpose']",
       "['Controller Contact']", "['Right to Object', 'Profiling']",
       "['Right to Access', 'Right to Erase']",
       "['Right to Restrict', 'Right to Access']",
       "['Processing Purpose', 'Data Recipients', 'Profiling']",
       "['Provision Requirement']", "['Data Categories', 'Profiling']",
       "['Data Categories', 'Source of Data']",
       "['Data Recipients', 'Profiling']",
       "['Controller Contact', 'Right to Access']", "['Storage Period']",
       "['Lodge Complaint']", "['Processing Purpose', 'Source of Data']",
       "['Processing Purpose', 

In [329]:

tags_set = set()

for i in df1['tags']:
    try:
        tag_list = ast.literal_eval(i)
        # Iterate over the extracted list of tags
        for tag in tag_list:
            tags_set.add(tag.strip())
    except (SyntaxError, ValueError):
        print(f"Skipping invalid entry: {i}")

print(tags_set)


{'Withdraw Consent', 'Safeguards Copy', 'Profiling', 'Right to Restrict', 'Right to Access', 'Controller Contact', 'Lodge Complaint', 'Data Recipients', 'DPO Contact', 'Processing Purpose', 'Right to Portability', 'Storage Period', 'Right to Erase', 'Right to Object', 'Data Categories', 'Source of Data', 'Provision Requirement', 'Adequacy Decision'}


In [330]:
tags_set

{'Adequacy Decision',
 'Controller Contact',
 'DPO Contact',
 'Data Categories',
 'Data Recipients',
 'Lodge Complaint',
 'Processing Purpose',
 'Profiling',
 'Provision Requirement',
 'Right to Access',
 'Right to Erase',
 'Right to Object',
 'Right to Portability',
 'Right to Restrict',
 'Safeguards Copy',
 'Source of Data',
 'Storage Period',
 'Withdraw Consent'}

**map each tag with an article number and title**

In [331]:
articles = {
    'Adequacy Decision': 'Article 45 - Transfers on the basis of an adequacy decision',
    'Controller Contact': 'Article 13 - Information to be provided where personal data are collected from the data subject',
    'DPO Contact': 'Article 37 - Designation of the data protection officer',
    'Data Categories': 'Article 9 - Processing of special categories of personal data',
    'Data Recipients': 'Article 13 - Information to be provided where personal data are collected from the data subject',
    'Lodge Complaint': 'Article 77 - Right to lodge a complaint with a supervisory authority',
    'Processing Purpose': 'Article 5 - Principles relating to processing of personal data',
    'Profiling': 'Article 22 - Automated individual decision-making, including profiling',
    'Provision Requirement': 'Article 13 - Information to be provided where personal data are collected from the data subject',
    'Right to Access': 'Article 15 - Right of access by the data subject',
    'Right to Erase': 'Article 17 - Right to erasure (right to be forgotten)',
    'Right to Object': 'Article 21 - Right to object',
    'Right to Portability': 'Article 20 - Right to data portability',
    'Right to Restrict': 'Article 18 - Right to restriction of processing',
    'Safeguards Copy': 'Article 46 - Transfers subject to appropriate safeguards',
    'Source of Data': 'Article 14 - Information to be provided where personal data have not been obtained from the data subject',
    'Storage Period': 'Article 5 - Principles relating to processing of personal data',
    'Withdraw Consent': 'Article 7 - Conditions for consent'
}


In [332]:
def map_tags_to_articles(tags_str, articles):
    try:
        # Convert the string representation of a list to an actual list
        tags_list = ast.literal_eval(tags_str)
        mapped_tags = []

        for tag in tags_list:
            # Check if the tag exists in the articles dictionary
            if tag in articles:
                mapped_tags.append(f"{articles[tag]} ")
            else:
                mapped_tags.append(f"{tag}: Unknown Article")
        
        # Join the mapped tags into a single string separated by commas
        return ", ".join(mapped_tags)
    except (SyntaxError, ValueError):
        return "Invalid tags format"


In [333]:
df1['tags'] = df1['tags'].apply(lambda x: map_tags_to_articles(x, articles))


In [334]:
df1.head()

Unnamed: 0,text_segment,tags
0,We may provide our analysis and certain non-pe...,Article 13 - Information to be provided where ...
1,We use Secure Socket Layer (SSL) technology to...,Article 46 - Transfers subject to appropriate ...
2,We incorporate standard industry practices sui...,Article 46 - Transfers subject to appropriate ...
3,We limit access to personal data only to those...,Article 46 - Transfers subject to appropriate ...
4,"We store data on multiple service systems, in ...",Article 46 - Transfers subject to appropriate ...


In [335]:
df1['tags'].unique()

array(['Article 13 - Information to be provided where personal data are collected from the data subject ',
       'Article 46 - Transfers subject to appropriate safeguards ',
       'Article 5 - Principles relating to processing of personal data ',
       'Article 9 - Processing of special categories of personal data ',
       'Article 14 - Information to be provided where personal data have not been obtained from the data subject ',
       'Article 17 - Right to erasure (right to be forgotten) ',
       'Article 18 - Right to restriction of processing ',
       'Article 15 - Right of access by the data subject ',
       'Article 21 - Right to object ',
       'Article 7 - Conditions for consent ',
       'Article 20 - Right to data portability ',
       'Article 22 - Automated individual decision-making, including profiling ',
       'Article 5 - Principles relating to processing of personal data , Article 9 - Processing of special categories of personal data ',
       'Article 13 - I

In [336]:
df1.head()

Unnamed: 0,text_segment,tags
0,We may provide our analysis and certain non-pe...,Article 13 - Information to be provided where ...
1,We use Secure Socket Layer (SSL) technology to...,Article 46 - Transfers subject to appropriate ...
2,We incorporate standard industry practices sui...,Article 46 - Transfers subject to appropriate ...
3,We limit access to personal data only to those...,Article 46 - Transfers subject to appropriate ...
4,"We store data on multiple service systems, in ...",Article 46 - Transfers subject to appropriate ...


In [337]:
df1.rename(columns={'tags': 'articles'}, inplace=True)

In [338]:
df1['compliance'] = 'compliant'

In [339]:
df1.head()

Unnamed: 0,text_segment,articles,compliance
0,We may provide our analysis and certain non-pe...,Article 13 - Information to be provided where ...,compliant
1,We use Secure Socket Layer (SSL) technology to...,Article 46 - Transfers subject to appropriate ...,compliant
2,We incorporate standard industry practices sui...,Article 46 - Transfers subject to appropriate ...,compliant
3,We limit access to personal data only to those...,Article 46 - Transfers subject to appropriate ...,compliant
4,"We store data on multiple service systems, in ...",Article 46 - Transfers subject to appropriate ...,compliant


In [340]:
df1.to_csv('../data/fine_tuning/gdpr_compliance_preprocessed.csv', index=False)

# Dataset 2: GDPR Violations cleaning

In [341]:
df2.head()

Unnamed: 0,id,picture,name,price,authority,date,controller,article_violated,type,source,summary
0,1,https://www.privacyaffairs.com/wp-content/uplo...,Poland,9380,Polish National Personal Data Protection Offic...,10/18/2019,Polish Mayor,Art. 28 GDPR,Non-compliance with lawful basis for data proc...,https://uodo.gov.pl/decyzje/ZSPU.421.3.2019,No data processing agreement has been conclude...
1,2,https://www.privacyaffairs.com/wp-content/uplo...,Romania,2500,Romanian National Supervisory Authority for Pe...,10/17/2019,UTTIS INDUSTRIES,Art. 12 GDPR|Art. 13 GDPR|Art. 5 (1) c) GDPR|A...,Information obligation non-compliance,https://www.dataprotection.ro/?page=A_patra_am...,A controller was sanctioned because he had unl...
2,3,https://www.privacyaffairs.com/wp-content/uplo...,Spain,60000,Spanish Data Protection Authority (AEPD),10/16/2019,Xfera Moviles S.A.,Art. 5 GDPR|Art. 6 GDPR,Non-compliance with lawful basis for data proc...,https://www.aepd.es/resoluciones/PS-00262-2019...,The company had unlawfully processed the perso...
3,4,https://www.privacyaffairs.com/wp-content/uplo...,Spain,8000,Spanish Data Protection Authority (AEPD),10/16/2019,Iberdrola Clientes,Art. 31 GDPR,Failure to cooperate with supervisory authority,https://www.aepd.es/resoluciones/PS-00304-2019...,Iberdrola Clientes violated Article 13 of the ...
4,5,https://www.privacyaffairs.com/wp-content/uplo...,Romania,150000,Romanian National Supervisory Authority for Pe...,10/09/2019,Raiffeisen Bank SA,Art. 32 GDPR,Failure to implement sufficient measures to en...,https://www.dataprotection.ro/?page=Comunicat_...,Raiffeisen Bank Romania did not observe the ne...


In [342]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 437 entries, 0 to 436
Data columns (total 11 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   id                437 non-null    int64 
 1   picture           437 non-null    object
 2   name              437 non-null    object
 3   price             437 non-null    int64 
 4   authority         437 non-null    object
 5   date              437 non-null    object
 6   controller        437 non-null    object
 7   article_violated  437 non-null    object
 8   type              437 non-null    object
 9   source            437 non-null    object
 10  summary           437 non-null    object
dtypes: int64(2), object(9)
memory usage: 37.7+ KB


In [343]:
df2= df2[['summary','article_violated', 'type']]

In [344]:
df2['article_violated'].describe()

count              437
unique              86
top       Art. 32 GDPR
freq                73
Name: article_violated, dtype: object

In [345]:
df2['article_violated'].head()

0                                         Art. 28 GDPR
1    Art. 12 GDPR|Art. 13 GDPR|Art. 5 (1) c) GDPR|A...
2                              Art. 5 GDPR|Art. 6 GDPR
3                                         Art. 31 GDPR
4                                         Art. 32 GDPR
Name: article_violated, dtype: object

In [346]:
import pandas as pd
import re

def transform_articles(df, column_name):
    def transform_text(text):
        # Split the text by '|' and process each part
        articles = text.split('|')
        transformed = []
        for article in articles:
            # Remove any extra spaces and standardize the format
            article = article.strip()
            # Extract the main article number using regex
            match = re.search(r'Art\.\s*(\d+)', article)
            if match:
                transformed.append(f"Art.{match.group(1)}")
        return ', '.join(transformed)
    
    df[column_name] = df[column_name].apply(transform_text)
    return df

df2 = transform_articles(df2, 'article_violated')
print(df2.head())


                                             summary  \
0  No data processing agreement has been conclude...   
1  A controller was sanctioned because he had unl...   
2  The company had unlawfully processed the perso...   
3  Iberdrola Clientes violated Article 13 of the ...   
4  Raiffeisen Bank Romania did not observe the ne...   

               article_violated  \
0                        Art.28   
1  Art.12, Art.13, Art.5, Art.6   
2                  Art.5, Art.6   
3                        Art.31   
4                        Art.32   

                                                type  
0  Non-compliance with lawful basis for data proc...  
1              Information obligation non-compliance  
2  Non-compliance with lawful basis for data proc...  
3    Failure to cooperate with supervisory authority  
4  Failure to implement sufficient measures to en...  


In [347]:
print(df2['article_violated'].unique())

['Art.28' 'Art.12, Art.13, Art.5, Art.6' 'Art.5, Art.6' 'Art.31' 'Art.32'
 'Art.32, Art.33' 'Art.5, Art.25' 'Art.21, Art.25'
 'Art.15, Art.17, Art.21' 'Art.5' 'Art.13, Art.37' 'Art.17'
 'Art.5, Art.9, Art.35, Art.36' 'Art.6'
 'Art.5, Art.5, Art.6, Art.13, Art.14' 'Art.25, Art.5' 'Art.33'
 'Art.5, Art.12, Art.13, Art.32' 'Art.5, Art.7' 'Art.5, Art.5'
 'Art.6, Art.5, Art.13' 'Art.5, Art.32, Art.33' 'Art.5, Art.5, Art.32'
 'Art.15' 'Art.5, Art.9, Art.9, Art.6' 'Art.33, Art.33, Art.34'
 'Art.5, Art.32' 'Art.14' 'Art.5, Art.5, Art.13, Art.17, Art.6'
 'Art.6, Art.5' 'Art.13, Art.14, Art.6, Art.4, Art.5'
 'Art.5, Art.5, Art.6, Art.13' 'Art.12, Art.15, Art.18, Art.13' 'Art.13'
 '' 'Art.83, Art.33, Art.34' 'Art.12, Art.17'
 'Art.5, Art.6, Art.13, Art.14, Art.21' 'Art.6, Art.7' 'Art.25'
 'Art.12, Art.15' 'Art.37' 'Art.12, Art.15, Art.17' 'Art.5, Art.13'
 'Art.58' 'Art.21' 'Art.6, Art.12, Art.13' 'Art.6, Art.25' 'Art.6, Art.9'
 'Art.5, Art.6, Art.17, Art.21' 'Art.5, Art.6, Art.7, Art.21'
 'Art.5,

In [348]:
len(df2['article_violated'].unique())

59

In [349]:
article_titles = {
    "Art.5": "Article 5 - Principles relating to processing of personal data",
    "Art.6": "Article 6 - Lawfulness of processing",
    "Art.7": "Article 7 - Conditions for consent",
    "Art.9": "Article 9 - Processing of special categories of personal data",
    "Art.12": "Article 12 - Transparent information, communication and modalities for the exercise of the rights of the data subject",
    "Art.13": "Article 13 - Information to be provided where personal data are collected from the data subject",
    "Art.14": "Article 14 - Information to be provided where personal data have not been obtained from the data subject",
    "Art.15": "Article 15 - Right of access by the data subject",
    "Art.17": "Article 17 - Right to erasure ('right to be forgotten')",
    "Art.18": "Article 18 - Right to restriction of processing",
    "Art.21": "Article 21 - Right to object",
    "Art.25": "Article 25 - Data protection by design and by default",
    "Art.28": "Article 28 - Processor",
    "Art.31": "Article 31 - Cooperation with the supervisory authority",
    "Art.32": "Article 32 - Security of processing",
    "Art.33": "Article 33 - Notification of a personal data breach to the supervisory authority",
    "Art.34": "Article 34 - Communication of a personal data breach to the data subject",
    "Art.35": "Article 35 - Data protection impact assessment",
    "Art.36": "Article 36 - Prior consultation",
    "Art.37": "Article 37 - Designation of the data protection officer",
    "Art.44": "Article 44 - General principle for transfers",
    "Art.50": "Article 50 - International cooperation for the protection of personal data",
    "Art.58": "Article 58 - Powers",
    "Art.83": "Article 83 - General conditions for imposing administrative fines"
}
def map_articles_to_titles(articles_str, article_titles):
    try:
        # Split the string into individual articles
        articles_list = [article.strip() for article in articles_str.split(',')]
        mapped_articles = []

        for article in articles_list:
            # Check if the article exists in the article_titles dictionary
            if article in article_titles:
                mapped_articles.append(f"{article_titles[article]} ")
            else:
                mapped_articles.append(f"{article}: Unknown Article")
        
        # Join the mapped articles into a single string separated by commas
        return ", ".join(mapped_articles)
    except (SyntaxError, ValueError):
        return "Invalid articles format"
df2['article_violated'] = df2['article_violated'].apply(lambda x: map_articles_to_titles(x, article_titles))
df2.head()

Unnamed: 0,summary,article_violated,type
0,No data processing agreement has been conclude...,Article 28 - Processor,Non-compliance with lawful basis for data proc...
1,A controller was sanctioned because he had unl...,"Article 12 - Transparent information, communic...",Information obligation non-compliance
2,The company had unlawfully processed the perso...,Article 5 - Principles relating to processing ...,Non-compliance with lawful basis for data proc...
3,Iberdrola Clientes violated Article 13 of the ...,Article 31 - Cooperation with the supervisory ...,Failure to cooperate with supervisory authority
4,Raiffeisen Bank Romania did not observe the ne...,Article 32 - Security of processing,Failure to implement sufficient measures to en...


In [350]:
len(df2['article_violated'].unique())
df2['article_violated'].unique()

array(['Article 28 - Processor ',
       'Article 12 - Transparent information, communication and modalities for the exercise of the rights of the data subject , Article 13 - Information to be provided where personal data are collected from the data subject , Article 5 - Principles relating to processing of personal data , Article 6 - Lawfulness of processing ',
       'Article 5 - Principles relating to processing of personal data , Article 6 - Lawfulness of processing ',
       'Article 31 - Cooperation with the supervisory authority ',
       'Article 32 - Security of processing ',
       'Article 32 - Security of processing , Article 33 - Notification of a personal data breach to the supervisory authority ',
       'Article 5 - Principles relating to processing of personal data , Article 25 - Data protection by design and by default ',
       'Article 21 - Right to object , Article 25 - Data protection by design and by default ',
       "Article 15 - Right of access by the data sub

In [351]:
df2= df2[['summary', 'article_violated']]
df2.rename(columns={'summary': 'text_segment', 'article_violated': 'articles'}, inplace=True)

In [352]:
df2['compliance'] = 'non-compliant'
df2.head()

Unnamed: 0,text_segment,articles,compliance
0,No data processing agreement has been conclude...,Article 28 - Processor,non-compliant
1,A controller was sanctioned because he had unl...,"Article 12 - Transparent information, communic...",non-compliant
2,The company had unlawfully processed the perso...,Article 5 - Principles relating to processing ...,non-compliant
3,Iberdrola Clientes violated Article 13 of the ...,Article 31 - Cooperation with the supervisory ...,non-compliant
4,Raiffeisen Bank Romania did not observe the ne...,Article 32 - Security of processing,non-compliant


In [353]:
df2.to_csv('../data/fine_tuning/gdpr_violations_preprocessed.csv', index=False)