In [55]:
import pandas as pd
import ast


In [56]:
df1= pd.read_csv('../data/fine_tuning/gdpr_compliance.csv')
df2= pd.read_csv('../data/fine_tuning/gdpr_violations.csv')

# Dataset 1: GDPR Compliance cleaning

In [57]:
df1.head()

Unnamed: 0,id,source,text_segment,tags
0,1,foodnetwork.com,We may provide our analysis and certain non-pe...,['Data Recipients']
1,2,msn.com,We use Secure Socket Layer (SSL) technology to...,['Safeguards Copy']
2,3,msn.com,We incorporate standard industry practices sui...,['Safeguards Copy']
3,4,msn.com,We limit access to personal data only to those...,['Safeguards Copy']
4,5,msn.com,"We store data on multiple service systems, in ...",['Safeguards Copy']


In [58]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10511 entries, 0 to 10510
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            10511 non-null  int64 
 1   source        10511 non-null  object
 2   text_segment  10511 non-null  object
 3   tags          10511 non-null  object
dtypes: int64(1), object(3)
memory usage: 328.6+ KB


In [59]:
df1= df1[['text_segment','tags']]

In [60]:
df1['tags'].describe()

count                      10511
unique                        99
top       ['Processing Purpose']
freq                        1692
Name: tags, dtype: object

In [61]:
df1['tags'].unique()

array(["['Data Recipients']", "['Safeguards Copy']",
       "['Processing Purpose']", "['Data Categories']",
       "['Source of Data']", "['Right to Erase']",
       "['Right to Restrict']", "['Right to Access']",
       "['Right to Object']", "['Withdraw Consent']",
       "['Right to Portability']", "['Profiling']",
       "['Processing Purpose', 'Data Categories']",
       "['Data Recipients', 'Processing Purpose']",
       "['Controller Contact']", "['Right to Object', 'Profiling']",
       "['Right to Access', 'Right to Erase']",
       "['Right to Restrict', 'Right to Access']",
       "['Processing Purpose', 'Data Recipients', 'Profiling']",
       "['Provision Requirement']", "['Data Categories', 'Profiling']",
       "['Data Categories', 'Source of Data']",
       "['Data Recipients', 'Profiling']",
       "['Controller Contact', 'Right to Access']", "['Storage Period']",
       "['Lodge Complaint']", "['Processing Purpose', 'Source of Data']",
       "['Processing Purpose', 

In [62]:

tags_set = set()

for i in df1['tags']:
    try:
        tag_list = ast.literal_eval(i)
        # Iterate over the extracted list of tags
        for tag in tag_list:
            tags_set.add(tag.strip())
    except (SyntaxError, ValueError):
        print(f"Skipping invalid entry: {i}")

print(tags_set)


{'Withdraw Consent', 'Safeguards Copy', 'Profiling', 'Right to Restrict', 'Right to Access', 'Controller Contact', 'Lodge Complaint', 'Data Recipients', 'DPO Contact', 'Processing Purpose', 'Right to Portability', 'Storage Period', 'Right to Erase', 'Right to Object', 'Data Categories', 'Source of Data', 'Provision Requirement', 'Adequacy Decision'}


In [63]:
tags_set

{'Adequacy Decision',
 'Controller Contact',
 'DPO Contact',
 'Data Categories',
 'Data Recipients',
 'Lodge Complaint',
 'Processing Purpose',
 'Profiling',
 'Provision Requirement',
 'Right to Access',
 'Right to Erase',
 'Right to Object',
 'Right to Portability',
 'Right to Restrict',
 'Safeguards Copy',
 'Source of Data',
 'Storage Period',
 'Withdraw Consent'}

**map each tag with an article number and title**

In [64]:
articles = {
    'Adequacy Decision': 'Article 45 - Transfers on the basis of an adequacy decision',
    'Controller Contact': 'Article 13 - Information to be provided where personal data are collected from the data subject',
    'DPO Contact': 'Article 37 - Designation of the data protection officer',
    'Data Categories': 'Article 9 - Processing of special categories of personal data',
    'Data Recipients': 'Article 13 - Information to be provided where personal data are collected from the data subject',
    'Lodge Complaint': 'Article 77 - Right to lodge a complaint with a supervisory authority',
    'Processing Purpose': 'Article 5 - Principles relating to processing of personal data',
    'Profiling': 'Article 22 - Automated individual decision-making, including profiling',
    'Provision Requirement': 'Article 13 - Information to be provided where personal data are collected from the data subject',
    'Right to Access': 'Article 15 - Right of access by the data subject',
    'Right to Erase': 'Article 17 - Right to erasure (right to be forgotten)',
    'Right to Object': 'Article 21 - Right to object',
    'Right to Portability': 'Article 20 - Right to data portability',
    'Right to Restrict': 'Article 18 - Right to restriction of processing',
    'Safeguards Copy': 'Article 46 - Transfers subject to appropriate safeguards',
    'Source of Data': 'Article 14 - Information to be provided where personal data have not been obtained from the data subject',
    'Storage Period': 'Article 5 - Principles relating to processing of personal data',
    'Withdraw Consent': 'Article 7 - Conditions for consent'
}


In [65]:
def map_tags_to_articles(tags_str, articles):
    try:
        # Convert the string representation of a list to an actual list
        tags_list = ast.literal_eval(tags_str)
        mapped_tags = []

        for tag in tags_list:
            # Check if the tag exists in the articles dictionary
            if tag in articles:
                mapped_tags.append(f"{articles[tag]} ")
            else:
                mapped_tags.append(f"{tag}: Unknown Article")
        
        # Join the mapped tags into a single string separated by commas
        return ", ".join(mapped_tags)
    except (SyntaxError, ValueError):
        return "Invalid tags format"


In [66]:
df1['tags'] = df1['tags'].apply(lambda x: map_tags_to_articles(x, articles))


In [67]:
df1.head()

Unnamed: 0,text_segment,tags
0,We may provide our analysis and certain non-pe...,Article 13 - Information to be provided where ...
1,We use Secure Socket Layer (SSL) technology to...,Article 46 - Transfers subject to appropriate ...
2,We incorporate standard industry practices sui...,Article 46 - Transfers subject to appropriate ...
3,We limit access to personal data only to those...,Article 46 - Transfers subject to appropriate ...
4,"We store data on multiple service systems, in ...",Article 46 - Transfers subject to appropriate ...


In [68]:
df1['tags'].unique()

array(['Article 13 - Information to be provided where personal data are collected from the data subject ',
       'Article 46 - Transfers subject to appropriate safeguards ',
       'Article 5 - Principles relating to processing of personal data ',
       'Article 9 - Processing of special categories of personal data ',
       'Article 14 - Information to be provided where personal data have not been obtained from the data subject ',
       'Article 17 - Right to erasure (right to be forgotten) ',
       'Article 18 - Right to restriction of processing ',
       'Article 15 - Right of access by the data subject ',
       'Article 21 - Right to object ',
       'Article 7 - Conditions for consent ',
       'Article 20 - Right to data portability ',
       'Article 22 - Automated individual decision-making, including profiling ',
       'Article 5 - Principles relating to processing of personal data , Article 9 - Processing of special categories of personal data ',
       'Article 13 - I

In [69]:
df1.head()

Unnamed: 0,text_segment,tags
0,We may provide our analysis and certain non-pe...,Article 13 - Information to be provided where ...
1,We use Secure Socket Layer (SSL) technology to...,Article 46 - Transfers subject to appropriate ...
2,We incorporate standard industry practices sui...,Article 46 - Transfers subject to appropriate ...
3,We limit access to personal data only to those...,Article 46 - Transfers subject to appropriate ...
4,"We store data on multiple service systems, in ...",Article 46 - Transfers subject to appropriate ...


In [70]:
df1.rename(columns={'tags': 'articles'}, inplace=True)

In [71]:
df1['compliance'] = 'compliant'

In [72]:
df1.head()

Unnamed: 0,text_segment,articles,compliance
0,We may provide our analysis and certain non-pe...,Article 13 - Information to be provided where ...,compliant
1,We use Secure Socket Layer (SSL) technology to...,Article 46 - Transfers subject to appropriate ...,compliant
2,We incorporate standard industry practices sui...,Article 46 - Transfers subject to appropriate ...,compliant
3,We limit access to personal data only to those...,Article 46 - Transfers subject to appropriate ...,compliant
4,"We store data on multiple service systems, in ...",Article 46 - Transfers subject to appropriate ...,compliant


In [73]:
df1.to_csv('../data/fine_tuning/gdpr_compliance_preprocessed.csv', index=False)