In [None]:
import pandas as pd #pandas for data analysis
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker
import numpy as np

In [None]:
# Load the privacy-law corpus metadata file.
df = pd.read_csv("/content/privacy_law_corpus_metadata.csv")

# Drop the unknown column (left disabled)
#df.drop(['unknown'], axis=1)

# Strip whitespace from the column names BEFORE saving, so the cleaned CSV
# written below carries clean headers as well as clean values.
df.columns = df.columns.str.strip()

# Strip leading/trailing spaces from text values. Only real `str` values are
# touched: `.str.strip()` would turn any non-string entry of a mixed object
# column into NaN, silently discarding data.
df = df.apply(
    lambda col: col.map(lambda v: v.strip() if isinstance(v, str) else v)
    if col.dtype == "object" or pd.api.types.is_string_dtype(col.dtype)
    else col
)

# Save the cleaned data to a new csv file.
df.to_csv('cleaned_privacy_law_corpus_metadata.csv', index=False)

# Validate the column names.
print(df.columns.tolist())

# Preview the result; kept as the last expression so the notebook renders it.
df.head()

In [None]:
# Step 1: every '[Not Applicable]' entry is relabelled as a document that was
# originally written in another language; all other values are kept.
not_applicable = df['Translation Type'] == '[Not Applicable]'
df['Translation Type'] = df['Translation Type'].mask(not_applicable, 'Originally in other languages')

# Step 2: among the rows labelled that way, those whose original language is
# English are relabelled 'Originally English'.
english_original = (df['Original Language'] == 'English') & (df['Translation Type'] == 'Originally in other languages')
df.loc[english_original, 'Translation Type'] = 'Originally English'

In [None]:
# Frequency of each value in the "Translation Type" column.
valuecount_translation_type = df['Translation Type'].value_counts()

# Turn the counts into a two-column frame: Translation Type, No. of Documents.
valuecount_translation_type_df = (
    valuecount_translation_type
    .rename_axis('Translation Type')
    .reset_index(name='No. of Documents')
)

# Share of each translation type in the whole corpus, as a percentage
# rounded to two decimals.
document_total = valuecount_translation_type_df['No. of Documents'].sum()
percent_of_total = (valuecount_translation_type_df['No. of Documents'] / document_total) * 100
valuecount_translation_type_df['% of Total'] = percent_of_total.round(2)

# Show the resulting table.
print(valuecount_translation_type_df)

In [None]:
# Turn placeholder entries in "First Privacy Law" into missing values so the
# column can be parsed as dates afterwards.
#   "NYIF"   - presumably "not yet in force" (the earlier comment spelled it
#              "NYIP"; the value matched in the data is "NYIF") - TODO confirm
#   "[None]" - placeholder for "no value"
#
# `Series.replace("NYIF", None)` is avoided on purpose: depending on the pandas
# version, a `None` value with a scalar `to_replace` is treated as "no value
# given" and the cell is forward-filled from the previous row instead of being
# blanked. Masking by membership is unambiguous on every version.
placeholder_values = ["NYIF", "[None]"]
is_placeholder = df["First Privacy Law"].isin(placeholder_values)
df["First Privacy Law"] = df["First Privacy Law"].mask(is_placeholder)

In [None]:
# Parse 'First Privacy Law' as datetimes and keep the parsed values in a new
# column, 'date_created'.
first_law_dates = pd.to_datetime(df['First Privacy Law'])
df['date_created'] = first_law_dates

# Show the parsed column as the cell output.
df['date_created']

In [None]:
# Year component of every parsed date, stored next to the dates.
date_index = pd.DatetimeIndex(df['date_created'])
df['year'] = date_index.year

# One row per year with the number of documents dated in that year,
# ordered by year ascending.
year_freq = (
    df.groupby('year')
    .size()
    .reset_index(name='frequency')
    .sort_values(by='year')
)

# Running total of documents up to and including each year.
year_freq['cumulative_frequency'] = year_freq['frequency'].cumsum()

# Show the table.
print(year_freq)

In [None]:
# Twin-axis line chart: cumulative document count on the left axis (blue) and
# per-year document count on the right axis (red), both against the year.
fig, ax1 = plt.subplots(figsize=(14, 8))

# Left axis: cumulative number of privacy documents, one major tick per 100.
line1, = ax1.plot(year_freq['year'], year_freq['cumulative_frequency'], color='blue')
ax1.set_ylabel('Cumulative # Privacy Documents Enacted', color='blue')
ax1.yaxis.set_major_locator(ticker.MultipleLocator(100))

# Right axis (shares the x-axis): documents per year, one major tick per 20.
ax2 = ax1.twinx()
line2, = ax2.plot(year_freq['year'], year_freq['frequency'], color='red')
ax2.set_ylabel('# Privacy Documents Enacted', color='red')
ax2.yaxis.set_major_locator(ticker.MultipleLocator(20))

# X axis: label plus a major tick every 10 years (the locator sets the step).
ax1.set_xlabel('Year Enacted')
ax1.xaxis.set_major_locator(ticker.MultipleLocator(10))

# Limit the x-axis to the observed year range. set_xlim only takes the left and
# right bounds; the former third positional value (10) was not a step - it was
# bound to `emit`, and newer Matplotlib versions reject it outright.
ax1.set_xlim(year_freq['year'].min(), year_freq['year'].max())

# Title and a combined legend for the two lines.
plt.title('Privacy Laws enacted overtime', fontsize=16)
ax1.legend(handles=[line1, line2], labels=['Cumulative # Privacy Documents', '# Privacy Documents'], loc='upper left')

plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Same twin-axis chart as the previous cell, with larger label and tick fonts:
# cumulative count on the left axis (blue), per-year count on the right (red).
fig, ax1 = plt.subplots(figsize=(14, 8))

# Left axis: cumulative number of privacy documents, one major tick per 100.
line1, = ax1.plot(year_freq['year'], year_freq['cumulative_frequency'], color='blue')
ax1.set_ylabel('Cumulative # Privacy Documents Enacted', color='blue', fontsize=14)
ax1.yaxis.set_major_locator(ticker.MultipleLocator(100))

# Right axis (shares the x-axis): documents per year, one major tick per 20.
ax2 = ax1.twinx()
line2, = ax2.plot(year_freq['year'], year_freq['frequency'], color='red')
ax2.set_ylabel('# Privacy Documents Enacted', color='red', fontsize=14)
ax2.yaxis.set_major_locator(ticker.MultipleLocator(20))

# X axis: label, a major tick every 10 years, and larger tick labels on ax1.
ax1.set_xlabel('Year Enacted', fontsize=14)
ax1.xaxis.set_major_locator(ticker.MultipleLocator(10))
ax1.tick_params(axis='both', labelsize=12)

# Limit the x-axis to the observed year range. set_xlim only takes the left and
# right bounds; the former third positional value (10) was not a step - it was
# bound to `emit`, and newer Matplotlib versions reject it outright.
ax1.set_xlim(year_freq['year'].min(), year_freq['year'].max())

# Title and a combined legend for the two lines.
plt.title('Privacy Laws enacted over time', fontsize=16)
ax1.legend(handles=[line1, line2], labels=['Cumulative # Privacy Documents', '# Privacy Documents'], loc='upper left')

plt.show()


In [None]:
# Twin-axis chart restricted to 2000-2020: cumulative document count on the
# left axis (blue), per-year document count on the right axis (red).
fig, ax1 = plt.subplots(figsize=(14, 8))

# Left axis: cumulative number of privacy documents, one major tick per 100.
ax1.plot(year_freq['year'], year_freq['cumulative_frequency'], color='blue')
ax1.set_ylabel('Cumulative # Privacy Documents Enacted', color='blue')
ax1.yaxis.set_major_locator(ticker.MultipleLocator(100))

# Right axis (shares the x-axis): documents per year, one major tick per 20.
ax2 = ax1.twinx()
ax2.plot(year_freq['year'], year_freq['frequency'], color='red')
ax2.set_ylabel('# Privacy Documents Enacted', color='red')
ax2.yaxis.set_major_locator(ticker.MultipleLocator(20))

# X axis: label plus a major tick every 10 years (the locator sets the step).
ax1.set_xlabel('Year Enacted')
ax1.xaxis.set_major_locator(ticker.MultipleLocator(10))

# Show only 2000-2020. set_xlim only takes the left and right bounds; the former
# third positional value (10) was not a step - it was bound to `emit`, and newer
# Matplotlib versions reject it outright.
ax1.set_xlim(2000, 2020)

plt.title('Privacy Laws enacted overtime - 21st Century')

plt.show()

In [None]:
import matplotlib.pyplot as plt
import matplotlib.ticker as ticker

# Same 2000-2020 twin-axis chart as the previous cell, with larger label and
# tick fonts: cumulative count on the left (blue), per-year count on the right (red).
fig, ax1 = plt.subplots(figsize=(14, 8))

# Left axis: cumulative number of privacy documents, one major tick per 100.
ax1.plot(year_freq['year'], year_freq['cumulative_frequency'], color='blue')
ax1.set_ylabel('Cumulative # Privacy Documents Enacted', color='blue', fontsize=14)
ax1.yaxis.set_major_locator(ticker.MultipleLocator(100))

# Right axis (shares the x-axis): documents per year, one major tick per 20.
ax2 = ax1.twinx()
ax2.plot(year_freq['year'], year_freq['frequency'], color='red')
ax2.set_ylabel('# Privacy Documents Enacted', color='red', fontsize=14)
ax2.yaxis.set_major_locator(ticker.MultipleLocator(20))

# X axis: label, a major tick every 10 years, and larger tick labels on ax1.
ax1.set_xlabel('Year Enacted', fontsize=14)
ax1.xaxis.set_major_locator(ticker.MultipleLocator(10))
ax1.tick_params(axis='both', labelsize=12)

# Show only 2000-2020. set_xlim only takes the left and right bounds; the former
# third positional value (10) was not a step - it was bound to `emit`, and newer
# Matplotlib versions reject it outright.
ax1.set_xlim(2000, 2020)

plt.title('Privacy Laws enacted over time - 21st Century', fontsize=16)

plt.show()


In [None]:
# Add a 'jurisdiction_type' column to df with every row set to None; the
# classification cells further down fill it in.
df['jurisdiction_type'] = None

In [None]:
# Hand-curated jurisdiction names, one array per jurisdiction type. The
# spellings are reproduced exactly as given, since they are compared against
# the values of the 'Jurisidction Name' column.
State = np.array([
    'California',
])
Special_Administrative_Regions = np.array([
    'Hong Kong SAR',
    'Macao SAR',
    'BES Islands (Bonaire, Sint Estatius and Saba)',
])
Special_Economic_Zones = np.array([
    'Abu Dhabi Global Market',
    'Dubai Healthcare City (DHCC)',
    'Dubai International Financial Centre',
    'Qatar Financial Centre',
])
International_Organizations = np.array([
    'European Union',
    'African Union',
    'Asia-Pacific Economic Cooperation',
    'United Nations',
])
Crown_Dependencies = np.array([
    'Isle of Man',
    'Guernsey',
    'Jersey',
])
British_Overseas_Territories = np.array([
    'Bermuda',
    'Cayman Islands',
    'Gibraltar',
])
Intergovernmental_Organizations = np.array([
    'CoE + Uruguay, Mauritius, Senegal, Tunisia',
    'European Union + United States',
    'Switzerland + United States',
    'United States + 23 others',
])
# Country names listed by hand (later cells resolve other names via pycountry).
Countries = np.array([
    'Antigua & Barbuda',
    'Cura√ßao',
    'S√£o Tom√© and Principe',
    'Bosnia & Herzegovina',
    'Cape Verde',
    'Democratic Republic of the Congo',
    'East Timor',
    'Korea, South',
    'Macedonia (FYROM)',
    'Phillippines',
    'St. Lucia',
    'St. Vincent & Grenadines',
    'Trinidad & Tobago',
])

In [None]:
# Classify every row by looking its jurisdiction name up in the hand-curated
# name lists. This replaces a row-by-row `iterrows` loop with a single
# vectorized lookup.
#
# The pairs are listed in the order the lists were previously tested, so a name
# appearing in more than one list still receives the first matching type.
jurisdiction_groups = [
    ('State', State),
    ('Special Administrative Regions', Special_Administrative_Regions),
    ('Special Economic Zones', Special_Economic_Zones),
    ('International Organizations', International_Organizations),
    ('Crown Dependencies', Crown_Dependencies),
    ('British Overseas Territories', British_Overseas_Territories),
    ('Intergovernmental Organizations', Intergovernmental_Organizations),
    ('Countries', Countries),
]

# name -> type lookup table; setdefault keeps the earliest group for a name.
type_by_name = {}
for type_label, group_names in jurisdiction_groups:
    for group_name in group_names:
        type_by_name.setdefault(str(group_name), type_label)

# Names found in none of the lists come back as missing (NaN), which the later
# `.isna()` / `value_counts(dropna=False)` cells treat the same way as None.
# The object dtype keeps the column ready to receive string labels later on.
df['jurisdiction_type'] = df['Jurisidction Name'].map(type_by_name).astype(object)

In [None]:
# Count the rows per jurisdiction type; dropna=False also counts the rows whose
# type is still missing.
value_counts = df['jurisdiction_type'].value_counts(dropna=False)
print(value_counts)

In [None]:
!pip install pycountry

In [None]:
import pandas as pd
import pycountry

In [None]:
def get_jurisdiction_type(jurisdiction_name):
    """Return 'Countries' when pycountry can match the name, otherwise None.

    Parameters
    ----------
    jurisdiction_name : object
        Jurisdiction name to look up. Anything that is not a non-blank string
        (for example a missing value) is reported as unmatched instead of
        being passed to pycountry.

    Returns
    -------
    str or None
        'Countries' if `pycountry.countries.search_fuzzy` returns at least one
        match, None if the lookup fails or there is nothing to look up.
    """
    # Guard first: a NaN/None cell would otherwise raise a non-LookupError
    # inside pycountry and abort the whole `.apply`.
    if not isinstance(jurisdiction_name, str) or not jurisdiction_name.strip():
        return None
    try:
        matches = pycountry.countries.search_fuzzy(jurisdiction_name)
    except LookupError:
        return None
    # An empty result counts as "no match" too.
    return 'Countries' if matches else None

# Only the rows that still have no jurisdiction type are sent through the
# pycountry lookup; rows classified earlier keep their value.
unclassified = df['jurisdiction_type'].isna()
looked_up_types = df.loc[unclassified, 'Jurisidction Name'].apply(get_jurisdiction_type)
df.loc[unclassified, 'jurisdiction_type'] = looked_up_types

In [None]:
# Count the rows per jurisdiction type again; dropna=False also counts the rows
# whose type is still missing.
value_counts = df['jurisdiction_type'].value_counts(dropna=False)
print(value_counts)

In [None]:
# List the jurisdiction names of the rows that have no jurisdiction type.
still_unclassified = df['jurisdiction_type'].isnull()
print(df.loc[still_unclassified, 'Jurisidction Name'])

In [None]:
# Column layout of the corpus-composition summary table.
composition_columns = [
    'Jurisdiction Type',
    'Coverage Type',
    '#Unique Jurisdictions',
    '#Documents',
    'Examples',
]

# Every column starts out as its own empty list, giving an empty table.
corpus_composition = {column: [] for column in composition_columns}
cc = pd.DataFrame(corpus_composition)

In [None]:
# Number of distinct jurisdiction names within each jurisdiction type.
unique_counts = df.groupby('jurisdiction_type')['Jurisidction Name'].nunique()

# Number of documents (rows) of each jurisdiction type, most frequent first.
jurisdiction_counts = df['jurisdiction_type'].value_counts()

# Collect one record per jurisdiction type and build the table in one go.
# `DataFrame.append` no longer exists in pandas 2.x and copied the whole frame
# on every iteration.
composition_rows = [
    {'Jurisdiction Type': jurisdiction_type,
     '#Documents': count,
     '#Unique Jurisdictions': unique_counts[jurisdiction_type]}
    for jurisdiction_type, count in jurisdiction_counts.items()
]

# Reuse the column layout of `cc`; columns absent from the records
# ('Coverage Type', 'Examples') stay empty (NaN). Rebuilding from the records
# also keeps this cell idempotent: re-running it no longer duplicates rows.
cc = pd.DataFrame(composition_rows, columns=cc.columns)

In [None]:
# Coverage type and example jurisdiction for each jurisdiction type.
# These used to be plain lists matched to the rows of `cc` by position, which
# silently mislabels rows whenever the frequency ordering of the jurisdiction
# types changes. Keying by jurisdiction type removes that dependency.
# NOTE(review): the pairings below were derived from which name list each
# former example belongs to - confirm against the intended table.
coverage_by_type = {
    'Countries': 'N',
    'British Overseas Territories': 'N',
    'Crown Dependencies': 'N',
    'Special Administrative Regions': 'S/P',
    'International Organizations': 'I',
    'Special Economic Zones': 'S/P',
    'Intergovernmental Organizations': 'I',
    'State': 'S/P',
}
example_by_type = {
    'Countries': 'Albania',
    'British Overseas Territories': 'Cayman Islands',
    'Crown Dependencies': 'Isle of Man',
    'Special Administrative Regions': 'Macau',
    'International Organizations': 'United Nations',
    'Special Economic Zones': 'Qatar Financial Centre',
    'Intergovernmental Organizations': 'US + 23 Countries',
    'State': 'California (USA)',
}

# Per-row replacement values, aligned with `cc` through its own index.
Coverage_Type = cc['Jurisdiction Type'].map(coverage_by_type)
Examples = cc['Jurisdiction Type'].map(example_by_type)

# Fill only the cells that are still empty.
cc['Coverage Type'] = cc['Coverage Type'].fillna(Coverage_Type)
cc['Examples'] = cc['Examples'].fillna(Examples)

In [None]:
# Preview the first rows of df as this cell's output.
df.head()

In [None]:
# Distinct values of the 'Jurisidction Name' column, in order of first
# appearance, as a plain Python list.
distinct_names = pd.unique(df['Jurisidction Name'])
unique_countries = distinct_names.tolist()
print(unique_countries)

In [None]:
import pycountry

# Resolve each distinct jurisdiction name to pycountry's name for its best
# fuzzy match. Names that cannot be resolved are collected and reported
# instead of being dropped silently.
countries = []
unmatched_names = []
for country_name in unique_countries:
    # Missing values (e.g. NaN) are not strings and would make the lookup
    # raise something other than LookupError, so set them aside up front.
    if not isinstance(country_name, str):
        unmatched_names.append(country_name)
        continue
    try:
        country = pycountry.countries.search_fuzzy(country_name)[0]
    except LookupError:
        unmatched_names.append(country_name)
    else:
        countries.append(country.name)

print(countries)
print('Not matched by pycountry:', unmatched_names)

In [None]:
!pip install pycountry_convert

In [None]:
import pycountry_convert as pc

In [None]:
# `country_dict` (jurisdiction name -> country code) is not defined by any
# earlier cell, so on a fresh kernel this cell failed with NameError. Build it
# here when it is missing; a dictionary already present in the session is left
# untouched.
try:
    country_dict
except NameError:
    country_dict = {}
    for jurisdiction_name in df['Jurisidction Name'].dropna().unique():
        try:
            best_match = pycountry.countries.search_fuzzy(jurisdiction_name)[0]
        except LookupError:
            # Not resolvable to a country: the row keeps a missing code.
            continue
        # Three-letter ISO 3166-1 code. NOTE(review): chosen because the
        # choropleth cell passes these codes as `locations` without setting a
        # location mode - confirm this is the code format it expects.
        country_dict[jurisdiction_name] = best_match.alpha_3

# add new column with country codes
df['country code'] = df['Jurisidction Name'].map(country_dict)

In [None]:
# Display df as this cell's output to inspect the added 'country code' column.
df

In [None]:
# Independent single-column copy of the 'country code' column, so later changes
# to country_df do not affect df.
country_df = df[['country code']].copy()

In [None]:
!pip install geopandas

In [None]:
import pandas as pd
import plotly.express as px
import json

# Keep only the rows that actually have a value in 'country code'.
has_country_code = country_df['country code'].notna()
country_df = country_df.loc[has_country_code]

# Show the filtered frame as the cell output.
country_df

In [None]:
# Report whether any missing value is left in the 'country code' column.
nan_present = country_df['country code'].isna().any()
nan_report = (
    'There are NaN values in the "country_code" column'
    if nan_present
    else 'No NaN values in the "country_code" column'
)
print(nan_report)

In [None]:
# Number of rows per country code, as a two-column frame:
# 'country code' and 'frequency'.
country_freq = (
    country_df['country code']
    .value_counts()
    .rename_axis('country code')
    .reset_index(name='frequency')
)

# Show the table as the cell output.
country_freq

In [None]:
import plotly.express as px
# World choropleth of document counts per country code.
choropleth_options = {
    'locations': 'country code',           # column holding the country codes
    'color': 'frequency',                  # column that drives the shading
    'scope': 'world',                      # map scope
    'projection': 'equirectangular',       # map projection
    'color_continuous_scale': 'blues',     # colour scale
}
fig = px.choropleth(country_freq, **choropleth_options)

# Hide the frame drawn around the map.
fig.update_layout(geo={'showframe': False})

fig.show()