In [None]:
import pandas as pd

In [None]:
import plotly.io as pio
# Make "notebook_connected" the default plotly renderer for this session.
# NOTE(review): no plotting code is visible in these cells — confirm this
# setting is still needed, or move it to the notebook that draws the figures.
pio.renderers.default = "notebook_connected"

In [None]:
# Load the raw survey table from disk into `df`.
df = pd.read_csv(filepath_or_buffer='../Data/sexuality_country_gender.csv')

In [None]:
# Preview the first five rows of the raw table.
df.head(5)

In [None]:
# Forward-fill the 'Country' and 'Gender' labels so that every row carries its
# own labels (presumably the raw export only labels the first row of each
# group — verify against the CSV).
# The result is assigned back to `df` rather than calling
# `df[col].fillna(method='ffill', inplace=True)`: the `method=` argument of
# fillna is deprecated, and an in-place call on a column selection is chained
# assignment, which does not update `df` under pandas Copy-on-Write.
label_columns = ['Country', 'Gender']
df[label_columns] = df[label_columns].ffill()

# Flag the "Weighted base (000s)" / "Unweighted sample" rows so they can be
# excluded here and handled separately.
# na=False makes rows whose 'Gender' is still missing (e.g. rows above the
# first label) count as "no match" instead of putting NaN into the mask, which
# would make the negation / boolean indexing below fail. Those rows are then
# removed by the row-wise dropna.
is_base_or_sample_row = df['Gender'].str.contains("Weighted base|Unweighted sample", na=False)

main_df = (
    df[~is_base_or_sample_row]
    .dropna(axis=1, how='all')  # drop columns that are entirely empty
    .dropna(axis=0, how='any')  # drop rows that still have any missing value
)

# Display the cleaned main data to ensure it's structured correctly
main_df.head()

In [None]:
# Reshape from wide (one column per year) to long: one row per
# Country / Gender / Sexuality / Year combination.
year_columns = ['2010', '2011', '2012', '2013', '2014']  # Update this list based on your dataset
id_columns = ['Country', 'Gender', 'Sexuality']
long_format_df = pd.melt(
    main_df,
    id_vars=id_columns,
    value_vars=year_columns,
    var_name='Year',
    value_name='Percentage',
)

# Coerce the melted values to numbers; entries that cannot be parsed become NaN.
long_format_df = long_format_df.assign(
    Percentage=pd.to_numeric(long_format_df['Percentage'], errors='coerce')
)

# Preview the first 20 rows of the long-format table.
long_format_df.head(20)

In [None]:
# Put the long-format rows in reading order: by 'Country', then 'Gender',
# then 'Year'.
sort_keys = ['Country', 'Gender', 'Year']
sorted_df = long_format_df.sort_values(by=sort_keys)

# Renumber the index from 0 so it follows the new row order.
sorted_df = sorted_df.reset_index(drop=True)

# Preview the first 20 rows to check the ordering.
sorted_df.head(20)

In [None]:
# Keep two decimal places in the 'Percentage' column.
rounded_percentage = sorted_df['Percentage'].round(decimals=2)
sorted_df['Percentage'] = rounded_percentage

In [None]:
# Write the cleaned, sorted table to disk, leaving out the index column.
sorted_df.to_csv(path_or_buf='../Data/cleaned_sexuality_df.csv', index=False)