In [1]:
import pandas as pd
import plotly.express as px

# Load the enriched survey export: semicolon-separated, UTF-8 encoded CSV.
file_path = "../data/Etape 1 Identification du film - Feuille 1 - enrichi.csv"
data = pd.read_csv(file_path, sep=";", encoding="utf-8")

# list all columns
# fields = data.columns.to_frame()



# Analyse de la technologie

In [2]:
def prepare_technology_data(data, colname_id, n_characters=4):
    """
    Build a long-format table of technology usage across all characters.

    For every character index ``i`` from 1 to ``n_characters``, the columns
    ``character{i}_technology_tools [<tool>]`` and ``character{i}_<colname_id>``
    are selected from ``data`` and renamed to the bare tool name and
    ``colname_id`` respectively. The per-character frames are stacked, then
    melted so that each row holds one (attribute value, tool, answer) triple.

    Parameters:
        data (DataFrame): Wide dataset holding one set of columns per character.
        colname_id (str): Column-name suffix of the character attribute to keep
            alongside the technology answers, e.g. 'gender'.
        n_characters (int): Number of characters described per row (columns
            ``character1_*`` .. ``character{n}_*``). Must be at least 1.
            Defaults to 4.

    Returns:
        DataFrame: Columns ``[colname_id, 'Technology', 'Frequency']``. Rows
        containing any missing value are dropped; the index keeps the labels
        produced by ``melt`` (it is not renumbered after the drop).

    Raises:
        KeyError: If one of the expected columns is missing from ``data``.
    """
    # Technology tools as described in the dataset
    tech_tools_suffix = [
        'Smartphone', 'Ordinateur', 'TV', 'Tablette', 'Console de jeux',
        'Objets connectés', 'Robotique', 'Autre'
    ]

    # Collect one renamed frame per character and concatenate once at the end,
    # instead of repeatedly concatenating onto an (initially empty) DataFrame.
    character_frames = []
    for i in range(1, n_characters + 1):
        # Map this character's column names to character-independent names
        colnames = {
            f"character{i}_technology_tools [{tool}]": tool for tool in tech_tools_suffix
        }
        colnames[f"character{i}_" + colname_id] = colname_id

        character_frames.append(data[list(colnames)].rename(columns=colnames))

    all_characters_data = pd.concat(character_frames, ignore_index=True)

    # Melt to long format for easier plotting
    melted_data_all = all_characters_data.melt(id_vars=[colname_id],
                                               value_vars=tech_tools_suffix,
                                               var_name='Technology',
                                               value_name='Frequency')

    # Rows with a missing answer (or missing attribute value) cannot be plotted
    return melted_data_all.dropna()

# Example usage:
# df = pd.read_csv('your_dataset.csv')
# prepared_data = prepare_technology_data(df, colname_id='gender')
# print(prepared_data.head())


In [3]:
def prepare_character_data(data, colname_suffixes, n_characters=4):
    """
    Stack the per-character columns of a wide dataset into one frame.

    For every character index ``i`` from 1 to ``n_characters``, the columns
    ``character{i}_<suffix>`` are selected for each requested suffix and
    renamed to the bare suffix. The per-character frames are then stacked on
    top of each other with a fresh integer index.

    Parameters:
        data (DataFrame): Wide dataset holding one set of columns per character.
        colname_suffixes (iterable of str): Column-name suffixes to extract,
            e.g. ['job_sector', 'interested_ecology'].
        n_characters (int): Number of characters described per row (columns
            ``character1_*`` .. ``character{n}_*``). Must be at least 1.
            Defaults to 4.

    Returns:
        DataFrame: One column per suffix, with the rows of all characters
        stacked (character 1 first, then character 2, ...).

    Raises:
        KeyError: If one of the expected columns is missing from ``data``.
    """
    # Materialise once so a one-shot iterable is not exhausted after the first character
    suffixes = list(colname_suffixes)

    # Collect one renamed frame per character and concatenate once at the end,
    # instead of repeatedly concatenating onto an (initially empty) DataFrame.
    character_frames = []
    for i in range(1, n_characters + 1):
        # Map this character's column names to character-independent names
        colnames = {f"character{i}_{suffix}": suffix for suffix in suffixes}
        character_frames.append(data[list(colnames)].rename(columns=colnames))

    return pd.concat(character_frames, ignore_index=True)


In [4]:
# Long-format technology answers, keyed by each character's gender.
melted_data_all = prepare_technology_data(data=data, colname_id="gender")

## Graphiques

In [5]:
# Colour used for each usage-frequency answer in the charts.
# The insertion order below is also the display order of the answers.
color_map = {
    "Pas du tout": "#98FB98",
    "Occasionnellement": "#99CCFF",
    "Souvent": "#3A4EC6",
    "Systématiquement": "#FF5050",
}
category_orders = {"Frequency": list(color_map)}

# Shared y-axis title for the histograms
label_nb_characters = "Nombre de réponses"


### Utilisation de la technologie selon le genre

In [6]:
import plotly.express as px

# Stacked histogram: one bar per device, split by reported usage frequency.
fig = px.histogram(
    melted_data_all,
    x="Technology",
    color="Frequency",
    barmode="stack",
    title="Utilisation de la technologie par appareil et fréquence",
    labels={"count": "Count of Responses"},
    color_discrete_map=color_map,
    category_orders=category_orders,
)
# Order the devices by total number of answers and tilt the tick labels.
fig.update_layout(
    yaxis_title=label_nb_characters,
    legend_title="Fréquence",
    xaxis={"categoryorder": "total descending"},
    xaxis_tickangle=-45,
)
fig.show()



In [7]:

# Same stacked histogram, drawn as one panel per gender value.
fig = px.histogram(
    melted_data_all,
    x="Technology",
    color="Frequency",
    barmode="stack",
    facet_col="gender",
    title="Utilisation de la technologie par genre, type d'appareil et fréquence",
    labels={"count": "Count of Responses"},
    color_discrete_map=color_map,
    category_orders=category_orders,
)

# Blank the x-axis title and tilt the tick labels on every panel.
fig.update_xaxes(title_text="", tickangle=-45)
fig.update_layout(yaxis_title=label_nb_characters, legend_title="Fréquence")

fig.show()


### Utilisation de la technologie selon l'ethnie

In [8]:
# Long-format technology answers keyed by ethnic origin; the column is renamed
# so that the French label is what appears in the facet titles.
melted_data_all = prepare_technology_data(
    data=data, colname_id="ethnic_origin"
).rename(columns={"ethnic_origin": "Ethnie"})

In [9]:

# Stacked histogram of device usage, one panel per ethnic-origin value.
fig = px.histogram(
    melted_data_all,
    x="Technology",
    color="Frequency",
    barmode="stack",
    facet_col="Ethnie",
    title="Utilisation de la technologie par ethnie, type d'appareil et fréquence",
    color_discrete_map=color_map,
    category_orders=category_orders,
)

# Blank the x-axis title and tilt the tick labels on every panel.
fig.update_xaxes(title_text="", tickangle=-45)
fig.update_layout(yaxis_title=label_nb_characters, legend_title="Fréquence")

fig.show()


### Utilisation de la technologie selon l'âge

In [10]:
# Long-format technology answers keyed by age group; the column is renamed so
# that the French label is what appears in the legend.
melted_data_all = prepare_technology_data(
    data=data, colname_id="age_group"
).rename(columns={"age_group": "Catégorie d'âge"})

# Grouped bars: colour encodes the frequency answer, fill pattern the age group.
fig = px.histogram(
    melted_data_all,
    x="Technology",
    color="Frequency",
    barmode="group",
    pattern_shape="Catégorie d'âge",
    title="Utilisation de la technologie par catégorie d'âge, type d'appareil et fréquence",
    color_discrete_map=color_map,
    category_orders=category_orders,
)

# Blank the x-axis title and tilt the tick labels.
fig.update_xaxes(title_text="", tickangle=-45)
fig.update_layout(yaxis_title=label_nb_characters, legend_title="Fréquence")

fig.show()


# Analyse de la corrélation entre le métier pratiqué et la sensibilité du personnage à l’écologie

In [11]:
# Stack the job sector of every character into a single column.
job_data = prepare_character_data(data=data, colname_suffixes={"job_sector"})

# Number of characters per job sector, as a two-column frame for plotting.
job_sector_counts = job_data["job_sector"].value_counts().reset_index()
job_sector_counts.columns = ["Job Sector", "Frequency"]

# Bar chart of the job sector distribution
fig = px.bar(
    job_sector_counts,
    x="Job Sector",
    y="Frequency",
    title="Frequency of Job Sectors",
    labels={"Job Sector": "Job Sector", "Frequency": "Frequency"},
)

# Axis titles, with tilted tick labels so long sector names stay readable
fig.update_layout(
    xaxis_title="Job Sector",
    yaxis_title="Count",
    xaxis_tickangle=-45,
)

fig.show()


In [12]:
# Shorten the longest answer label so it fits on the heatmap axis.
# NOTE: this rewrites `data` in place, so every later cell sees the short label.
data.replace(
    'Non, il / elle a même des comportements et valeurs explicitement anti-écologiques ',
    'Non, anti-écolo',
    inplace=True,
)
job_data = prepare_character_data(
    data=data, colname_suffixes={'interested_ecology', 'job_sector'}
)

# Count characters for every (job sector, ecology interest) pair.
ct = pd.crosstab(index=job_data["job_sector"], columns=job_data["interested_ecology"])

# Heatmap of the counts, with the value printed in each cell.
fig = px.imshow(
    ct,
    text_auto=True,
    aspect="auto",
    labels={"x": "Interest in Ecology", "y": "Job Sector", "color": "Count"},
    title="Heatmap of Job Sectors and Interest in Ecology",
)

# Keep the x-axis labels at the bottom of the heatmap.
fig.update_xaxes(side="bottom")

fig.show()


In [13]:
# Stack each character's job and ecology interest into two columns.
job_data = prepare_character_data(
    data=data, colname_suffixes={'interested_ecology', 'job'}
)

# Count characters for every (job, ecology interest) pair.
ct = pd.crosstab(index=job_data["job"], columns=job_data["interested_ecology"])

# Heatmap of the counts, with the value printed in each cell.
fig = px.imshow(
    ct,
    text_auto=True,
    aspect="auto",
    labels={"x": "Interest in Ecology", "y": "Job", "color": "Count"},
    title="Heatmap of Job and Interest in Ecology",
)

# Keep the x-axis labels at the bottom of the heatmap.
fig.update_xaxes(side="bottom")

fig.show()


## Enjeux écologiques

In [14]:
# Are ecological and environmental issues mentioned during the story, even briefly?
# Count each answer and lay the result out as a two-column frame for the pie chart.
response_counts = data["environmental_issues"].value_counts().reset_index()
response_counts.columns = ["environmental_issues", "Count"]

fig = px.pie(
    response_counts,
    names="environmental_issues",
    values="Count",
    title="Des enjeux écologiques et environnementaux sont-ils mentionnés au cours du récit, même brièvement ?",
)
fig.show()

### Enjeux les plus évoqués

In [15]:
import re
def extract_text_between_brackets(string):
    """
    Return the text located between a '[' and the '(' that follows it.

    Used to derive a short label from a long column name.

    Parameters:
        string (str): Text to search.

    Returns:
        str or None: The captured text with surrounding whitespace stripped,
        taken from the leftmost match, or None when the pattern is not found.
    """
    # \[          -> a literal '['
    # ([^\[\(]+)  -> capture one or more characters that are neither '[' nor '('
    # \(          -> a literal '('
    found = re.search(r'\[([^\[\(]+)\(', string)
    return found.group(1).strip() if found else None


In [16]:
# Keep only the columns whose name starts with 'environmental_issues'
env_columns = data.loc[
    :, [col for col in data.columns if col.startswith("environmental_issues")]
]

# Restrict to rows answered 'Oui', then drop that yes/no column itself
enjeux = env_columns.loc[env_columns["environmental_issues"] == "Oui"].drop(
    columns="environmental_issues"
)

# Replace the long column names by the short label found between '[' and '('
colnames = {colname: extract_text_between_brackets(colname) for colname in enjeux.columns}
enjeux = enjeux.rename(columns=colnames)
enjeux


Unnamed: 0,Le changement climatique,L'accès à l'eau,La biodiversité,L'épuisement des ressources,Les contraintes énergétiques,La pollution
0,,,Absent,,,Mentionné une fois
3,Absent,,Mentionné à plusieurs reprises,,Absent,Absent
5,,Mentionné une fois,,Joue un rôle dans le récit,,
6,,,Absent,Absent,,
7,,,Joue un rôle dans le récit,Absent,,
10,,Joue un rôle dans le récit,Absent,,,Absent
11,Mentionné une fois,Absent,Mentionné à plusieurs reprises,,Est le sujet du récit,
12,,,,,,Mentionné une fois
14,,Joue un rôle dans le récit,Mentionné à plusieurs reprises,,,
19,,Absent,,,,Absent


In [17]:

# One row per (issue column, answer) pair
long_format_data = enjeux.melt(var_name="Column", value_name="Value")

# How many times each answer occurs for each issue
value_counts = (
    long_format_data
    .groupby(["Column", "Value"])
    .size()
    .reset_index(name="Counts")
)

# Answers as rows, issues as columns; pairs that never occur become 0
heatmap_data = value_counts.pivot(
    index="Value", columns="Column", values="Counts"
).fillna(0)

# Heatmap of the counts
fig = px.imshow(
    heatmap_data,
    labels={"x": "Column", "y": "Value", "color": "Frequency"},
    x=heatmap_data.columns,
    y=heatmap_data.index,
    title="Fréquence des mentions des enjeux écologiques selon le type d'enjeu",
)
fig.update_xaxes(side="bottom")  # keep the x-axis labels at the bottom
fig.show()



# Analyse de l'époque du récit

In [18]:
from collections import Counter

# Story eras in the order they should appear on the x-axis
time_periods = [
    "Entre la préhistoire et l'antiquité",
    "Entre le moyen-âge et le XVIIIe siècle",
    "Au XIXe siècle",
    "Entre 1900 et 1951",
    "Entre 1950 et 1980",
    "Entre 1980 et 2000",
    "Entre 2000 et 2010",
    "Entre 2010 et 2020",
    "Entre 2020 et 2030",
    "Entre 2030 et 2050",
    "Au-delà de 2050",
    "Autre",
]

# Wrap the dataset in a new DataFrame object
df = pd.DataFrame(data)

# A cell may hold several comma-separated eras: split them into one row each
# and strip the whitespace left around every value.
periods = (
    df["story_era"]
    .str.split(",")
    .explode()
    .str.strip()
)

# Occurrences per era, in the order of `time_periods`; eras never seen count 0
df_count = periods.value_counts().reindex(time_periods, fill_value=0)

# Bar chart of the counts
fig = px.bar(
    df_count,
    x=df_count.index,
    y=df_count.values,
    labels={"x": "Story Era", "y": "Frequency"},
    title="Frequency of Each Story Era",
)
fig.show()


In [19]:
# Per story era: number of stories, number of stories mentioning environmental
# issues, and the share of the latter as a percentage.
era = data.loc[:, ["story_era", "environmental_issues"]]

# A cell may hold several comma-separated eras: count each one separately.
periods = era["story_era"].str.split(",").explode().str.strip()

# Total count per era, in the order of `time_periods` (eras never seen count 0)
total_counts = periods.value_counts().reindex(time_periods, fill_value=0)

# Same count restricted to stories where environmental_issues is "Oui"
oui_era = era.loc[era["environmental_issues"] == "Oui", "story_era"]
oui_periods = oui_era.str.split(",").explode().str.strip()
oui_counts = oui_periods.value_counts().reindex(time_periods, fill_value=0)

# Both Series share the `time_periods` index, so they can be assembled directly
# as columns. This avoids merging two Series on an index-level name, which
# depends on how the installed pandas version names `value_counts` results.
counts = pd.DataFrame(
    {"total_count": total_counts, "oui_count": oui_counts},
    index=time_periods,
)
counts.index.name = "story_era"

# Ratio as a percentage; eras with no story at all (0 / 0) are reported as 0
counts["ratio_percentage"] = (
    counts["oui_count"] / counts["total_count"] * 100
).fillna(0)


In [20]:
import plotly.graph_objects as go

# Two overlaid bar series (all stories / stories with environmental issues)
# plus the ratio between them as a line on a secondary y-axis.
fig = go.Figure(
    data=[
        go.Bar(
            x=counts.index,
            y=counts["total_count"],
            name="Frequency",  # legend entry
            marker_color="indianred",
        ),
        go.Bar(
            x=counts.index,
            y=counts["oui_count"],
            name="Frequency with environmental issues",
            marker_color="lightsalmon",
        ),
        go.Scatter(
            x=counts.index,
            y=counts["ratio_percentage"],
            name="Ratio with environmental issues",
            mode="lines+markers",
            marker_color="black",
            yaxis="y2",  # drawn against the secondary axis defined below
        ),
    ]
)

# 'overlay' draws the two bar series on top of each other rather than side by side
fig.update_layout(
    barmode="overlay",
    title="Frequency of Each Story Era",
    xaxis_tickangle=-45,
    xaxis_title="Story Era",
    yaxis={"title": "Frequency", "side": "left"},
    yaxis2={
        "title": "Ratio",
        "side": "right",     # secondary y-axis on the right
        "overlaying": "y",   # shares the plot area with the primary y-axis
        "range": [0, 100],
    },
    legend={
        "x": 1.05,           # just right of the plot area
        "y": 1,              # aligned with the top of the plot
        "xanchor": "left",
        "yanchor": "top",
    },
)

fig.show()


In [23]:

# Not very informative, kept for reference.
# Cross-tabulate whether environmental issues are mentioned against the raw
# `story_era` value (multi-era answers are NOT split here).
ct = pd.crosstab(data['environmental_issues'], data['story_era'])

# Heatmap of the counts. Rows are the environmental_issues answers and columns
# are the story eras, so the labels and title describe those two variables.
fig = px.imshow(ct, text_auto=True, aspect="auto",
                labels=dict(x="Story Era", y="Environmental Issues Mentioned", color="Count"),
                title='Heatmap of Environmental Issues by Story Era')

# Keep the x-axis labels at the bottom of the heatmap
fig.update_xaxes(side="bottom")

# Display the plot
fig.show()