In [3]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# 1. LOAD AND PREPARE THE DATA
try:
    # Load the dataset from the uploaded file
    df = pd.read_csv('/Users/padmanabhjagdishwanikar/Downloads/22070521058_PadmanabhWanikar_SecA_CA1_DSEDA/cleaned_dataset.csv')
    
    # Filter for the year 2017 and for entries where murder cases are greater than 0
    df_2018 = df[df['year'] == 2018].copy()
    sankey_df = df_2018[['state_name', 'district_name', 'murder']].copy()
    sankey_df = sankey_df[sankey_df['murder'] > 0].dropna()
    sankey_df.rename(columns={'murder': 'cases'}, inplace=True)

    # 2. GET A SORTED LIST OF ALL STATES BY TOTAL CASES
    state_totals = sankey_df.groupby('state_name')['cases'].sum().sort_values(ascending=False)
    all_sorted_states = state_totals.index.tolist()

    # 3. SET THE CHUNK SIZE
    CHUNK_SIZE = 3
    
    # 4. LOOP THROUGH STATES AND GENERATE A DIAGRAM FOR EACH CHUNK
    for i in range(0, len(all_sorted_states), CHUNK_SIZE):
        states_chunk = all_sorted_states[i : i + CHUNK_SIZE]
        chunk_df = sankey_df[sankey_df['state_name'].isin(states_chunk)].copy()
        
        # --- Data preparation for this chunk ---
        if chunk_df.empty:
            continue

        chunk_districts = chunk_df['district_name'].unique().tolist()
        all_labels = states_chunk + chunk_districts
        label_to_id = {label: j for j, label in enumerate(all_labels)}
        
        source_ids = [label_to_id[state] for state in chunk_df['state_name']]
        target_ids = [label_to_id[district] for district in chunk_df['district_name']]
        
        # --- Create the Sankey Diagram for this chunk ---
        colors = px.colors.qualitative.Plotly
        node_colors = []
        for label in all_labels:
            if label in states_chunk:
                state_index = states_chunk.index(label)
                node_colors.append(colors[state_index % len(colors)])
            else:
                node_colors.append('lightgray')

        fig = go.Figure(data=[go.Sankey(
            node=dict(
              pad=25,
              thickness=20,
              line=dict(color="black", width=0.5),
              label=all_labels,
              color=node_colors
            ),
            link=dict(
              source=source_ids,
              target=target_ids,
              value=chunk_df['cases']
          ))])
        
        # --- Customize the title and show the plot ---
        start_rank = i + 1
        end_rank = i + len(states_chunk)
        state_names_str = ", ".join(states_chunk)
        
        # **THIS IS THE MODIFIED SECTION**
        fig.update_layout(
            title_text=f"<b>Murder Cases for States (Rank {start_rank}-{end_rank})</b><br><sup>{state_names_str}</sup>",
            # Set the font size for all text in the figure
            font=dict(
                size=16  # Increased font size
            ),
            height=600 
        )
        
        fig.show()

except FileNotFoundError:
    print("Error: 'cleaned_dataset.csv' not found. Please ensure the file is uploaded correctly.")
except Exception as e:
    print(f"An error occurred: {e}")

In [12]:
import pandas as pd
import plotly.graph_objects as go

# 1. LOAD AND PREPARE DATA
df = pd.read_csv('/Users/padmanabhjagdishwanikar/Downloads/22070521058_PadmanabhWanikar_SecA_CA1_DSEDA/cleaned_dataset.csv')
df_2018 = df[df['year'] == 2018].copy()

# 2. DEFINE BROAD CRIME CATEGORIES
crime_categories = {
    'Heinous Crimes': ['murder', 'rape', 'dowry_death', 'acid_attack', 'human_trafficking'],
    'Property Crimes': ['robbery', 'dacoity', 'auto_motor_vehicle_theft', 'other_thefts', 'ext_and_blackmailing'],
    'Rioting & Unrest': ['rioting_communal_religious', 'rioting_political', 'rioting_caste_conflict', 'affray'],
    'Bodily Harm': ['volntri_cusng_grvus_hrt', 'grvus_hrt_dang_wepon_or_mean', 'hrt_by_endngrng_lyf_sfty_others'],
    'Crimes Against Women': ['assault_on_women', 'sexual_harassment_at_work', 'stalking', 'cruelty_by_husband_relatives']
}

# Keep only the columns we need
all_crime_cols = [col for sublist in crime_categories.values() for col in sublist]
df_filtered = df_2018[['state_name'] + all_crime_cols]

# 3. GET A SORTED LIST OF ALL STATES
df_filtered['total_crimes'] = df_filtered[all_crime_cols].sum(axis=1)
state_totals = df_filtered.groupby('state_name')['total_crimes'].sum().sort_values(ascending=False)
all_sorted_states = state_totals[state_totals > 0].index.tolist()

# 4. LOOP THROUGH STATES IN CHUNKS OF 3
CHUNK_SIZE = 3
for i in range(0, len(all_sorted_states), CHUNK_SIZE):
    states_chunk = all_sorted_states[i : i + CHUNK_SIZE]
    chunk_df = df_filtered[df_filtered['state_name'].isin(states_chunk)].copy()
    
    # --- Data preparation for this chunk ---
    alluvial_data = []
    for state in states_chunk:
        state_data = chunk_df[chunk_df['state_name'] == state]
        for category, cols in crime_categories.items():
            value = state_data[cols].sum().sum()
            if value > 0:
                alluvial_data.append({'source': state, 'target': category, 'value': value})

    plot_df = pd.DataFrame(alluvial_data)
    
    if plot_df.empty:
        continue

    # --- Create and plot the Alluvial Diagram for this chunk ---
    all_labels = pd.concat([plot_df['source'], plot_df['target']]).unique().tolist()
    label_to_id = {label: j for j, label in enumerate(all_labels)}
    
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_labels
        ),
        link=dict(
            source=[label_to_id[s] for s in plot_df['source']],
            target=[label_to_id[t] for t in plot_df['target']],
            value=plot_df['value']
        )
    ))
    
    # Customize the title and show the plot
    start_rank = i + 1
    end_rank = i + len(states_chunk)
    state_names_str = ", ".join(states_chunk)
    
    # **THIS IS THE MODIFIED SECTION**
    fig.update_layout(
        title_text=f"<b>Alluvial Diagram for States (Rank {start_rank}-{end_rank})</b><br><sup>{state_names_str}</sup>",
        # Set the font size for all text in the figure
        font=dict(
            size=16  # Increased font size
        )
    )
    
    fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [11]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px

# 1. LOAD AND PREPARE THE DATA
try:
    # Use the correct filename for your uploaded data
    df = pd.read_csv('/Users/padmanabhjagdishwanikar/Downloads/22070521058_PadmanabhWanikar_SecA_CA1_DSEDA/cleaned_dataset.csv')
    
    # --- MODIFICATION FOR ALL YEARS ---
    # a. Select the relevant columns without filtering by year
    sankey_df = df[['state_name', 'district_name', 'murder']].copy()
    sankey_df = sankey_df[sankey_df['murder'] > 0].dropna()

    # b. Group by state and district and sum the cases across all years
    sankey_df = sankey_df.groupby(['state_name', 'district_name'])['murder'].sum().reset_index()
    
    sankey_df.rename(columns={'murder': 'cases'}, inplace=True)

    # 2. GET A SORTED LIST OF ALL STATES BY TOTAL CASES
    state_totals = sankey_df.groupby('state_name')['cases'].sum().sort_values(ascending=False)
    all_sorted_states = state_totals.index.tolist()

    # 3. SET THE CHUNK SIZE
    CHUNK_SIZE = 3
    
    # 4. LOOP THROUGH STATES AND GENERATE A DIAGRAM FOR EACH CHUNK
    for i in range(0, len(all_sorted_states), CHUNK_SIZE):
        states_chunk = all_sorted_states[i : i + CHUNK_SIZE]
        chunk_df = sankey_df[sankey_df['state_name'].isin(states_chunk)].copy()
        
        if chunk_df.empty:
            continue

        chunk_districts = chunk_df['district_name'].unique().tolist()
        all_labels = states_chunk + chunk_districts
        label_to_id = {label: j for j, label in enumerate(all_labels)}
        
        source_ids = [label_to_id[state] for state in chunk_df['state_name']]
        target_ids = [label_to_id[district] for district in chunk_df['district_name']]
        
        colors = px.colors.qualitative.Plotly
        node_colors = []
        for label in all_labels:
            if label in states_chunk:
                state_index = states_chunk.index(label)
                node_colors.append(colors[state_index % len(colors)])
            else:
                node_colors.append('lightgray')

        fig = go.Figure(data=[go.Sankey(
            node=dict(
              pad=25,
              thickness=20,
              line=dict(color="black", width=0.5),
              label=all_labels,
              color=node_colors
            ),
            link=dict(
              source=source_ids,
              target=target_ids,
              value=chunk_df['cases']
          ))])
        
        start_rank = i + 1
        end_rank = i + len(states_chunk)
        state_names_str = ", ".join(states_chunk)
        
        # Updated title to reflect all years
        fig.update_layout(
            title_text=f"<b>Total Murder Cases (All Years) for States (Rank {start_rank}-{end_rank})</b><br><sup>{state_names_str}</sup>",
            font=dict(size=16),
            height=600 
        )
        
        fig.show()

except FileNotFoundError:
    print("Error: 'districtwise-ipc-crime-by-juveniles-2017-onwards.csv' not found.")
except Exception as e:
    print(f"An error occurred: {e}")

In [13]:
import pandas as pd
import plotly.graph_objects as go

# 1. LOAD AND PREPARE DATA
# Use the correct filename for your uploaded data
df = pd.read_csv('/Users/padmanabhjagdishwanikar/Downloads/22070521058_PadmanabhWanikar_SecA_CA1_DSEDA/cleaned_dataset.csv')

# --- MODIFICATION FOR ALL YEARS ---
# The line filtering for a specific year has been removed.

# 2. DEFINE BROAD CRIME CATEGORIES
# Column names are corrected to match your CSV file exactly.
crime_categories = {
    'Heinous Crimes': ['murder', 'rape', 'dowry_death', 'acid_attack', 'human_trafficking'],
    'Property Crimes': ['robbery', 'dacoity', 'auto_motor_vehicle_theft', 'other_thefts', 'ext_and_blackmailing'],
    'Rioting & Unrest': ['rioting_communal_religious', 'rioting_political', 'rioting_caste_conflict', 'affray'],
    'Bodily Harm': ['volntri_cusng_grvus_hrt', 'grvus_hrt_dang_wepon_or_mean', 'hrt_by_endngrng_lyf_sfty_others'],
    'Crimes Against Women': ['assault_on_women', 'sexual_harassment_at_work', 'stalking', 'cruelty_by_husband_relatives']
}

# Keep only the columns we need from the original dataframe
all_crime_cols = [col for sublist in crime_categories.values() for col in sublist if col in df.columns]
df_filtered = df[['state_name'] + all_crime_cols]

# 3. GET A SORTED LIST OF ALL STATES
# This will now sum the crimes across all years for each district before grouping by state
df_filtered['total_crimes'] = df_filtered[all_crime_cols].sum(axis=1)
state_totals = df_filtered.groupby('state_name')['total_crimes'].sum().sort_values(ascending=False)
all_sorted_states = state_totals[state_totals > 0].index.tolist()

# 4. LOOP THROUGH STATES IN CHUNKS OF 3
CHUNK_SIZE = 3
for i in range(0, len(all_sorted_states), CHUNK_SIZE):
    states_chunk = all_sorted_states[i : i + CHUNK_SIZE]
    chunk_df = df_filtered[df_filtered['state_name'].isin(states_chunk)].copy()
    
    # Data preparation for this chunk
    alluvial_data = []
    for state in states_chunk:
        state_data = chunk_df[chunk_df['state_name'] == state]
        for category, cols in crime_categories.items():
            # Ensure columns exist before summing
            existing_cols = [col for col in cols if col in state_data.columns]
            value = state_data[existing_cols].sum().sum()
            if value > 0:
                alluvial_data.append({'source': state, 'target': category, 'value': value})

    plot_df = pd.DataFrame(alluvial_data)
    
    if plot_df.empty:
        continue

    # Create and plot the Alluvial Diagram for this chunk
    all_labels = pd.concat([plot_df['source'], plot_df['target']]).unique().tolist()
    label_to_id = {label: j for j, label in enumerate(all_labels)}
    
    fig = go.Figure(go.Sankey(
        node=dict(
            pad=15,
            thickness=20,
            line=dict(color="black", width=0.5),
            label=all_labels
        ),
        link=dict(
            source=[label_to_id[s] for s in plot_df['source']],
            target=[label_to_id[t] for t in plot_df['target']],
            value=plot_df['value']
        )
    ))
    
    # Customize the title and show the plot
    start_rank = i + 1
    end_rank = i + len(states_chunk)
    state_names_str = ", ".join(states_chunk)
    
    # Updated title to reflect all years
    fig.update_layout(
        title_text=f"<b>Total Crime Flow (All Years) for States (Rank {start_rank}-{end_rank})</b><br><sup>{state_names_str}</sup>",
        font=dict(size=16)
    )
    
    fig.show()



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

