# Import Libraries 

In [3]:
import luigi  # Luigi for building pipelines
import json   # JSON for handling JSON data
import pandas as pd   # Pandas for data manipulation
from pymongo import MongoClient  # PyMongo for MongoDB interaction
from sqlalchemy import create_engine  # SQLAlchemy for SQL database interaction
import logging  # Logging for capturing runtime information
import os  # OS for interacting with the operating system
import plotly.express as px # Plotly is for visualization
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression # for linear regression model
import numpy as np

In [4]:
def load_db_config():
    with open('db_config.json', 'r') as file:
        config = json.load(file)
    print("Loaded configuration:", config)
    return config

db_config = load_db_config()


Loaded configuration: {'mongodb': {'username': 'dap', 'password': 'dap', 'host': 'localhost', 'port': 27017, 'database': 'Group_O'}, 'postgresql': {'username': 'postgres', 'password': 'root', 'host': 'localhost', 'port': 5432, 'database': 'Group_O'}}


In [5]:
# Configure logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Luigi ETL Pipeline

## Prepare Data

In [6]:
class PrepareData(luigi.Task):
    filepath = luigi.Parameter()  # Define a parameter to store the file path

    def output(self):
        # Define the output target where the prepared data will be saved
        return luigi.LocalTarget(self.filepath.replace('.json', '_prepared.json'))

    def run(self):
        logging.info(f"Starting to prepare data from {self.filepath}")
        # Check if the input file exists
        if not os.path.exists(self.filepath):
            logging.error(f"Input file not found: {self.filepath}")
            raise FileNotFoundError(f"{self.filepath} not found.")

        # Load and prepare data from the input JSON file
        data = self.load_and_prepare_data(self.filepath)

        # Write the prepared data to the output target
        with self.output().open('w') as f:
            json.dump(data, f)
            
        # Log information about the completion of data preparation
        logging.info(f"Prepared data and saved to {self.output().path}")
        logging.info(f"Number of records prepared: {len(data)}")

    @staticmethod
    def load_and_prepare_data(filepath):
        # Load JSON data from the given file path
        with open(filepath, 'r') as file:
            data = json.load(file)
        # Extract metadata and column names from the JSON data
        meta_data = data['meta']['view']
        columns = [col['name'] for col in meta_data['columns']]
        # Extract records and create a list of dictionaries with column names as keys
        records = data['data']
        prepared_data = [dict(zip(columns, record)) for record in records]
        return prepared_data

## Load data into MongoDB

In [7]:
class LoadDataToMongoDB(luigi.Task):
    filepath = luigi.Parameter()  # Define a parameter to store the file path

    def requires(self):
        # Specify the task dependency: requires PrepareData task to be completed first
        return PrepareData(filepath=self.filepath)

    def output(self):
        # Define the output target, which is a log file indicating success
        return luigi.LocalTarget('mongodb_store_success.log')

    def run(self):
        # Load the prepared data from the output of the PrepareData task
        with self.input().open('r') as f:
            data = json.load(f)
            
        # Connect to MongoDB
        config = db_config['mongodb']
        client = MongoClient(f"mongodb://{config['username']}:{config['password']}@localhost:{config['port']}/")
        db = client["Group_O"]
        collection = db["drugs-overdose_23113561"]
        
        # Clear the collection before inserting new data
        collection.delete_many({})
        # Insert the prepared data into the MongoDB collection
        result = collection.insert_many(data)
        
        # Write a success message to the output log file
        with self.output().open('w') as f:
            f.write(f'Data successfully inserted into MongoDB. Documents inserted: {len(result.inserted_ids)}\n')
            
        # Log information about the successful data insertion
        logging.info(f"Successfully loaded {len(result.inserted_ids)} documents into MongoDB.")

## Extract data from MongoDB and Converted it into CSV

In [8]:
class ExtractDataFromMongoDB(luigi.Task):
    def requires(self):
        # Specify the task dependency: requires LoadDataToMongoDB task to be completed first
        return LoadDataToMongoDB(filepath='drug_overdose.json')

    def output(self):
        # Define the output target, which is a CSV file containing the extracted data
        return luigi.LocalTarget('extracted_data.csv')

    def run(self):
        logging.info("Extracting data from MongoDB")

        # Connect to MongoDB
        config = db_config['mongodb']
        client = MongoClient(f"mongodb://{config['username']}:{config['password']}@localhost:{config['port']}/")
        db = client["Group_O"]
        collection = db["drugs-overdose_23113561"]
        
        # Query all documents from the collection
        cursor = collection.find()
        
        # Convert MongoDB cursor to DataFrame
        df = pd.DataFrame(list(cursor))
        
        # Write DataFrame to CSV file
        df.to_csv(self.output().path, index=False)

        # Log information about the completion of data extraction
        logging.info("Data extraction completed and written to CSV.")

## Transformation

In [9]:
class TransformData(luigi.Task):
    def requires(self):
        # Specify the task dependency: requires ExtractDataFromMongoDB task to be completed first
        return ExtractDataFromMongoDB()

    def output(self):
        # Define the output target, which is a CSV file containing the transformed data
        return luigi.LocalTarget('transformed_data.csv')

    def run(self):
        logging.info("Starting data transformation")

        # Load the extracted data from the input CSV file
        df = pd.read_csv(self.input().path)

        # Drop unnecessary columns
        df.drop(columns=[col for col in ['_id', 'id', 'created_meta', 'updated_meta', 'meta', 'created_at', 'updated_at', 'FLAG', 'position', 'sid'] if col in df.columns], axis=1, inplace=True)
        
        # Convert column names to lowercase
        df.columns = [col.lower() for col in df.columns]
        
        # Convert specified columns to numeric data type
        numeric_cols = ['panel_num', 'unit_num', 'stub_name_num', 'stub_label_num', 'year_num', 'age_num', 'estimate']
        for col in numeric_cols:
            if col in df.columns:
                df[col] = pd.to_numeric(df[col], errors='coerce')
        # Drop duplicate rows
        df = df.drop_duplicates()

        # Identify missing values and fill them using the mean by age group
        missing_cols = df.columns[df.isnull().any()].tolist()
        df = self.fill_missing_values_by_age(df, missing_cols, method='mean')

        # Write the transformed DataFrame to the output CSV file
        df.to_csv(self.output().path, index=False)
        
        # Log information about the completion of data transformation
        logging.info("Data transformation completed successfully.")

    @staticmethod
    def fill_missing_values_by_age(df, columns_to_fill, method='mean'):
        # Fill missing values in specified columns by the mean or median value within each age group
        for column in columns_to_fill:
            if method == 'mean':
                df[column] = df.groupby('age')[column].transform(lambda x: x.fillna(x.mean()))
            elif method == 'median':
                df[column] = df.groupby('age')[column].transform(lambda x: x.fillna(x.median()))
            else:
                raise ValueError("Method must be 'mean' or 'median'.")
        return df

## Loading Transformed data into Postgre SQL database

In [10]:
class InsertDataToPostgreSQL(luigi.Task):
    def requires(self):
        return TransformData()

    def output(self):
        return luigi.LocalTarget('data_insertion_to_postgres.log')

    def run(self):
        # Load the transformed data from the input CSV file
        df = pd.read_csv(self.input().path)

        # Load database configuration
        config = db_config['postgresql']
        # Specify the table name directly
        table_name = 'drugs-overdose_23113561'

        # Create the database engine
        engine = create_engine(f'postgresql://{config["username"]}:{config["password"]}@{config["host"]}:{config["port"]}/{config["database"]}')

        try:
            # Insert data into the table
            df.to_sql(table_name, engine, if_exists='append', index=False)
            engine.dispose()  # It's a good practice to dispose of the connection after use

            # Log the successful insertion
            with self.output().open('w') as f:
                f.write(f"DataFrame successfully inserted into PostgreSQL database. Rows inserted: {len(df)}\n")
            logging.info(f"DataFrame successfully inserted into PostgreSQL database. Rows inserted: {len(df)}")

        except Exception as e:
            # Log any errors that occur
            logging.error(f"An error occurred while inserting data into PostgreSQL: {e}")
            raise


## Luigi main Function

In [11]:
# Main execution setup
if __name__ == '__main__':
    luigi.build([InsertDataToPostgreSQL()], local_scheduler=False)

2024-05-01 19:41:09,443 - INFO - logging configured by default settings
DEBUG: Checking if InsertDataToPostgreSQL() is complete
2024-05-01 19:41:09,450 - DEBUG - Checking if InsertDataToPostgreSQL() is complete
DEBUG: Checking if TransformData() is complete
2024-05-01 19:41:09,454 - DEBUG - Checking if TransformData() is complete
INFO: Informed scheduler that task   InsertDataToPostgreSQL__99914b932b   has status   PENDING
2024-05-01 19:41:09,469 - INFO - Informed scheduler that task   InsertDataToPostgreSQL__99914b932b   has status   PENDING
DEBUG: Checking if ExtractDataFromMongoDB() is complete
2024-05-01 19:41:09,473 - DEBUG - Checking if ExtractDataFromMongoDB() is complete
INFO: Informed scheduler that task   TransformData__99914b932b   has status   PENDING
2024-05-01 19:41:09,483 - INFO - Informed scheduler that task   TransformData__99914b932b   has status   PENDING
DEBUG: Checking if LoadDataToMongoDB(filepath=drug_overdose.json) is complete
2024-05-01 19:41:09,485 - DEBUG - C

# Extracting data from PostgreSQL

In [12]:
def extract_data_from_postgres(table):
    """Connect to PostgreSQL and return a DataFrame."""
    # Load database configuration
    config = db_config['postgresql']
    # Create the database engine
    engine = create_engine(f'postgresql://{config["username"]}:{config["password"]}@{config["host"]}:{config["port"]}/{config["database"]}')
    query = f'SELECT * FROM public."{table}"'  
    df = pd.read_sql(query, engine)
    engine.dispose()
    return df

# Extracting PostgreSQL dataset using above function
data = extract_data_from_postgres(table="drugs-overdose_23113561")


## EDA

In [13]:
data.head()

Unnamed: 0,indicator,panel,panel_num,unit,unit_num,stub_name,stub_name_num,stub_label,stub_label_num,year,year_num,age,age_num,estimate
0,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,1999,1,All ages,1.1,6.1
1,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2000,2,All ages,1.1,6.2
2,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2001,3,All ages,1.1,6.8
3,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2002,4,All ages,1.1,8.2
4,Drug overdose death rates,All drug overdose deaths,0,"Deaths per 100,000 resident population, age-ad...",1,Total,0,All persons,0.1,2003,5,All ages,1.1,8.9


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6228 entries, 0 to 6227
Data columns (total 14 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   indicator       6228 non-null   object 
 1   panel           6228 non-null   object 
 2   panel_num       6228 non-null   int64  
 3   unit            6228 non-null   object 
 4   unit_num        6228 non-null   int64  
 5   stub_name       6228 non-null   object 
 6   stub_name_num   6228 non-null   int64  
 7   stub_label      6228 non-null   object 
 8   stub_label_num  6228 non-null   float64
 9   year            6228 non-null   int64  
 10  year_num        6228 non-null   int64  
 11  age             6228 non-null   object 
 12  age_num         6228 non-null   float64
 13  estimate        6228 non-null   float64
dtypes: float64(3), int64(5), object(6)
memory usage: 681.3+ KB


In [15]:
data.describe()

Unnamed: 0,panel_num,unit_num,stub_name_num,stub_label_num,year,year_num,age_num,estimate
count,6228.0,6228.0,6228.0,6228.0,6228.0,6228.0,6228.0,6228.0
mean,2.5,1.578035,3.028902,3.383006,2008.66474,10.66474,1.354913,4.360653
std,1.707962,0.493913,1.447036,1.526819,5.849512,5.849512,0.301459,5.92097
min,0.0,1.0,0.0,0.1,1999.0,1.0,1.1,0.0
25%,1.0,1.0,2.0,2.1,2004.0,6.0,1.1,0.9
50%,2.5,2.0,3.0,3.22,2009.0,11.0,1.2,2.3
75%,4.0,2.0,4.0,4.6,2014.0,16.0,1.6,4.8
max,5.0,2.0,5.0,5.93,2018.0,20.0,1.91,54.3


### Dataset Description

The dataset contains the following columns:

- **INDICATOR:** Describes the type of data, which in this case is drug overdose death rates.
- **PANEL:** Details the specific type of drug overdose deaths.
- **PANEL_NUM:** Numerical code for the panel.
- **UNIT:** The unit of measurement, here it's deaths per 100,000 resident population, age-adjusted.
- **UNIT_NUM:** Numerical code for the unit.
- **STUB_NAME:** Grouping category (like total or specific demographics).
- **STUB_NAME_NUM:** Numerical code for the stub name.
- **STUB_LABEL:** Label for the stub.
- **STUB_LABEL_NUM:** Numerical code for the stub label.
- **YEAR:** Year of the data.
- **YEAR_NUM:** Numerical code for the year.
- **AGE:** Age category for the data.
- **AGE_NUM:** Numerical code for the age.
- **ESTIMATE:** The aetual value of th# death rate.

###cs and Missing Data

**Summary Statistics Highlights:**

- The dataset has data for 6228 entries.
- The `INDICATOR` column is constant and indicates the data is about drug overdose death rates.
- The `PANEL` column contains 6 unique types, one of which includes all drug overdose deaths.
- There are different `UNIT` types, indicating that some data are age-adjusted rates while others are crude rates.
- `STUB_NAME` and `STUB_LABEL` columns include categorizations like demographics and other attributes.
- The dataset spans from 1999 to 2018.
- There are multiple age groups categorized.
egorized.the death rate.

# Visualization 

## Trend in Overall Drug Overdose Death Rates (1999-2018)

In [16]:
# Filter data for overall drug overdose deaths and not subdivided by specific drugs or demographics
overall_deaths = data[(data['panel'] == 'All drug overdose deaths') & (data['stub_label'] == 'All persons')]

overall_deaths = overall_deaths.groupby('year', as_index=False).agg({'estimate': 'mean'})

# Plot using Plotly, setting the height and adding a border
fig = px.line(overall_deaths, x='year', y='estimate', markers=True,
              title='Trend in Overall Drug Overdose Death Rates (1999-2018)')

# Increase the height of the graph to, say, 600 pixels
fig.update_layout(height=600,width = 600)

# Add a border to the graph: this will add a line around the plotting area
fig.update_layout(plot_bgcolor='rgba(0,0,0,0)', # making the background transparent
                  paper_bgcolor='rgba(0,0,0,0)', # making the surrounding paper transparent
                  xaxis=dict(showline=True, linecolor='black'), # adding x-axis border
                  yaxis=dict(showline=True, linecolor='black'), # adding y-axis border
                  xaxis_title='Year',
                  yaxis_title='Death Rate per 100,000 Population',
                  showlegend=False)

# To add a border around the whole figure, we use shapes to draw a rectangle
fig.add_shape(
        # Rectangle outline
        type="rect",
        xref="paper",
        yref="paper",
        x0=0,
        y0=0,
        x1=1,
        y1=1,
        line=dict(
            color="Black",
            width=2,
        ),
    )

fig.show()


The line plot above illustrates the trend in overall drug overdose death rates in the United States from 1999 to 2018. Here are some key observations:

- Increasing Trend: There is a clear upward trend in drug overdose death rates over this period, indicating a worsening drug overdose crisis.
- Significant Increase Post-2010: The rate accelerates significantly after 2010, suggesting that the problem has become more severe in the latter part of the dataset.

## Trend in Drug Overdose Death Rates by Drug Type (1999-2018)

In [17]:
specific_drug_deaths = data.groupby(['year', 'panel'], as_index=False).agg({'estimate': 'mean'})

# Plot the data with Plotly
fig = px.line(specific_drug_deaths, x='year', y='estimate', color='panel', markers=True,
              title='Trend in Drug Overdose Death Rates by Drug Type (1999-2018)',
              labels={'estimate': 'Death Rate per 100,000 Population', 'panel': 'Drug Type'})

# Update the layout to make it square and adjust the title and legend to prevent overlapping
fig.update_layout(
    width=700,  # Adjust the width if necessary
    height=700,  # Adjust the height to match the width for a square aspect ratio
    title=dict(
        text='Trend in Drug Overdose Death Rates by Drug Type (1999-2018)',
        y=0.95,  # Adjust the title's vertical position
        x=0.5,
        xanchor='center',
        yanchor='top',
        font=dict(size=18)  # Adjust the font size as necessary
    ),
    legend=dict(
        font=dict(size=7),  # Decrease the legend font size
        yanchor="top",
        y=0.95,  # Adjust the vertical position of the legend
        xanchor="right",
        x=0.65   # Adjust the horizontal position of the legend
    ),
    margin=dict(t=80, b=80, l=80, r=80),  # Adjust the margins as necessary
)

# Add a border around the whole figure
fig.add_shape(
    # Rectangle outline
    type="rect",
    xref="paper",
    yref="paper",
    x0=0,
    y0=0,
    x1=1,
    y1=1,
    line=dict(
        color="Black",
        width=2,
    ),
)

# Show the figure
fig.show()

The plot shows the evolution of drug overdose death rates by drug type over two decades, revealing several important trends:

- Opioids (not specified) and Synthetic opioids, excluding methadone: These categories show a stark increase in death rates, particularly post-2010. Synthetic opioids demonstrate the most dramatic rise, underscoring their significant impact on the overdose crisis.
- Heroin and Natural and semi-synthetic opioids: Both show increasing trends, with heroin peaking around 2016 before a slight decline, while natural and semi-synthetic opioids have a steadier increase.
- Methadone: Interestingly, methadone-related deaths peak around 2007 and then show a gradual decline, possibly reflecting improved prescribing practices and monitoring.
- Cocaine and Psychostimulants with abuse potential: These categories show increases later in the timeline, with psychostimulants showing a sharp rise towards the end of the period.

These trends suggest targeted public health interventions might be needed, especially concerning synthetic opioids and psychostimulants.

## Trend in Drug Overdose Death Rates by Age Group (1999-2018)

In [18]:
age_group_deaths = data[(data['age'] != 'All ages') & (data['panel'] == 'All drug overdose deaths')]

# Aggregate data by year and age group to get the mean estimate
age_group_deaths = age_group_deaths.groupby(['year', 'age'], as_index=False).agg({'estimate': 'mean'})

# Plot the trend over the years for each age group using Plotly
fig = px.line(age_group_deaths, x='year', y='estimate', color='age', markers=True,
              title='Trend in Drug Overdose Death Rates by Age Group (1999-2018)',
              labels={'estimate':'Death Rate per 100,000 Population', 'age':'Age Group'})

# Update the layout to be square and modify legend location
fig.update_layout(width=600, height=600, # Adjust dimensions to be square
                  legend_title_text='Age Group',
                  legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1))

# Update the layout to increase the height, modify legend location, and adjust the title
fig.update_layout(
    height=600,
    legend_title_text='Age Group',
    legend=dict(orientation="h", yanchor="bottom", y=1, xanchor="right", x=1),
    title=dict(
        text='Trend in Drug Overdose Death Rates by Age Group (1999-2018)',
        y=0.95,  # This adjusts the title's vertical position
        x=0.5,
        xanchor='center',
        yanchor='top',
        font=dict(
            size=16  # Reduce font size to fit the title in the allocated space
        )
    ),
    margin=dict(t=130)  # Increase top margin to make space for the title
)

# To add a border around the whole figure, we use shapes to draw a rectangle
fig.add_shape(
        # Rectangle outline
        type="rect",
        xref="paper",
        yref="paper",
        x0=0,
        y0=0,
        x1=1,
        y1=1,
        line=dict(
            color="Black",
            width=2,
        ),
    )

fig.show()

The line plot above shows how different age groups have been impacted by drug overdose death rates over the period from 1999 to 2018. Here are some observations:

- Older Age Groups (e.g., 55-64 years, 45-54 years): These groups exhibit the highest death rates, particularly in the later years of the dataset. The rates have increased significantly over the period, highlighting a growing problem in these demographics.
- Younger Age Groups (e.g., 15-24 years, 25-34 years): While not as high as the older age groups, there is a noticeable increase in death rates, especially post-2010. The 25-34 year age group, in particular, shows a sharp rise towards the end of the period.
- Middle Age Group (35-44 years): This group also shows a significant rise over time, particularly from the mid-2000s onward.

These trends suggest that while older adults currently bear the brunt of overdose fatalities, younger and middle-aged adults have seen substantial increases in death rates, indicating a broad and multi-generational impact of the drug overdose crisis.

## Proportion of Drug Overdose Death Rates by Category in 2018

In [19]:
# Filter data for the most recent year, 2018
data_2018 = data[data['year'] == 2018]

# Group data by panel and sum the estimates for 2018
panel_data_2018 = data_2018.groupby('panel')['estimate'].sum().reset_index()

# Creating a pie chart using Plotly
fig = px.pie(panel_data_2018, values='estimate', names='panel', title='Proportion of Drug Overdose Death Rates by Category in 2018')

# Show the figure
fig.show()

The pie chart for the year 2018 is displayed above, showing the proportions of drug overdose death rates by different drug categories. Each slice of the pie represents a different category, illustrating how each contributes to the overall drug overdose deaths for that year.

## Heatmap of Drug Overdose Death Rates by Age Group and Year

In [20]:
# Filter out the 'All ages' group for a more detailed age-specific analysis
age_specific_data = data[data['age'] != 'All ages']

# Create a pivot table for the heatmap
heatmap_data = age_specific_data.pivot_table(values='estimate', index='year', columns='age', aggfunc='mean')

# Create the heatmap with Plotly
fig = go.Figure(data=go.Heatmap(
                   z=heatmap_data.values,
                   x=heatmap_data.columns,
                   y=heatmap_data.index,
                   colorscale='RdBu'))

# Update the layout
fig.update_layout(
    title='Heatmap of Drug Overdose Death Rates by Age Group and Year',
    xaxis_nticks=36,
    xaxis_title='Age Group',
    yaxis_title='Year',
    yaxis=dict(tickmode='array', tickvals=heatmap_data.index),
    width=800,
    height=600
)

# Show the figure
fig.show()


The heatmap above visualizes drug overdose death rates by age group across different years. Warmer colors indicate higher overdose rates, while cooler colors represent lower rates. This visualization allows us to quickly grasp patterns and changes over time:

- Higher overdose rates are evident in certain middle-aged groups, especially in later years, as indicated by the warmer colors.
- There is a general trend of increasing overdose rates across multiple age groups over time.

## Average Drug Overdose Death Rates by Age Group in 2018

In [21]:
# Filter data for the most recent year, 2018, and group by age to find mean estimates
age_data_2018 = age_specific_data[age_specific_data['year'] == 2018]
age_average_2018 = age_data_2018.groupby('age')['estimate'].mean().sort_values(ascending=False).reset_index()

# Plotting the bar chart with Plotly
fig = px.bar(age_average_2018, x='age', y='estimate', color='estimate',
             title='Average Drug Overdose Death Rates by Age Group in 2018',
             labels={'estimate': 'Deaths per 100,000 People', 'age': 'Age Group'})

# Customize the layout
fig.update_layout(
    xaxis_title='Age Group',
    yaxis_title='Deaths per 100,000 People',
    xaxis={'categoryorder':'total descending'},
    yaxis=dict(gridcolor='lightgrey', gridwidth=0.7),
    showlegend=False
)

# To add a border around the whole figure, we use shapes to draw a rectangle
fig.add_shape(
        # Rectangle outline
        type="rect",
        xref="paper",
        yref="paper",
        x0=0,
        y0=0,
        x1=1,
        y1=1,
        line=dict(
            color="Black",
            width=2,
        ),
    )

# Show the figure
fig.show()

The bar chart above shows the average drug overdose death rates by age group for the year 2018. This visualization helps us to quickly identify which age groups are most affected:

- The age groups of 35-44 years and 45-54 years have the highest rates, indicating a significant impact on these demographics.
- Younger groups (under 15 years) and the oldest group (85 years and over) have much lower rates.

## Stacked Bar Chart of Drug Overdose Death Rates by Drug Category Over the Years

In [22]:
# Group data by year and panel, then compute the sum of estimates for each group
stacked_data = data.groupby(['year', 'panel'])['estimate'].sum().unstack('panel').fillna(0)

# Create traces for each drug category, which will be stacked by year
traces = [go.Bar(name=category, x=stacked_data.index, y=stacked_data[category]) for category in stacked_data.columns]

# Create the figure with all traces
fig = go.Figure(data=traces)

# Make the bars stacked
fig.update_layout(barmode='stack')

# Customize the layout
fig.update_layout(
    title='Stacked Bar Chart of Drug Overdose Death Rates by Drug Category Over the Years',
    xaxis=dict(title='Year', tickangle=-45),
    yaxis=dict(title='Deaths per 100,000 People'),
    legend_title_text='Drug Category',
    legend=dict(x=1.05, y=1, bgcolor='rgba(255, 255, 255, 0.5)', bordercolor='rgba(0, 0, 0, 0.1)')
)

# Show the figure
fig.show()

The stacked bar chart above displays the drug overdose death rates by drug category over the years. Each color in a bar represents a different drug category, illustrating how each category contributes to the overall death rates annually.

- The growing impact of specific drug categories over time.
- How the composition of drug overdose deaths has changed from year to year, showing which drugs have become more prevalent in contributing to overdose fatalities.

## Top 10 Ethnicity Groups by Drug Overdose Death Rates

In [23]:
# Filtering data to exclude general categories and focus on specific demographic groups
specific_groups = data[~data['stub_label'].isin(['All persons', 'Male', 'Female', 'All ages'])]

# Identifying ethnicity groups by excluding those with age references
ethnicity_groups = specific_groups[~specific_groups['stub_label'].str.contains('years')]

# Calculating the average drug overdose death rates for these ethnic groups
average_rates_ethnic_groups = ethnicity_groups.groupby('stub_label')['estimate'].mean().sort_values(ascending=False)

# Selecting the top 10 ethnicity groups with the highest average drug overdose death rates
top_10_ethnic_groups = average_rates_ethnic_groups.head(10).reset_index()

# Creating a donut chart for the top 10 ethnicity groups
fig = px.pie(top_10_ethnic_groups, values='estimate', names='stub_label', title='Top 10 Ethnicity Groups by Drug Overdose Death Rates', 
             hole=0.4, color_discrete_sequence=px.colors.sequential.Viridis_r)

# Update layout to increase the figure size
fig.update_layout(
    width=1000,  # Set the width of the figure
    height=800,  # Set the height of the figure
    showlegend=True,
    annotations=[dict(text='', showarrow=False)]
)

# Show the figure
fig.show()


- Disparate Impact: Certain ethnicity groups experience significantly higher overdose death rates compared to others. This disparity suggests that targeted interventions and preventive measures might be necessary to address specific vulnerabilities and risk factors prevalent within these groups.

- Specific High-Risk Groups: The data highlights that groups such as "Male: Not Hispanic or Latino: American Indian or Alaska Native" and "Male: Not Hispanic or Latino: White" tend to have higher average death rates. This indicates that demographic and cultural factors may influence the risk of drug overdose, which can guide more culturally and demographically tailored public health responses.

## Drug Overdose Death Rates by Drug Category for Top 10 Ethnicity Groups

In [24]:
# First, make sure it is sorted and has the correct indices
average_rates_ethnic_groups = ethnicity_groups.groupby('stub_label')['estimate'].mean().sort_values(ascending=False)

# Selecting the top 10 and ensuring it keeps the group names
top_10_ethnic_groups = average_rates_ethnic_groups.head(10)

# Filter the data to only include these top 10 groups
top_10_ethnicity_data = ethnicity_groups[ethnicity_groups['stub_label'].isin(top_10_ethnic_groups.index)]

# Create a figure
fig = go.Figure()

# Add a bar for each drug category
for column in ethnicity_drug_data.columns:
    fig.add_trace(go.Bar(
        name=column,
        y=ethnicity_drug_data.index,
        x=ethnicity_drug_data[column],
        orientation='h'  # This makes the bar horizontal
    ))

# Update layout for stacked bar chart
fig.update_layout(
    barmode='stack',
    title='Drug Overdose Death Rates by Drug Category for Top 10 Ethnicity Groups',
    xaxis_title='Deaths per 100,000 People',
    yaxis_title='Ethnicity Groups',
    legend_title='Drug Category',
    width=1400,  # Adjust width as needed
    height=800,  # Adjust height as needed
    legend=dict(x=1.05, y=1, bgcolor='rgba(255, 255, 255, 0.5)', bordercolor='rgba(0, 0, 0, 0.1)')
)

# Show the figure
fig.show()


NameError: name 'ethnicity_drug_data' is not defined

- Certain drugs significantly impact specific ethnicity groups, showing distinct patterns in drug use and susceptibility within these groups.
- This differentiation by drug category can inform targeted interventions that are specifically tailored to address the predominant drugs affecting these high-risk groups.

## Summary of Visualization

### Age

- **Trends:** There are stark differences in overdose death rates among various age groups, with middle-aged adults (particularly those between 35-44 and 45-54 years) experiencing the highest rates. These trends emphasize the need for age-specific prevention and treatment strategies.
- **Youth and Elderly:** Lower rates were observed in the youngest (under 15 years) and oldest age groups (over 85 years), suggesting that interventions in these groups, while still necessary, might be tailored differently compared to interventions for middle-aged populations.

### Ethnic Groups

- **Disparities:** The analysis highlighted significant disparities in overdose death rates among different ethnic groups. Non-Hispanic whites and certain Native American groups have shown particularly high overdose rates, pointing towards socioeconomic, cultural, and access-related factors that may influence these trends.
- **Targeted Interventions:** Understanding the cultural contexts and healthcare needs of these high-risk groups can guide more effective public health policies and community-specific interventions.

### Drugs

- **Opioids:** Opioids remain the most significant contributor to drug overdose deaths, affecting a wide range of demographics. This epidemic requires focused attention on both prescription and illicit opioid use.
- **Broad Impact:** While opioids are the major drivers, the analysis also brought attention to other significant drug categories. Tailored strategies to combat the use of multiple substances are necessary to address the complex nature of drug misuse.
 misuse.

# Linear Regression 

In [None]:
# Identifying the top categories by average overdose rates over all years
top_drug_categories = data.groupby('panel')['estimate'].mean().nlargest(5).index.tolist()
# Filtering data for the selected top categories
top_drug_data = data[data['panel'].isin(top_drug_categories)]
# Selecting data for one top category
category_data = top_drug_data[top_drug_data['panel'] == top_drug_categories[0]]

# Preparing data for regression model
X = category_data['year'].values.reshape(-1, 1)  # Predictor
y = category_data['estimate'].values  # Response

# Creating and fitting the linear regression model
model = LinearRegression()
model.fit(X, y)

# Predicting future values for the next 5 years
future_years = np.array([year for year in range(2019, 2026)]).reshape(-1, 1)
predictions = model.predict(future_years)


In [None]:
# Creating a DataFrame for the predicted years and their average estimations
prediction_data = pd.DataFrame({
    "Year": future_years.flatten(),
    "Avg Estimation": predictions
})

prediction_data

Unnamed: 0,Year,Avg Estimation
0,2019,17.521484
1,2020,18.15887
2,2021,18.796256
3,2022,19.433642
4,2023,20.071028
5,2024,20.708414
6,2025,21.3458


In [None]:
# Calculate R^2 score
r2_score = model.score(X, y)
print(f"R^2 score for the model: {r2_score:.2f}")


R^2 score for the model: 0.17
