In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv("combined_df_Prevalence_2016_2022.csv")
df

Unnamed: 0,Year,FIPSST,SC_SEX,SC_AGE_YEARS,K2Q31A,K2Q31B,K2Q31C,SC_RACE_R
0,2016,18,1.0,2.0,2.0,,,1.0
1,2016,17,1.0,9.0,1.0,1.0,1.0,1.0
2,2016,50,1.0,11.0,2.0,,,1.0
3,2016,51,2.0,10.0,2.0,,,1.0
4,2016,53,2.0,15.0,2.0,,,1.0
...,...,...,...,...,...,...,...,...
279541,2022,36,1.0,13.0,2.0,,,7.0
279542,2022,6,2.0,9.0,2.0,,,1.0
279543,2022,16,2.0,2.0,2.0,,,1.0
279544,2022,29,1.0,6.0,2.0,,,1.0


In [3]:
# Assuming your dataframe is named df

# Define a dictionary with the old column names as keys and the new names as values
new_column_names = {
    "SC_SEX": "Sex",
    "SC_AGE_YEARS": "Age",
    "K2Q31A": "ADHD_Diagnosis",
    "K2Q31B": "ADHD_Current",
    "K2Q31C": "ADHD_Severity",
    "SC_RACE_R": "Race"
}

# Rename the columns using the dictionary
df.rename(columns=new_column_names, inplace=True)

# Now your dataframe has the new column names
df


Unnamed: 0,Year,FIPSST,Sex,Age,ADHD_Diagnosis,ADHD_Current,ADHD_Severity,Race
0,2016,18,1.0,2.0,2.0,,,1.0
1,2016,17,1.0,9.0,1.0,1.0,1.0,1.0
2,2016,50,1.0,11.0,2.0,,,1.0
3,2016,51,2.0,10.0,2.0,,,1.0
4,2016,53,2.0,15.0,2.0,,,1.0
...,...,...,...,...,...,...,...,...
279541,2022,36,1.0,13.0,2.0,,,7.0
279542,2022,6,2.0,9.0,2.0,,,1.0
279543,2022,16,2.0,2.0,2.0,,,1.0
279544,2022,29,1.0,6.0,2.0,,,1.0


In [4]:

# Define the mapping for each column
sex_mapping = {1.0: 'Male', 2.0: 'Female'}
ADHD_Diagnosis_mapping = {1.0: 'Yes', 2.0: 'No'}
ADHD_Current_mapping = {1.0:'Yes',2.0:'No'}
ADHD_Severity_mapping ={1.0: 'Mild', 2.0: 'Moderate', 3.0: 'Severe'}
race_mapping = {
    1.0: 'White',
    2.0: 'Black/African American',
    3.0: 'American Indian/Alaska Native',
    4.0: 'Asian',
    5.0: 'Native Hawaiian/Other Pacific Islander alone',
    7.0: 'Two or More Races'
}


# Apply the mapping to the corresponding columns
df['Sex'] = df['Sex'].map(sex_mapping)
df['ADHD_Diagnosis'] = df['ADHD_Diagnosis'].map(ADHD_Diagnosis_mapping)
df['ADHD_Current'] = df['ADHD_Current'].map(ADHD_Current_mapping)
df['ADHD_Severity'] = df['ADHD_Severity'].map(ADHD_Severity_mapping)
df['Race'] = df['Race'].map(race_mapping)

# Now your dataframe has the values mapped to their textual representations
df


Unnamed: 0,Year,FIPSST,Sex,Age,ADHD_Diagnosis,ADHD_Current,ADHD_Severity,Race
0,2016,18,Male,2.0,No,,,White
1,2016,17,Male,9.0,Yes,Yes,Mild,White
2,2016,50,Male,11.0,No,,,White
3,2016,51,Female,10.0,No,,,White
4,2016,53,Female,15.0,No,,,White
...,...,...,...,...,...,...,...,...
279541,2022,36,Male,13.0,No,,,Two or More Races
279542,2022,6,Female,9.0,No,,,White
279543,2022,16,Female,2.0,No,,,White
279544,2022,29,Male,6.0,No,,,White


In [5]:
# Check for NaN values and print the count for each column
nan_counts = df.isna().sum()
print(nan_counts)

Year                   0
FIPSST                 0
Sex                    0
Age                    0
ADHD_Diagnosis      1881
ADHD_Current      252628
ADHD_Severity     254933
Race                2662
dtype: int64


In [6]:
# Remove rows with NaN values only in 'K2Q31A' columns
df_cleaned = df.dropna(subset=['ADHD_Diagnosis'])
df_cleaned.shape

(277665, 8)

In [7]:

# Calculate the distribution of 'K2Q31A' for each sex and year
sex_distribution_df = (
    df_cleaned.groupby(['Year', 'Sex'])['ADHD_Diagnosis']
    .value_counts(normalize=True)  # Get the relative frequencies within each group
    .rename('percentage')  # Rename the series for clarity
    .reset_index()  # Flatten the multi-index into a DataFrame
    .assign(percentage=lambda x: x['percentage'] * 100)  # Convert the relative frequencies to percentages
)

# Optionally, you can sort the DataFrame for better readability
sex_distribution_df.sort_values(by=['Year', 'Sex', 'ADHD_Diagnosis'], inplace=True)

sex_distribution_df = sex_distribution_df[sex_distribution_df['ADHD_Diagnosis']== 'Yes' ]
sex_distribution_df


Unnamed: 0,Year,Sex,ADHD_Diagnosis,percentage
1,2016,Female,Yes,6.099518
3,2016,Male,Yes,12.767875
5,2017,Female,Yes,6.413075
7,2017,Male,Yes,12.656847
9,2018,Female,Yes,6.481224
11,2018,Male,Yes,12.971743
13,2019,Female,Yes,6.827424
15,2019,Male,Yes,13.736949
17,2020,Female,Yes,6.538013
19,2020,Male,Yes,13.448699


In [8]:
# Save to a CSV file
sex_distribution_df.to_csv('sex_distribution_df.csv', index=True)


In [9]:
# Calculate the distribution of 'K2Q31A' for each sex and year
race_distribution_df = (
    df_cleaned.groupby(['Year', 'Race'])['ADHD_Diagnosis']
    .value_counts(normalize=True)  # Get the relative frequencies within each group
    .rename('percentage')  # Rename the series for clarity
    .reset_index()  # Flatten the multi-index into a DataFrame
    .assign(percentage=lambda x: x['percentage'] * 100)  # Convert the relative frequencies to percentages
)

# Optionally, you can sort the DataFrame for better readability
race_distribution_df.sort_values(by=['Year', 'Race', 'ADHD_Diagnosis'], inplace=True)

race_distribution_df = race_distribution_df[race_distribution_df['ADHD_Diagnosis']== 'Yes' ]
race_distribution_df


Unnamed: 0,Year,Race,ADHD_Diagnosis,percentage
1,2016,American Indian/Alaska Native,Yes,10.991957
3,2016,Asian,Yes,3.283473
5,2016,Black/African American,Yes,11.608624
7,2016,Native Hawaiian/Other Pacific Islander alone,Yes,8.219178
9,2016,Two or More Races,Yes,9.886143
11,2016,White,Yes,9.847076
13,2017,American Indian/Alaska Native,Yes,13.071895
15,2017,Asian,Yes,3.054807
17,2017,Black/African American,Yes,12.237762
19,2017,Native Hawaiian/Other Pacific Islander alone,Yes,8.064516


In [10]:
# Save to a CSV file
race_distribution_df.to_csv('race_distribution_df.csv', index=True)

In [11]:
# Define a function to categorize age into age groups
def categorize_age(age):
    if 3 <= age <= 5:
        return '3-5 years'
    elif 6 <= age <= 11:
        return '6-11 years'
    elif 12 <= age <= 17:
        return '12-17 years'
    else:
        return 'Other'  # For ages outside the specified ranges

# Apply the function to the 'SC_AGE_YEARS' column to create a new 'Age_Group' column
df_cleaned['Age_Group'] = df_cleaned['Age'].apply(categorize_age)

age_group_distribution_df = (
    df_cleaned.groupby(['Year', 'Age_Group'])['ADHD_Diagnosis']
    .value_counts(normalize=True)  # Get the relative frequencies within each group
    .rename('percentage')  # Rename the series for clarity
    .reset_index()  # Flatten the multi-index into a DataFrame
    .assign(percentage=lambda x: x['percentage'] * 100)  # Convert the relative frequencies to percentages
)

# Optionally, you can sort the DataFrame for better readability
age_group_distribution_df.sort_values(by=['Year', 'Age_Group', 'ADHD_Diagnosis'], inplace=True)

age_group_ADHD_Prevalence = age_group_distribution_df[age_group_distribution_df['ADHD_Diagnosis']== 'Yes' ]
age_group_ADHD_Prevalence


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cleaned['Age_Group'] = df_cleaned['Age'].apply(categorize_age)


Unnamed: 0,Year,Age_Group,ADHD_Diagnosis,percentage
1,2016,12-17 years,Yes,14.267634
3,2016,3-5 years,Yes,1.86195
5,2016,6-11 years,Yes,11.127547
7,2016,Other,Yes,0.217802
9,2017,12-17 years,Yes,14.108071
11,2017,3-5 years,Yes,1.688028
13,2017,6-11 years,Yes,11.606042
15,2017,Other,Yes,0.273224
17,2018,12-17 years,Yes,14.63886
19,2018,3-5 years,Yes,1.503596


In [12]:
age_group_ADHD_Prevalence = age_group_distribution_df[age_group_distribution_df['ADHD_Diagnosis']== 'Yes' ]
age_group_ADHD_Prevalence

age_group_ADHD_Prevalence.to_csv("age_group_ADHD_Prevalence.csv", index = True)

In [13]:
# Calculate the distribution of 'ADHD_Diagnosis' for each state and year
State_distribution_df = (
    df_cleaned.groupby(['Year', 'FIPSST'])['ADHD_Diagnosis']
    .value_counts(normalize=True)  # Get the relative frequencies within each group
    .rename('percentage')  # Rename the series for clarity
    .reset_index()  # Flatten the multi-index into a DataFrame
    .assign(percentage=lambda x: x['percentage'] * 100)  # Convert the relative frequencies to percentages
)

# Optionally, you can sort the DataFrame for better readability
State_distribution_df.sort_values(by=['Year', 'FIPSST', 'ADHD_Diagnosis'], inplace=True)

State_distribution_df = State_distribution_df[State_distribution_df['ADHD_Diagnosis'] == 'Yes']
State_distribution_df

Unnamed: 0,Year,FIPSST,ADHD_Diagnosis,percentage
1,2016,1,Yes,12.804878
3,2016,2,Yes,6.854345
5,2016,4,Yes,9.070295
7,2016,5,Yes,12.515188
9,2016,6,Yes,6.538049
...,...,...,...,...
705,2022,51,Yes,10.661765
707,2022,53,Yes,9.644670
709,2022,54,Yes,14.558473
711,2022,55,Yes,8.695652


In [14]:
State_distribution_df.to_csv('State_distribution_df.csv',index=True)