In [1]:
# Notebook setup: imports, display options, and data loading.
import numpy as np
import pandas as pd

import worldview
from worldview import preprocessor

# Print dataframes in full (no row/column truncation), floats with 5 decimals.
pd.options.display.max_rows = None
pd.options.display.max_columns = None
pd.options.display.float_format = lambda x: "%.5f" % x
# Numpy arrays: summarize beyond 100 elements and suppress scientific notation.
np.set_printoptions(suppress=True, threshold=100)

# Load prepped_raw_data.xlsx into a dataframe.
# The first row is the header; the second row is skipped.
data_loc = worldview.get_data_dir()
raw_workbook_path = data_loc / "prepped_raw_data.xlsx"
df_raw = pd.read_excel(raw_workbook_path, header=0, skiprows=[1])

# Load and prepare the data; `df` is what the cells below inspect.
df = preprocessor.create_prepped_data()

Number of rows with a tie for the maximum isi score: 8
Number of rows with a tie for the maximum worldview score: 34


Look at the demographics and categorical vars and do any cleaning as needed. 

NOTE: For questions where "prefer not to answer" was an option, I mapped those to null/NaN, as that is easier for the statistical analysis. 

In [2]:
# Age: respondents per age bracket (dropna=False keeps any NaN row visible)
age_group_counts = df["age_group"].value_counts(dropna=False)
display(age_group_counts)

age_group
25-34    134
35-44    117
45-54     71
55-64     54
65-74     23
75+        7
Name: count, dtype: int64

# Age groups with the oldest brackets combined: the last two form 65+, the last three form 55+

In [3]:
# Counts for the two combined-bracket age variables (NaN included).
age_65_counts = df["age_group_65"].value_counts(dropna=False)
age_55_counts = df["age_group_55"].value_counts(dropna=False)
display(age_65_counts, age_55_counts)

age_group_65
25-34     134
35-44     117
45-54      71
55-64+     54
65+        30
Name: count, dtype: int64

age_group_55
25-34    134
35-44    117
55+       84
45-54     71
Name: count, dtype: int64

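I fixed a bug in the age groups and now we don't have any missing data. We might want to combine the last 2 groups to form a 65+ group (see `age_group_65` above; `age_group_55` combines the last 3 instead), but we can decide that as we do the analysis. Note: the `55-64+` label in `age_group_65` looks like a typo for `55-64` — its count (54) matches the original 55-64 bracket.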

In [4]:
# Gender: counts per response, with NaN kept as its own row
gender_counts = df["gender"].value_counts(dropna=False)
display(gender_counts)

gender
male         208
female       189
nonbinary      8
NaN            1
Name: count, dtype: int64

We only have 8 non-binary respondents (and 1 who preferred not to answer). We decided to drop them from the analysis, but you can use these numbers to report how many responses were missing (prefer not to answer) and how many non-binary respondents were dropped because there was not enough data. 

In [5]:
# Gender, 2-level version: counts per level, NaN included
gender_2_counts = df["gender_2"].value_counts(dropna=False)
display(gender_2_counts)

gender_2
male      208
female    189
NaN         9
Name: count, dtype: int64

In [6]:
# Counts for the `transsexual` column, NaN included
transsexual_counts = df["transsexual"].value_counts(dropna=False)
display(transsexual_counts)

transsexual
NaN    247
no     157
yes      2
Name: count, dtype: int64

Only 2 people responded yes (NaN means null/no answer). Not enough to do stats on. 

In [7]:
# Ethnicity: counts per category, NaN included
ethnicity_counts = df["ethnicity"].value_counts(dropna=False)
display(ethnicity_counts)

ethnicity
White                                        266
Black or African American                     86
Asian or Asian American                       27
Hispanic or Latino                            21
Middle Eastern or North African                3
NaN                                            2
Native Hawaiian or other Pacific Islander      1
Name: count, dtype: int64

In [8]:
# Ethnicity, free-text "specify" field: counts per distinct entry, NaN included
ethnicity_specify_counts = df["ethnicity_specify"].value_counts(dropna=False)
display(ethnicity_specify_counts)

ethnicity_specify
NaN                          396
biracial/black/white           1
White-passing latinx           1
white/hispanic                 1
Black and Middle Eastern       1
Black/White                    1
Black White                    1
White, Puerto Rican            1
Australian                     1
black and white                1
Black caribbean and white      1
Name: count, dtype: int64

It looks like we could have benefited from offering a mixed race option, but the group would have been small. It also wouldn't get at different values for mixed race (black/white vs black/middle eastern). 

People were given the option to fill this out regardless of what they answered for the main ethnicity question. Just noting that because of how "other" was treated differently across the demographic questions - this required some manual effort to figure out and clean the data for. 

In [9]:
# Education: counts per level, NaN included
education_counts = df["education"].value_counts(dropna=False)
display(education_counts)

education
Graduated with Bachelors                152
1-2 years college/associate’s degree     79
Graduated with master’s degree           79
Highschool gradate or proficiency        47
Graduated with PhD                       23
Some graduate school                     16
Attended trade school/certifications      7
NaN                                       3
Name: count, dtype: int64

In [10]:
# Education, free-text "other" field: counts per distinct entry, NaN included
education_other_counts = df["education_other"].value_counts(dropna=False)
display(education_other_counts)

education_other
NaN                                                        402
Masters, two.    and the student loans for both :((((((      1
not a high school graduate                                   1
Graduated JD                                                 1
MD                                                           1
Name: count, dtype: int64

I cleaned up the main education question to make sure that those with the JD/MD were in the "Graduated with PhD" group, but in the future I recommend the option be "Graduated with doctorate degree" to reflect that not all doctorates are PhDs. 

Below is the education variable after I combined some levels per our discussion. 

In [11]:
# Education after combining levels: counts per level, NaN included
education_5_levels_counts = df["education_5_levels"].value_counts(dropna=False)
display(education_5_levels_counts)

education_5_levels
Graduated with Bachelors                                            168
1-2 years college/associate’s degree/trade school/certifications     86
Graduated with master’s degree                                       79
Highschool gradate or proficiency                                    47
Graduated with PhD                                                   23
NaN                                                                   3
Name: count, dtype: int64

religious/spiritual orientation - Includes the cleanup I did where we mapped those who put "catholic" in 
the other question to "Christian", mapped those that said some variation of none/not religious to "Atheist"
and created the "Muslim" group. 

In [12]:
# Religious/spiritual orientation: counts per group, NaN included
religion_counts = df["religious_spiritual_orientation"].value_counts(dropna=False)
display(religion_counts)

religious_spiritual_orientation
Christian               182
Agnostic                 82
Atheist                  59
Spiritually eclectic     43
NaN                      25
Judaism                   8
Buddhist                  5
Muslim                    2
Name: count, dtype: int64

Religion - combining the small groups

In [13]:
# Religious/spiritual orientation, reduced version: counts per group, NaN included
religion_reduced_counts = df["religious_spiritual_orientation_reduced"].value_counts(
    dropna=False
)
display(religion_reduced_counts)

religious_spiritual_orientation_reduced
Christian               182
Agnostic                 82
Atheist                  59
Spiritually eclectic     43
NaN                      25
Other                    15
Name: count, dtype: int64

In [14]:
# Consider themselves to be open/inclusive: counts per response, NaN included
display(df["consider_open_inclusive"].value_counts(dropna=False))

consider_open_inclusive
yes    390
no      16
Name: count, dtype: int64


Not enough people answered no for any real analysis.

In [15]:
# Had experiences to make them open/inclusive: counts per response, NaN included
display(df["experience_open_inclusive"].value_counts(dropna=False))

experience_open_inclusive
yes    343
no      63
Name: count, dtype: int64


Analysis on this TBD

In [16]:
# Feels their experiences have changed views of themselves/others:
# counts per response, NaN included
display(df["feel_experience_changed"].value_counts(dropna=False))

feel_experience_changed
yes    350
no      52
NaN      4
Name: count, dtype: int64


Analysis on this TBD

# Dominant Worldview
Adding in the counts for dominant worldview. Note that we did have some participants whose top scores tied. The paper had no instructions on what to do in that case. The simplest option would be to drop those participants (and report that in papers). Otherwise we have to do the more complex factor analysis, which will be time-consuming and require a more complex writeup. 

In [17]:
# Dominant worldview: counts per category, NaN included.
# "multiple_ties" marks participants whose top worldview scores tied.
display(df["dominant_worldview"].value_counts(dropna=False))

dominant_worldview
integrative      123
modern           103
postmodern        74
traditional       72
multiple_ties     34
Name: count, dtype: int64
