# Explore the Data

In [28]:
import pandas as pd
df = pd.read_csv('nonvoters_data.csv')

df.head()

Unnamed: 0,RespId,weight,Q1,Q2_1,Q2_2,Q2_3,Q2_4,Q2_5,Q2_6,Q2_7,...,Q30,Q31,Q32,Q33,ppage,educ,race,gender,income_cat,voter_category
0,470001,0.7516,1,1,1,2,4,1,4,2,...,2,,1.0,,73,College,White,Female,$75-125k,always
1,470002,1.0267,1,1,2,2,3,1,1,2,...,3,,,1.0,90,College,White,Female,$125k or more,always
2,470003,1.0844,1,1,1,2,2,1,1,2,...,2,,2.0,,53,College,White,Male,$125k or more,sporadic
3,470007,0.6817,1,1,1,1,3,1,1,1,...,2,,1.0,,58,Some college,Black,Female,$40-75k,sporadic
4,480008,0.991,1,1,1,-1,1,1,1,1,...,1,-1.0,,,81,High school or less,White,Male,$40-75k,always


In [29]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5836 entries, 0 to 5835
Columns: 119 entries, RespId to voter_category
dtypes: float64(23), int64(91), object(5)
memory usage: 5.3+ MB


By looking at the info from our dataframe, we learn that we have 119 columns in our dataset and 5,836 rows. 

There's too many columns to see the full number and so, to find out information about the columns, [we refer to the codebook for the dataset](https://github.com/fivethirtyeight/data/blob/master/non-voters/nonvoters_codebook.pdf).

This analysis will focus on questions two and three in the study and so we'll filter the dataframe to only include the pertinent columns.


2. In your view, how important are each of the following to being a good American?

1. Voting in elections
2. Serving on a jury
3. Following what happens in government and politics
4. Displaying the American flag
5. Participating in the U.S. Census every ten years
6. Knowing the Pledge of Allegiance
7. Supporting the military
8. Respecting the opinions of those who disagree with you
9. Believing in God
10. Protesting if you believe government actions are wrong
    
Responses
1. Very important
2. Somewhat important
3. Not so important
4. Not at all important


3. How much do you agree or disagree with the following statements?

1. Systemic racism is a problem in the United States.
2. Systemic racism in policing is a bigger problem than violence and vandalism in protests.
3. Society as a whole has become too soft and feminine.
4. The mainstream media is more interested in making money than telling the truth.
5. Traditional parties and politicians don’t care about people like me.
6. The way people talk needs to change with the times to be more sensitive to people with different backgrounds.

Responses
1. Strongly agree
2. Somewhat agree
3. Somewhat disagree
4. Strongly disagree

In [46]:
df.describe()

Unnamed: 0,RespId,weight,Q1,Q2_1,Q2_2,Q2_3,Q2_4,Q2_5,Q2_6,Q2_7,...,Q29_6,Q29_7,Q29_8,Q29_9,Q29_10,Q30,Q31,Q32,Q33,ppage
count,5836.0,5836.0,5836.0,5836.0,5836.0,5836.0,5836.0,5836.0,5836.0,5836.0,...,1342.0,1342.0,1342.0,1342.0,1342.0,5836.0,1592.0,2002.0,2242.0,5836.0
mean,474653.997772,0.991023,1.0,1.246402,1.705106,1.63828,2.175977,1.277245,1.805517,1.491604,...,-0.926975,-0.758569,-0.697466,-0.81073,-0.700447,2.325051,1.36495,1.365634,1.220339,51.693797
std,3628.475677,0.345022,0.0,0.660253,0.866346,0.765741,1.091391,0.626386,1.011524,0.80812,...,0.375264,0.651835,0.716885,0.585638,0.71397,1.259642,0.519249,0.497046,0.958569,17.071561
min,470001.0,0.2298,1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,-1.0,22.0
25%,472069.75,0.79315,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,1.0,1.0,1.0,1.0,36.0
50%,474152.0,0.9676,1.0,1.0,2.0,2.0,2.0,1.0,1.0,1.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,2.0,1.0,1.0,1.0,54.0
75%,476217.5,1.1696,1.0,1.0,2.0,2.0,3.0,1.0,2.0,2.0,...,-1.0,-1.0,-1.0,-1.0,-1.0,3.0,2.0,2.0,2.0,65.0
max,488325.0,3.0386,1.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,1.0,1.0,1.0,1.0,1.0,5.0,2.0,2.0,2.0,94.0


Now that we've inspected our data, we can make a list of the ways we'll need to restructure the data.

1) Create bins for our age data: create quartiles for age data
2) Filter dataframe for the specific questions: questions two and three 
3) Remove values of -1 
4) Likert Scale data needs to be relabeled
5) Rename Columns

# Clean and Restructure the Data

## Create bins for our age data: create quartiles for age data

In [31]:
df["ppage"].describe()

# Use quartile values to create bins

age_bins = ["22-36","37-54","55-65","66-94"]

df['age_binned'] = pd.qcut(df['ppage'],
                              q=[0,0.25, 0.5, 0.75, 1],
                              labels=age_bins)

df


Unnamed: 0,RespId,weight,Q1,Q2_1,Q2_2,Q2_3,Q2_4,Q2_5,Q2_6,Q2_7,...,Q31,Q32,Q33,ppage,educ,race,gender,income_cat,voter_category,age_binned
0,470001,0.7516,1,1,1,2,4,1,4,2,...,,1.0,,73,College,White,Female,$75-125k,always,66-94
1,470002,1.0267,1,1,2,2,3,1,1,2,...,,,1.0,90,College,White,Female,$125k or more,always,66-94
2,470003,1.0844,1,1,1,2,2,1,1,2,...,,2.0,,53,College,White,Male,$125k or more,sporadic,37-54
3,470007,0.6817,1,1,1,1,3,1,1,1,...,,1.0,,58,Some college,Black,Female,$40-75k,sporadic,55-65
4,480008,0.9910,1,1,1,-1,1,1,1,1,...,-1.0,,,81,High school or less,White,Male,$40-75k,always,66-94
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5831,477662,1.1916,1,1,3,1,3,1,2,2,...,,1.0,,27,Some college,Hispanic,Male,$40-75k,always,22-36
5832,477663,1.4623,1,1,1,1,2,1,2,1,...,,2.0,,59,High school or less,White,Female,$125k or more,rarely/never,55-65
5833,488322,0.9252,1,1,2,1,3,1,1,2,...,,1.0,,51,College,Other/Mixed,Male,$125k or more,sporadic,37-54
5834,488325,2.6311,1,2,2,2,2,2,2,2,...,,,1.0,22,High school or less,Black,Female,Less than $40k,always,22-36


## Filter dataframe for the specific questions

In [32]:
import re

# Create new dataframe that only includes demographic and Questions two and three. 
# We'll use these lists to create our final dataframe

full_column_list = list(df.columns)

question_columns = []

demographic_columns = []

for column in full_column_list:
    # Append any columns that contain the text Q2_ or Q2_
    if re.search("Q2_.+", column) or re.search("Q3_.+", column):
        question_columns.append(column)
    # Append demographic criteria to list of list of demographic variables
    elif df[column].dtype == object:
        demographic_columns.append(column)
        # TO DO: Remove and replace with binned data
    elif re.search("age_binned",column):
        demographic_columns.append(column)

# Create full list of columns we'll use for our subset
full_column_list = (question_columns+demographic_columns)

# Create a copy of the dataframe we'll use for our final df
subset_df = df[full_column_list].copy()


In [33]:
# Remove values that are below zero 

for name in question_columns:
    subset_df.drop(subset_df[subset_df[name]<0].index, inplace = True)

subset_df.describe()

Unnamed: 0,Q2_1,Q2_2,Q2_3,Q2_4,Q2_5,Q2_6,Q2_7,Q2_8,Q2_9,Q2_10,Q3_1,Q3_2,Q3_3,Q3_4,Q3_5,Q3_6
count,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0,5570.0
mean,1.256912,1.717056,1.652065,2.203232,1.286176,1.826032,1.511131,1.472711,2.130162,2.032316,1.920467,2.410054,2.66535,1.87702,2.033393,2.085099
std,0.632705,0.839422,0.73734,1.067143,0.605214,0.995449,0.778344,0.652131,1.229364,0.913004,1.04468,1.154553,1.06767,0.967673,0.827541,0.94921
min,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
25%,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,1.0,1.0
50%,1.0,2.0,2.0,2.0,1.0,1.0,1.0,1.0,2.0,2.0,2.0,2.0,3.0,2.0,2.0,2.0
75%,1.0,2.0,2.0,3.0,1.0,2.0,2.0,2.0,3.0,3.0,3.0,3.0,4.0,2.0,3.0,3.0
max,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0


# Use Lambda function to relabel Likert scale data



In [34]:
def likert_converter (column_name):
    """Replace Likert Scale numeric values with words"""
    if 'Q2_' in column_name:
        subset_df[column_name] = subset_df[column_name].map(lambda x: 'Very Important' if x==1 
        else ('Somewhat important' if x==2 
        else('Not so important'if x==3 
        else 'Not at all important')))
    else:
        subset_df[column_name] = subset_df[column_name].map(lambda x: 'Very Important' if x==1 
        else ('Somewhat important' 
        if x==2 else('Not so important'if x==3 
        else 'Not at all important')))


#Replace with correct column list
for column in question_columns: 
    likert_converter(column)

 


## Rename columns


In [35]:
column_dict = {"Q2_1":"Voting",
        "Q2_2": "Jury",
        "Q2_3": "Govt_Politics",
        "Q2_4": "American_Flag",
        "Q2_5": "US_Census",
        "Q2_6": "Pledge",
        "Q2_7":"Military Support",
        "Q2_8": "Respecting_Others",
        "Q2_9":"Believing in God",
        "Q2_10":"Protesting", 
        "Q3_1": "Racism_US",
        "Q3_2": "Racism_Policing",
        "Q3_3":"Society_Whole",
        "Q3_4":"Mainstream_Media",
        "Q3_5":"Traditional_Parties",
        "Q3_5":"Way_People_Talk"}


subset_df = subset_df.rename(columns=column_dict)



In [36]:
# Checking for missing values for our dataframe
subset_df.isna().sum().sort_values()

Voting               0
income_cat           0
gender               0
race                 0
educ                 0
Q3_6                 0
Way_People_Talk      0
Mainstream_Media     0
Society_Whole        0
Racism_Policing      0
Racism_US            0
Protesting           0
Believing in God     0
Respecting_Others    0
Military Support     0
Pledge               0
US_Census            0
American_Flag        0
Govt_Politics        0
Jury                 0
voter_category       0
age_binned           0
dtype: int64

## Export df into csv file

In [37]:
subset_df.to_csv('american_values.csv')

# Perform exploratory analysis & visualization using Matplotlib and Seaborn

In [38]:
# Create visualizations for dataset 

# Ask and Answer questions about the data in a Jupyter Notebook

- Do people of certain backgrounds tend to think differently American values?
- Do certain answers correlate with each other?

# Summarize your inference and write a conclusion
