# Project 5: The war with Star Wars

In [None]:
# Set up

# libraries
import pandas as pd
import altair as alt
import numpy as np

In [None]:
# Location of the raw survey export in the FiveThirtyEight data repository.
url = 'https://github.com/fivethirtyeight/data/raw/master/star-wars-survey/StarWars.csv'

# The file is read twice: the first two rows become `sw_cols` (used further
# down to build the column names) and every row after them becomes `sw_data`.
sw_cols = pd.read_csv(
    url,
    encoding="ISO-8859-1",
    header=None,
    nrows=2,
)
sw_data = pd.read_csv(
    url,
    encoding="ISO-8859-1",
    header=None,
    skiprows=2,
)

### Grand Question 1
**Shorten the column names and clean them up for easier use with pandas.**

In [None]:
# First header row -> short prefixes.
# Each long survey question is swapped (whole-value match) for a short name,
# everything is lower-cased with spaces turned into underscores, and missing
# cells inherit the prefix of the nearest named column to their left.
bob = (
    sw_cols.iloc[0]
    .replace({
        "Have you seen any of the 6 films in the Star Wars franchise?": "seen_any",
        "Do you consider yourself to be a fan of the Star Wars film franchise?": "is_fan_star_wars",
        "Which of the following Star Wars films have you seen? Please select all that apply.": "seen_",
        "Please rank the Star Wars films in order of preference with 1 being your favorite film in the franchise and 6 being your least favorite film.": "film_rank",
        "Please state whether you view the following characters favorably, unfavorably, or are unfamiliar with him/her.": "favorable_character",
        "Which character shot first?": "shot_first",
        "Are you familiar with the Expanded Universe?": "familiar_expanded_universe",
    })
    .str.lower()
    .str.replace(" ", "_")
    .ffill()
)

In [13]:
# Second header row -> suffixes.
# Cells that are exactly "Response" become empty, the "Star Wars: Episode"
# text is stripped out of the remaining cells, the result is lower-cased with
# underscores for spaces, and missing cells end up as empty strings.
mary = (
    sw_cols.iloc[1]
    .replace("Response", "")                 # whole-value match
    .str.replace("Star Wars: Episode", "")   # substring match
    .str.lower()
    .str.replace(" ", "_")                   # substring match
    .fillna("")                              # only touches missing values
)

In [14]:
# Element-wise string concatenation: prefix from `bob` followed by suffix from `mary`.
new_column_names = bob.add(mary)

In [25]:
# Label the response data with the cleaned column names built above.
sw_data.columns = new_column_names

# sw_data.head()
# now we have our data set

8912

### Grand Question 2
**Please validate that the data provided on GitHub lines up with the article by recreating 2 of their visuals and calculating 2 summaries that they report in the article.**

In [None]:
# First recreated visual

In [19]:
# Share of each answer in the `shot_first` column among respondents who answered.
# `rename_axis` + `reset_index(name=...)` pin the resulting column names to
# `index` (the answer text) and `shot_first` (the proportion). A bare
# `reset_index()` names these columns differently depending on the pandas
# version, and the chart cells below rely on exactly these two names.
shot = (sw_data['shot_first']
        .dropna()
        .value_counts(normalize = True)
        .rename_axis('index')
        .reset_index(name = 'shot_first')
        )

# Proportion expressed as a whole-number percentage (used as the bar label).
shot["percent"] = (shot['shot_first'] * 100).round(0)

shot

Unnamed: 0,index,shot_first,percent
0,Han,0.392512,39.0
1,I don't understand this question,0.369565,37.0
2,Greedo,0.237923,24.0


In [20]:
# Bar layer: one bar per answer, bar length taken from `shot_first`.
# Both axes are hidden here, and the answers are placed in a fixed order.
part1 = alt.Chart(shot).mark_bar().encode(
    x = alt.X('shot_first', axis = None),
    y = alt.Y(
        'index',
        sort = ["Han", "Greedo", "I don't understand this question"],
        axis = None,
    ),
)

In [21]:
# Text layer: prints the `percent` value at the end of each bar, using the
# same x/y fields and the same answer order as the bar layer.
part2 = alt.Chart(shot).mark_text(
    align = 'left',
    baseline = 'middle',
    dx = 3,
).encode(
    x = 'shot_first',
    y = alt.Y(
        'index',
        sort = ["Han", "Greedo", "I don't understand this question"],
    ),
    text = "percent",
)

In [22]:
# Stack the bar and text layers, then add the title block, a light grey
# background and a left-anchored title.
(
    (part1 + part2)
    .properties(
        title = dict(
            text = ["Who Shot First"],
            subtitle = ["According to 828 Respondents"],
        )
    )
    .configure(background = "#f0f0f0")
    .configure_title(anchor = "start")
)

### Second Recreated Visual
#### What is the Best Star Wars Movie?

In [None]:
# Presumably the starting point for the data behind the second recreated visual.
# NOTE(review): `replace()` is called with no arguments, so no replacement is
# specified yet; this looks like an unfinished placeholder. Depending on the
# pandas version a bare `replace()` may fill missing values or raise.
# TODO: confirm and supply the intended `to_replace` / `value` arguments.
sw_data2 = (sw_data
    .replace()

) 

In [23]:
# First graph in the article - "Which 'Star Wars' Movies Have You Seen?"
#
# Keep only the columns whose names start with "seen__", discard the rows in
# which every one of those columns is missing, and report (rows, columns).
sw_data.filter(regex='^seen__').dropna(axis=0, how="all").shape

# Course reminders (not related to the code in this cell):
#   - Tuesday covers text data; watch the video that will be posted.
#   - The take method's test.

(835, 6)

In [None]:
# First recreated summary

In [None]:
# Among rows where gender is 'Female' and seen_any is 'Yes', the proportion of
# each answer in `is_fan_star_wars`.
(
    sw_data
    .loc[
        (sw_data["gender"] == "Female") & (sw_data["seen_any"] == "Yes"),
        "is_fan_star_wars",
    ]
    .value_counts(normalize=True)
)

In [None]:
# Second recreated summary

### Grand Question 3
**Clean and format the data so that it can be used in a machine learning model. As you format the data, you should complete each item listed below. In your final report provide example(s) of the reformatted data with a short description of the changes made.**

a. Filter the dataset to respondents that have seen at least one film.

b. Create a new column that converts the age ranges to a single number. Drop the age range categorical column.

c. Create a new column that converts the school groupings to a single number. Drop the school categorical column.

d. Create a new column that converts the income ranges to a single number. Drop the income range categorical column.

e. Create your target (also known as "y" or "label") column based on the new income range column.

f. One-hot encode all remaining categorical columns.

In [None]:
# A) Filter the dataset to respondents that have seen at least one film.

# Boolean mask on the `seen_any` answer; only the "Yes" rows are kept.
q3 = sw_data[sw_data["seen_any"] == "Yes"]

In [None]:
# B) Create a new column that converts the age ranges to a single number. Drop the age range categorical column.

# Each age range is reduced to its lower bound: the "> " prefix is stripped,
# the text is split on "-", and the first piece is converted to a float.
ml_age = (
    q3["age"]
    .str.replace("> ", "")
    .str.split("-")
    .str[0]
    .astype("float")
    .rename("age_min")
)

In [None]:
# C) Create a new column that converts the school groupings to a single number. Drop the school categorical column.

# Each education label is swapped for a number written as text, then the whole
# column is converted to float.
ml_school = (
    q3["education"]
    .str.replace('Less than high school degree', '9')
    .str.replace('High school degree', '12')
    .str.replace('Some college or Associate degree', '14')
    .str.replace('Bachelor degree', '16')
    .str.replace('Graduate degree', '20')
    .astype('float')
)

In [None]:
# D) Create a new column that converts the income ranges to a single number. Drop the income range categorical column.

# Each income range is reduced to its lower bound as a float.
# The pattern is a regular expression (strip "$", "," and "+"), so it is
# written as a raw string and `regex = True` is passed explicitly: without it,
# pandas versions whose `str.replace` defaults to literal matching leave the
# symbols in place and the float conversion fails.
ml_income = (q3.household_income
   .str.replace(r"\$|,|\+", "", regex = True)
   .str.split(" - ", expand=True)
   .rename(columns = {0: "income_min", 1: "income_max"}) # string labels instead of the integer column names produced by split
   .income_min
   .astype("float")
)
# ml_income

In [None]:
# E) One-hot encode all remaining categorical columns (item f in the task list above).

# `DataFrame.filter()` raises a TypeError when called without `items`, `like`
# or `regex`, so the columns are selected explicitly instead:
#   * age, education and household_income are left out because they are
#     converted to numbers in the cells above;
#   * of the remaining columns only the non-numeric ones are encoded.
ml_dummies = pd.get_dummies(
    q3.drop(columns = ["age", "education", "household_income"])
      .select_dtypes(exclude = "number")
)

In [None]:
# Display the column labels of sw_data (presumably as a reference when choosing columns to encode).
sw_data.columns

In [None]:
# F) Create your target (also known as "y" or "label") column based on the new income range column (item e in the task list above).


### Grand Question 4
**Build a machine learning model that predicts whether a person makes more than $50k.**