## Read Thanksgiving Data

In [17]:
import pandas as pd

data = pd.read_csv("thanksgiving.csv", encoding="Latin-1")

# First few rows of data to see what the columns and rows look like.
print(data.head(3))

   RespondentID Do you celebrate Thanksgiving?  \
0    4337954960                            Yes   
1    4337951949                            Yes   
2    4337935621                            Yes   

  What is typically the main dish at your Thanksgiving dinner?  \
0                                             Turkey             
1                                             Turkey             
2                                             Turkey             

  What is typically the main dish at your Thanksgiving dinner? - Other (please specify)  \
0                                                NaN                                      
1                                                NaN                                      
2                                                NaN                                      

  How is the main dish typically cooked?  \
0                                  Baked   
1                                  Baked   
2                                Roas

In [18]:
# Display all of the column names
data.columns.tolist()

['RespondentID',
 'Do you celebrate Thanksgiving?',
 'What is typically the main dish at your Thanksgiving dinner?',
 'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
 'How is the main dish typically cooked?',
 'How is the main dish typically cooked? - Other (please specify)',
 'What kind of stuffing/dressing do you typically have?',
 'What kind of stuffing/dressing do you typically have? - Other (please specify)',
 'What type of cranberry saucedo you typically have?',
 'What type of cranberry saucedo you typically have? - Other (please specify)',
 'Do you typically have gravy?',
 'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
 'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
 'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Cauliflower',
 

## How many people celebrate Thanksgiving

In [19]:
# Display counts of how many times each category occurs
celebrators = pd.Series.value_counts(data["Do you celebrate Thanksgiving?"])
celebrators

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64

In [20]:
# Reassign 'data' to the YES response of 'Do you celebrate Thanksgiving?'.
data = data[data["Do you celebrate Thanksgiving?"] == "Yes"]

## What main dishes do people tend to eat during Thanksgiving dinner


In [21]:
pd.Series.value_counts(data["What is typically the main dish at your Thanksgiving dinner?"])

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64

## Tofurkey eaters who use or don't use gravy

In [22]:
tofurkey = data[data["What is typically the main dish at your Thanksgiving dinner?"] == "Tofurkey"]
gravy_w_tofurkey_count = pd.Series.value_counts(tofurkey["Do you typically have gravy?"])
gravy_w_tofurkey = tofurkey["Do you typically have gravy?"]

print(gravy_w_tofurkey)
print(gravy_w_tofurkey_count)

4      Yes
33     Yes
69      No
72      No
77     Yes
145    Yes
175    Yes
218     No
243    Yes
275     No
393    Yes
399    Yes
571    Yes
594    Yes
628     No
774     No
820     No
837    Yes
860     No
953    Yes
Name: Do you typically have gravy?, dtype: object
Yes    12
No      8
Name: Do you typically have gravy?, dtype: int64


## Find out how many people didn't have any pie

In [23]:
# How many people didn't have APPLE pie
apple_pie = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple"]
apple_isnull = pd.isnull(apple_pie)

# How many people didn't have PUMPKIN pie
pumpkin_pie = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin"]
pumpkin_isnull = pd.isnull(pumpkin_pie)

# How many people didn't have PECAN pie
pecan_pie = data["Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan"]
pecan_isnull = pd.isnull(pecan_pie)

# Group all 3 pies' boolean values
ate_pie = (apple_isnull & pumpkin_isnull & pecan_isnull)
# Show the counts: False == ate pie; True == didn't have any pie
ate_pie.value_counts()

# 104 people didn't have any pie

False    876
True     104
dtype: int64

## How many people had which kind of pie: Apple, Pumpkin, or Pecan 

In [24]:
print(apple_pie.value_counts())
print(pumpkin_pie.value_counts())
print(pecan_pie.value_counts())

Apple    514
Name: Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple, dtype: int64
Pumpkin    729
Name: Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin, dtype: int64
Pecan    342
Name: Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan, dtype: int64


## Extract Age

In [25]:
# Filter out all NaN values
age_col = data.loc[data["Age"].isnull() != True, 'Age']

# Slice string value to first two chars and change to integer
def get_age(age: str) -> int:    
    age_to_int = int(age[:2])
    return age_to_int

data["int_age"] = age_col.apply(get_age)
print(data["int_age"].describe())

print("AGE GROUP COUNTS, null values filtered out:")
print(pd.Series.value_counts(age_col))

print("----MOST FREQUENT 'AGE' BRACKET WITHIN TOTAL COUNT----")
s = pd.Series(age_col)
print(s.describe())


count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64
AGE GROUP COUNTS, null values filtered out:
45 - 59    269
60+        258
30 - 44    235
18 - 29    185
Name: Age, dtype: int64
----MOST FREQUENT 'AGE' BRACKET WITHIN TOTAL COUNT----
count         947
unique          4
top       45 - 59
freq          269
Name: Age, dtype: object


#### Findings about "Age"
The age groups are too wide to get a solid view of which age group celebrates Thanksgiving the most/least. Breaking the age groups into small groups of 5 years would allow us to get a more accurate picture. 

## How much combined total income did all members of your HOUSEHOLD earn last year?

In [29]:
income = data["How much total combined money did all members of your HOUSEHOLD earn last year?"]
print("Household Earning Group Counts")
print(income.value_counts())

Household Earning Group Counts
$25,000 to $49,999      166
$75,000 to $99,999      127
$50,000 to $74,999      127
Prefer not to answer    118
$100,000 to $124,999    109
$200,000 and up          76
$10,000 to $24,999       60
$0 to $9,999             52
$125,000 to $149,999     48
$150,000 to $174,999     38
$175,000 to $199,999     26
Name: How much total combined money did all members of your HOUSEHOLD earn last year?, dtype: int64


In [31]:
import re

# Remove "Prefer not to answer" from dataframe and return only earnings with a dollar value range
household_income = data.loc[data["How much total combined money did all members of your HOUSEHOLD earn last year?"] != "Prefer not to answer", "How much total combined money did all members of your HOUSEHOLD earn last year?"]

# Clean string values and convert to integers
def get_income(income:str) -> int:
    if pd.isnull(income):
        return None
    income_list = income.split(" ")
    income_cleaned = re.sub('[$,]', "", income_list[0])
    return int(income_cleaned)

# Apply function to all values in column
data["int_income"] = household_income.apply(get_income)

# Assess column values
print(data["int_income"].describe())

print("----MOST FREQUENT EARNING BRACKET WITHIN TOTAL COUNT----")
x = pd.Series(household_income)
print(x.describe())

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64
----MOST FREQUENT EARNING BRACKET WITHIN TOTAL COUNT----
count                    829
unique                    10
top       $25,000 to $49,999
freq                     166
Name: How much total combined money did all members of your HOUSEHOLD earn last year?, dtype: object


#### Findings about Household Income

It appears that Thanksgiving is celebrated more by middle and upper-middle income families earning from $25,000 - $100,000 since they total 420 out of 829 people who celebrate Thanksgiving.  They are, however, a large portion of the US population.

Higher earning families earning from $100,000 - $200,000 total 221 out of the 829.  Adding earners making over $200,000, we get 297.  It raises the question whether or not they have more a higher percentage of people celebrating the holiday within the smallest earning group of the US population.

The lowest earning families making from $0 - $25,000 total 112.  Same question of what percentage they represent within this earning group.

With the inclusion of the answer "Prefer not to answer", the data becomes harder to assess with 118 people who refrained from sharing.  118 answers could tip the scales in other interesting directions esp. if a majority of them were of the lowest or highest earning group.

## How far will you travel for Thanksgiving?

In [34]:
# People earning < $150,000

# Filter data, and only select rows where int_income is less than 150000.
income_lt_150k = data["int_income"] < 150000.0


# Select the 'How far will you travel for Thanksgiving?' column.
how_far_travel_lt150k = data.loc[income_lt_150k, "How far will you travel for Thanksgiving?"]

# Use the value_counts() method to count up how many times each value occurs in the column.
print(how_far_travel_lt150k.value_counts())
print("---TOTAL COUNT---")
print(how_far_travel_lt150k.describe())

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64
---TOTAL COUNT---
count                                                   689
unique                                                    4
top       Thanksgiving is happening at my home--I won't ...
freq                                                    281
Name: How far will you travel for Thanksgiving?, dtype: object


In [35]:
# People earning > $150,000

#select rows where int_income is greater than 150000.
income_gt_150k = data["int_income"] > 150000.0

# Within those earners, select the 'How far will you travel for Thanksgiving?' column.
how_far_travel_gt150k = data.loc[income_gt_150k, "How far will you travel for Thanksgiving?"]

# how many times does each value occur in the column
print(how_far_travel_gt150k.value_counts())
print("---TOTAL COUNT---")
print(how_far_travel_gt150k.describe())

Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several hours or fly      12
Name: How far will you travel for Thanksgiving?, dtype: int64
---TOTAL COUNT---
count                                                   102
unique                                                    4
top       Thanksgiving is happening at my home--I won't ...
freq                                                     49
Name: How far will you travel for Thanksgiving?, dtype: object


#### Findings about "Travel Distance"
48% of people earning more than 150000 celebrate Thanksgiving at home compared to 41% of people earning less than $150,000 celebrate at home.

If we consider people who are away at school, doing military service abroad, or other low-or-no-paying work,  traveling back home for Thanksgiving is not easy and expensive which could account for a lower percentage of people earning less than 150000 celebrating back at home.

**_Same results but in other words_**:

People earning more than 150000 ----> 52% celebrate outside/away from home 

People earning less than 150000 ----> 59% celebrate outside/away from home



## Meet up with hometown friends on Thanksgiving night and Friendsgiving

In [36]:
meetup_friends_avg_age = data.pivot_table(index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns='Have you ever attended a "Friendsgiving?"', values="int_age")
round(meetup_friends_avg_age)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.0,37.0
Yes,41.0,34.0


In [37]:
meetup_friends_avg_income = data.pivot_table(index="Have you ever tried to meet up with hometown friends on Thanksgiving night?", columns='Have you ever attended a "Friendsgiving?"', values="int_income")
round(meetup_friends_avg_income)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,78915.0,72895.0
Yes,78750.0,66020.0


#### Findings about "Travel Distance"
The average age of a person attending both Friendsgiving and Thanksgiving meet up with friends is 34. A mixed Yes/No response shows a slightly older late 30s/early 40s average age!  These aren't college kids or 20-somethings attending these Thanksgiving events outside of the home nor are they a 50+ age.  Perhaps the 50+ and the 20-somethings are having Thanksgiving dinner together at home.

The average income did not suggest a lower-income bracket but more middle-class income earnings of 66000 to 78000 suggesting that today's 30 to 40 year olds may be more independent, perhaps too busy, and seeking change from traditional holiday experiences.

# Other assignments
- Figure out the most common dessert people eat.
- Figure out the most common complete meal people eat.
- Identify how many people work on Thanksgiving.
- Find regional patterns in the dinner menus.
- Find age, gender, and income based patterns in dinner menus.


## Most common desserts people eat

In [81]:
# new dataframe by selecting the range of columns mentioning the 'dessert' question
desserts = data[data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler"] : data["Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Other (please specify)"]]

# do value_counts on this new df

# filter out rows/people who don't eat dessert and NaN rows

# display value_counts and .describe



TypeError: cannot do slice indexing on <class 'pandas.indexes.numeric.Int64Index'> with these indexers [0                 NaN
1                 NaN
2                 NaN
3                 NaN
4                 NaN
5                 NaN
6                 NaN
7                 NaN
8                 NaN
9                 NaN
11                NaN
12                NaN
13                NaN
14                NaN
15                NaN
16                NaN
17      Apple cobbler
18                NaN
19                NaN
20                NaN
21      Apple cobbler
23                NaN
24                NaN
25      Apple cobbler
26                NaN
27                NaN
28                NaN
29                NaN
30                NaN
32      Apple cobbler
            ...      
1024    Apple cobbler
1025              NaN
1026              NaN
1027              NaN
1029              NaN
1030    Apple cobbler
1031    Apple cobbler
1033              NaN
1034              NaN
1035              NaN
1037              NaN
1038              NaN
1039    Apple cobbler
1040              NaN
1041              NaN
1042              NaN
1043              NaN
1044    Apple cobbler
1045              NaN
1046              NaN
1047              NaN
1048              NaN
1049              NaN
1050              NaN
1051              NaN
1053              NaN
1054              NaN
1055              NaN
1056              NaN
1057              NaN
Name: Which of these desserts do you typically have at Thanksgiving dinner? Please select all that apply.   - Apple cobbler, dtype: object] of <class 'pandas.core.series.Series'>