# Analyzing American Thanksgiving Dinner data

dataset: https://github.com/fivethirtyeight/data/tree/master/thanksgiving-2015

In [3]:
import pandas as pd

data = pd.read_csv('thanksgiving.csv', encoding='Latin-1') # encoding Latin-1 is required 
print(data.head(3)) # displaying first three rows

   RespondentID Do you celebrate Thanksgiving?  \
0    4337954960                            Yes   
1    4337951949                            Yes   
2    4337935621                            Yes   

  What is typically the main dish at your Thanksgiving dinner?  \
0                                             Turkey             
1                                             Turkey             
2                                             Turkey             

  What is typically the main dish at your Thanksgiving dinner? - Other (please specify)  \
0                                                NaN                                      
1                                                NaN                                      
2                                                NaN                                      

  How is the main dish typically cooked?  \
0                                  Baked   
1                                  Baked   
2                                Roas

In [4]:
print(data.columns) # printing all column names of the dataframe

Index([u'RespondentID', u'Do you celebrate Thanksgiving?',
       u'What is typically the main dish at your Thanksgiving dinner?',
       u'What is typically the main dish at your Thanksgiving dinner? - Other (please specify)',
       u'How is the main dish typically cooked?',
       u'How is the main dish typically cooked? - Other (please specify)',
       u'What kind of stuffing/dressing do you typically have?',
       u'What kind of stuffing/dressing do you typically have? - Other (please specify)',
       u'What type of cranberry saucedo you typically have?',
       u'What type of cranberry saucedo you typically have? - Other (please specify)',
       u'Do you typically have gravy?',
       u'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Brussel sprouts',
       u'Which of these side dishes aretypically served at your Thanksgiving dinner? Please select all that apply. - Carrots',
       u'Which of these side dishes arety

In [5]:
# how many times each category (yes/no) occurs
print(data['Do you celebrate Thanksgiving?'].value_counts())

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64


In [6]:
# filtering out information pertaining to people who do not celebrate Thanksgiving
yes_data = data['Do you celebrate Thanksgiving?'] == 'Yes'
data = data.loc[yes_data]


# Exploring main dishes at Thanksgiving

In [7]:
# looking at what people have as main dishes during Thanksgiving
print(data['What is typically the main dish at your Thanksgiving dinner?'].value_counts())

Turkey                    859
Other (please specify)     35
Ham/Pork                   29
Tofurkey                   20
Chicken                    12
Roast beef                 11
I don't know                5
Turducken                   3
Name: What is typically the main dish at your Thanksgiving dinner?, dtype: int64


In [16]:
# Only those who have Tofurkey for dinner
print(data[data['What is typically the main dish at your Thanksgiving dinner?'] == 'Tofurkey'])

# print(data['Do you typically have gravy?'])

     RespondentID Do you celebrate Thanksgiving?  \
4      4337931983                            Yes   
33     4337771439                            Yes   
69     4337553422                            Yes   
72     4337540484                            Yes   
77     4337490067                            Yes   
145    4337191550                            Yes   
175    4337139327                            Yes   
218    4337078951                            Yes   
243    4337044348                            Yes   
275    4336997445                            Yes   
393    4336894719                            Yes   
399    4336891075                            Yes   
571    4336760110                            Yes   
594    4336736562                            Yes   
628    4336692873                            Yes   
774    4336400854                            Yes   
820    4336238126                            Yes   
837    4336175740                            Yes   
860    43361

# Exploring what type of pies people eat

In [9]:
# defining the column titles
apple_pie_column_name = 'Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Apple'
pumpkin_pie_column_name = 'Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pumpkin'
pecan_pie_column_name = 'Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - Pecan'

# getting rows where each 'pie' columns had null values
apple_isnull = data[apple_pie_column_name].isnull()
pumpkin_isnull = data[pumpkin_pie_column_name].isnull()
pecan_isnull = data[pecan_pie_column_name].isnull()

# combining the previous selections to generate boolean indexes 
ate_pies = apple_isnull & pumpkin_isnull & pecan_isnull

# displaying unique values and how many times they occured
print(ate_pies.value_counts())

False    876
True     104
dtype: int64


# Converting age data to numeric values


In [10]:

def get_int_age(age_str):
    """
    Converts an age value in string format to integer.
    Args:
        age_str (str): the age string to convert to integer.
    Returns:
        int: the converted integer value corresponding to age_str
    """
    
    if pd.isnull(age_str):
        return None
    
    age_str = age_str.split(' ')[0]
    age_str = age_str.replace('+', ' ')
    
    return int(age_str)

# applying the get_int_age function on the Age column and assigning them to the new int_age column in Data
data['int_age'] = data['Age'].apply(get_int_age)

data['int_age'].describe()



count    947.000000
mean      40.089757
std       15.352014
min       18.000000
25%       30.000000
50%       45.000000
75%       60.000000
max       60.000000
Name: int_age, dtype: float64

# Findings

The age is skewed toward the lower bound since only the lower bounds of the age ranges were taken

# Converting income data to numeric values

In [11]:
def get_int_income(income_str):
    """
    Converts an income value in string format to integer.
    Args:
        income_str (str): the age string to convert to integer.
    Returns:
        int: the converted integer value corresponding to income_str
    """
    if pd.isnull(income_str):
        return None
    
    income_str = income_str.split(' ')[0]
    
    # if the user did not prefer to disclose this information
    if income_str == 'Prefer':
        return None
    
    income_str = income_str.replace('$', ' ')
    income_str = income_str.replace(',', '')
    
    return int(income_str)

data['int_income'] = data['How much total combined money did all members of your HOUSEHOLD earn last year?'].apply(get_int_income)

print(data['int_income'].describe())
    

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%       25000.000000
50%       75000.000000
75%      100000.000000
max      200000.000000
Name: int_income, dtype: float64


# Findings

Once again, the values are skewed toward the lower bounds, since we ignored the upper bounds. The incomes deviate a lot, although the average is income is high.

In [12]:
# selecting those with income less than 150k
income_below_150k = data[data['int_income'] < 150000]


# number of entries per category
print(income_below_150k['How far will you travel for Thanksgiving?'].value_counts())

Thanksgiving is happening at my home--I won't travel at all                         281
Thanksgiving is local--it will take place in the town I live in                     203
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    150
Thanksgiving is out of town and far away--I have to drive several hours or fly       55
Name: How far will you travel for Thanksgiving?, dtype: int64


In [13]:
#selecting those wiht income above 150k
income_above_150k = data[ data['int_income'] > 150000] 

# number of entries per category
print(income_above_150k['How far will you travel for Thanksgiving?'].value_counts())

Thanksgiving is happening at my home--I won't travel at all                         49
Thanksgiving is local--it will take place in the town I live in                     25
Thanksgiving is out of town but not too far--it's a drive of a few hours or less    16
Thanksgiving is out of town and far away--I have to drive several hours or fly      12
Name: How far will you travel for Thanksgiving?, dtype: int64


# Findings

For both groups (earning over 150k and below 150k), the ratio of people who did not travel far to those who did is higher.

# Linking friendship and age


In [14]:
import numpy as np
friend_meets = 'Have you ever tried to meet up with hometown friends on Thanksgiving night?'
friends_giving = 'Have you ever attended a "Friendsgiving?"'

data.pivot_table(index=friend_meets, columns=friends_giving, values='int_age', aggfunc=np.mean)


"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,42.283702,37.010526
Yes,41.47541,33.976744


In [15]:
data.pivot_table(index=friend_meets, columns=friends_giving, values='int_income', aggfunc=np.mean)

"Have you ever attended a ""Friendsgiving?""",No,Yes
Have you ever tried to meet up with hometown friends on Thanksgiving night?,Unnamed: 1_level_1,Unnamed: 2_level_1
No,78914.549654,72894.736842
Yes,78750.0,66019.736842


# Findings

It can be seen that younger people tend to attend 'friendsgiving' and meet up with hometworn friends on Thanksgiving night as expected. Also, these people have relatively low income.