In [2]:
import pandas as pd

# Source data: FiveThirtyEight's Thanksgiving 2015 survey
# https://github.com/fivethirtyeight/data/tree/master/thanksgiving-2015
# The CSV is decoded as Latin-1 instead of pandas' default UTF-8.
data = pd.read_csv("thanksgiving.csv", encoding="Latin-1")

#print(data[0:2])

In [26]:
# Index of all column labels in the survey table.
# NOTE(review): col_names is not referenced again in the visible cells -- confirm it is still needed.
col_names = data.columns


In [4]:
# Count how many respondents gave each answer to the celebration question.
data['Do you celebrate Thanksgiving?'].value_counts()

Yes    980
No      78
Name: Do you celebrate Thanksgiving?, dtype: int64

In [5]:
# Keep only respondents who celebrate Thanksgiving. The explicit .copy() makes
# the filtered frame independent of the original, so the columns that later
# cells add to `data` do not trigger pandas' SettingWithCopyWarning.
data = data[data['Do you celebrate Thanksgiving?'] == "Yes"].copy()

In [6]:
# Column holding each respondent's usual main dish.
main_dish = data['What is typically the main dish at your Thanksgiving dinner?']

# Frequency of every main dish. This is not the cell's last expression, so the
# notebook does not display the result.
main_dish.value_counts()

# Restrict to respondents whose main dish is Tofurkey, then pull their gravy answers.
is_tofurkey = main_dish == "Tofurkey"
tofurkey = data[is_tofurkey]
tofurkey_gravy = tofurkey["Do you typically have gravy?"]

gravy_counts = tofurkey_gravy.value_counts()
print(gravy_counts)

Yes    12
No      8
Name: Do you typically have gravy?, dtype: int64


In [7]:
# The three pie columns share this question prefix and differ only in the pie name.
pie_question = 'Which type of pie is typically served at your Thanksgiving dinner? Please select all that apply. - '

apple_isnull = data[pie_question + 'Apple'].isnull()
pumpkin_isnull = data[pie_question + 'Pumpkin'].isnull()
pecan_isnull = data[pie_question + 'Pecan'].isnull()

# True only for rows where all three pie columns are null.
# NOTE(review): despite its name, `ate_pies` therefore marks respondents with
# none of the three pies (assuming a null cell means the pie was not selected).
ate_pies = apple_isnull & pumpkin_isnull & pecan_isnull
values = ate_pies.value_counts()

print(values)

# Count directly from the boolean mask instead of indexing `values` by position:
# values[0] / values[1] depended on which answer happened to be more frequent,
# relied on positional fallback for integer keys, and failed when only one of
# True/False was present.
none_of_three = int(ate_pies.sum())
at_least_one = len(ate_pies) - none_of_three
total = at_least_one + none_of_three

# Share of rows where the mask is False, i.e. at least one of the three pie
# columns is filled in.
# NOTE(review): the printed label reads as "do not have all three pies", which
# is not what this mask measures -- confirm the intended wording.
not_three = (at_least_one / total) * 100

print("Do not buy all three", not_three)





False    876
True     104
dtype: int64
Do not buy all three 89.387755102


In [8]:
import numpy as np

#function to convert a single string to appropriate integer value
def convert_int(string):
    """Return the leading number of a range label as an int.

    Parameters
    ----------
    string : str or missing value
        A label whose first whitespace-separated token is the number to keep,
        optionally followed by "+" (for example "18 - 29" or "60+").

    Returns
    -------
    int or None
        The leading number, or None when the value is missing or blank.

    Raises
    ------
    ValueError
        If the first token is not an integer once trailing "+" is removed.
    """
    if pd.isnull(string):
        return None
    tokens = string.split()
    if not tokens:
        # A blank answer carries no number; treat it like a missing one
        # instead of failing on tokens[0].
        return None
    # Open-ended labels such as "60+" are reduced to the number before the "+".
    return int(tokens[0].rstrip("+"))

# Summarise value frequencies as a count / percent / total table.
def convert_value_counts(input):
    """Build a frequency table for the values in ``input``.

    Parameters
    ----------
    input : pandas.Series
        Any object exposing ``value_counts()``; its result supplies the rows.

    Returns
    -------
    pandas.DataFrame
        One row per distinct value, in ``value_counts()`` order, with columns
        ``count`` (occurrences), ``percent`` (occurrences as a percentage of
        all counted values) and ``total`` (the sum of all counts, repeated on
        every row).
    """
    tally = input.value_counts()
    labels = tally.index.tolist()
    grand_total = sum(tally[:])

    shares = []
    for occurrences in tally:
        shares.append((occurrences / grand_total) * 100)

    count_col = pd.Series(tally, index=labels, name='count')
    percent_col = pd.Series(shares, index=labels, name='percent')
    total_col = pd.Series(grand_total, index=labels, name='total')

    return pd.concat([count_col, percent_col, total_col], axis=1)


# Store the first number of each age label next to the raw answers.
age = data['Age']
data["int_age"] = age.apply(convert_int)
int_age = data["int_age"]
region = data['US Region']

# Frequency tables: numeric age first, then US region.
age_breakdown = convert_value_counts(int_age)
print(age_breakdown)
print()

region_breakdown = convert_value_counts(region)
print(region_breakdown)
    

      count    percent  total
45.0    269  28.405491    947
60.0    258  27.243928    947
30.0    235  24.815206    947
18.0    185  19.535375    947

                    count    percent  total
South Atlantic        203  21.804511    931
Middle Atlantic       145  15.574651    931
East North Central    145  15.574651    931
Pacific               130  13.963480    931
West South Central     85   9.129968    931
West North Central     71   7.626208    931
East South Central     56   6.015038    931
New England            55   5.907626    931
Mountain               41   4.403867    931


### Findings
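One could ask how many pies people usually eat on Thanksgiving, and which type of pie is the most popular. According to the data above, roughly 89.4% of respondents serve at least one of apple, pumpkin, or pecan pie at Thanksgiving, while the remaining 10.6% selected none of the three. (The check above counts respondents for whom all three pie columns are empty, so it does not measure how many serve all three pies together.) It would be useful to understand which pie is the most popular overall and which pie is the least popular.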

The survey seems to draw a sample relatively proportional to the national population. At first glance the survey seems skewed toward the 60+ age group. According to [Age and Sex Composition in the United States: 2012](https://www.census.gov/population/age/data/2012comp.html), which uses similar age ranges, the civilian noninstitutionalized population breaks down like this:
+ 20-29 = 13.9%
+ 30-44 = 19.5%
+ 45-59 = 20.8%
+ 60+ = 19.2%

The Thanksgiving survey looks like this:
+ 18-29 = 19.54%
+ 30-44 = 24.82%
+ 45-59 = 28.41%
+ 60+ = 27.24%

The main difference is in the ordering of two groups: in the Thanksgiving survey the 60+ age group is about 2.4 percentage points larger than the 30-44 age group, whereas in the national population the 30-44 age group is about 0.3 percentage points larger than the 60+ age group. Apart from that, the survey seems to draw a sample relatively proportional to the national population.

I also thought the sample seemed skewed toward the East Coast rather than the West Coast. The South Atlantic and Middle Atlantic regions account for 37.38% of the sample, while the Pacific region, including California, only accounts for 13.96%. However, when I looked at 2010 census data here [Annual Estimates of the Resident Population for the United States, Regions, States, and Puerto Rico: April 1, 2010 to July 1, 2016](https://www2.census.gov/programs-surveys/popest/tables/2010-2016/state/totals/nst-est2016-01.xlsx), the sample sizes seem rather proportionate. According to the 2010 census, 15.4% of the population lived in the states of California, Oregon and Washington. That is pretty close to the 13.96% share of the sample in this exercise. So it would appear that the Pacific region is not underrepresented.

Going forward, I have questions about how skewed the sample is in terms of where people live and how much they earn. I also wonder whether those groups are well represented in the survey sample.


In [9]:
# Raw household-income answers; converted to numbers with convert_int_value below.
income = data['How much total combined money did all members of your HOUSEHOLD earn last year?']

def convert_int_value(string):
    """Convert an income label to the integer formed by its first token.

    Parameters
    ----------
    string : str or missing value
        A label such as "$25,000 to $49,999" or "Prefer not to answer".

    Returns
    -------
    int or None
        The first whitespace-separated token with "$" and "," removed, as an
        int. None is returned for missing values and for labels whose first
        token is "Prefer".
    """
    if pd.isnull(string):
        return None

    first_token = string.split()[0]
    if first_token == "Prefer":
        return None

    # Strip the currency symbol and thousands separators before parsing.
    digits = first_token.replace("$", "").replace(",", "")
    return int(digits)
# Numeric income per respondent: the first number of the bracket label, or a
# missing value when the answer is absent or starts with "Prefer".
data['int_income'] = income.apply(convert_int_value)

int_income_counts = data['int_income']

int_income_counts = convert_value_counts(int_income_counts)
# DataFrame.sort() no longer exists in pandas; sort_index() orders the rows by
# their index (the income value) in ascending order.
sorted_df = int_income_counts.sort_index()
print(sorted_df)
print()

print(data['int_income'].describe())




          count    percent  total
0.0          52   6.272618    829
10000.0      60   7.237636    829
25000.0     166  20.024125    829
50000.0     127  15.319662    829
75000.0     127  15.319662    829
100000.0    109  13.148372    829
125000.0     48   5.790109    829
150000.0     38   4.583836    829
175000.0     26   3.136309    829
200000.0     76   9.167672    829

count       829.000000
mean      75965.018094
std       59068.636748
min           0.000000
25%                NaN
50%                NaN
75%                NaN
max      200000.000000
Name: int_income, dtype: float64




## Findings
There is a high deviation in income. A standard deviation of \$59,068 is quite large. It seems that, because of NaN values in the *int_income* series, the describe method could not output the 25%, 50% or 75% quartiles. That makes it hard to see how income is distributed.

The mean income of \$75,965 for the Thanksgiving survey seems rather normal compared to the 2015 national household average of \$79,263 [HINC-01. Selected Characteristics of Households by Total Money Income](http://www2.census.gov/programs-surveys/cps/tables/hinc-01/2016/hinc01_1.xls). However, the mean can easily be skewed by higher incomes. It would probably be better to look at the median income rather than the mean income in this case.

With 76/829, or roughly 9.17%, of respondents making over \$200,000, the survey has a high proportion of people at that income level compared to the United States as a whole, where the proportion is 6.07% [HINC-01. Selected Characteristics of Households by Total Money Income](http://www2.census.gov/programs-surveys/cps/tables/hinc-01/2016/hinc01_1.xls). That is likely skewing the mean.

Also, the method applied gave 52 people an income of \$0.00. That skewed the mean income as well. It would have been better to put the people who make \$0 of income into another group that would be less likely to skew the mean.


In [10]:
travel_question = 'How far will you travel for Thanksgiving?'

# Respondents whose numeric income is below 50000, and their travel answers.
less_50000 = data[data['int_income'] < 50000]
less_50000_travel = less_50000[travel_question]

# Travel answers of the lower bracket, ordered as value_counts() returns them.
counts = less_50000_travel.value_counts()
count_index = counts.index.tolist()

# NOTE(review): int_income stores the first number of each bracket label, so the
# strict `> 150000` leaves out respondents whose label starts at exactly 150000
# -- confirm that this is intended.
greater_150000 = data[data['int_income'] > 150000]
greater_150000_travel = greater_150000[travel_question]

print(count_index)
print()

print("Income less than $50,000")
print(convert_value_counts(less_50000_travel))
print()
print("Income greater than $150,000")
print(convert_value_counts(greater_150000_travel))

["Thanksgiving is happening at my home--I won't travel at all", 'Thanksgiving is local--it will take place in the town I live in', "Thanksgiving is out of town but not too far--it's a drive of a few hours or less", 'Thanksgiving is out of town and far away--I have to drive several hours or fly']

Income less than $50,000
                                                    count    percent  total
Thanksgiving is happening at my home--I won't t...    106  38.129496    278
Thanksgiving is local--it will take place in th...     92  33.093525    278
Thanksgiving is out of town but not too far--it...     64  23.021583    278
Thanksgiving is out of town and far away--I hav...     16   5.755396    278

Income greater than $150,000
                                                    count    percent  total
Thanksgiving is happening at my home--I won't t...     49  48.039216    102
Thanksgiving is local--it will take place in th...     25  24.509804    102
Thanksgiving is out of town but not too

## Findings

The sample size for the income greater than \$150,000 is pretty small at 102 people. The margin of error is probably higher in the income bracket greater than \$150,000 than the income bracket less than \$50,000.

The income bracket less than \$50,000 has a higher percentage of people who travel locally to a different house than the income bracket greater than \$150,000. The income bracket greater than \$150,000 also travels a distance that requires driving several hours or flying at a significantly higher percentage than the income bracket less than \$50,000.

In [23]:
# Row and column questions shared by both pivot tables.
meetup_question = "Have you ever tried to meet up with hometown friends on Thanksgiving night?"
friendsgiving_question = 'Have you ever attended a "Friendsgiving?"'

# aggfunc is passed as the string "mean" rather than np.mean: recent pandas
# versions warn when given the NumPy callable, and the string selects the same
# built-in mean aggregation.
ave_respondents = data.pivot_table(
    index=meetup_question,
    columns=friendsgiving_question,
    values="int_age",
    aggfunc="mean",
)

ave_income = data.pivot_table(
    index=meetup_question,
    columns=friendsgiving_question,
    values="int_income",
    aggfunc="mean",
)

print(ave_respondents)
print()
print(ave_income)



Have you ever attended a "Friendsgiving?"                  No        Yes
Have you ever tried to meet up with hometown fr...                      
No                                                  42.283702  37.010526
Yes                                                 41.475410  33.976744

Have you ever attended a "Friendsgiving?"                     No           Yes
Have you ever tried to meet up with hometown fr...                            
No                                                  78914.549654  72894.736842
Yes                                                 78750.000000  66019.736842


## Findings
It seems like people who have attended a "Friendsgiving" tend to be younger and have a lower income on average than those who haven't. That could be because "Friendsgiving" is a new celebration observed mostly by younger people. I personally had not heard about it until now. It could also be because older people work and have the income and means to make it back to their hometown.

It also seems like, among people who have never attended a "Friendsgiving," there is not a big difference in average income between those who have tried to meet up with hometown friends and those who have not. This could mean income does influence whether people can meet up with friends in their hometown, since a higher income could give the respondent the means to make it back to their hometown.

### Next Steps
+ Figure out the most common dessert people eat.
+ Figure out the most common complete meal people eat.
+ Identify how many people work on Thanksgiving.
+ Find regional patterns in the dinner menus.
+ Find age, gender, and income based patterns in dinner menus.
