# Scott Breitbach
## 24-Jan-2021
## DSC540, Weeks 7-8

In [2]:
import pandas as pd
import numpy as np
import pycountry

https://www.scq.ubc.ca/so-much-candy-data-seriously/

In [3]:
candy16url = "https://www.scq.ubc.ca/wp-content/uploads/2016/10/BOING-BOING-CANDY-HIERARCHY-2016-SURVEY-Responses.xlsx"
df = pd.read_excel(candy16url)

## Clean up Data:

### Remove any columns consisting entirely of NaNs:

In [4]:
for header in df.columns:
    if df[header].isnull().all() == True:
        df.drop(header, axis=1, inplace=True)
    else:
        continue

### Check for duplicates in each column:

In [5]:
print("Duplicates in:")
for name in df.columns:
    print("{:.<60}".format(name), any(df[name].duplicated()))

Duplicates in:
Timestamp................................................... False
Are you going actually going trick or treating yourself?.... True
Your gender:................................................ True
How old are you?............................................ True
Which country do you live in?............................... True
Which state, province, county do you live in?............... True
 [100 Grand Bar]............................................ True
 [Anonymous brown globs that come in black and orange wrappers] True
 [Any full-sized candy bar]................................. True
 [Black Jacks].............................................. True
 [Bonkers (the candy)]...................................... True
 [Bonkers (the board game)]................................. True
 [Bottle Caps].............................................. True
 [Box'o'Raisins]............................................ True
 [Broken glow stick].....................................

##### No duplicates in 'Timestamp', all other columns contain duplicates

### Check for columns without NaNs:

In [6]:
print("NaN's in:")
for name in df.columns:
    print("{:.<60} {}".format(name, df[name].isna().any()))

NaN's in:
Timestamp................................................... False
Are you going actually going trick or treating yourself?.... False
Your gender:................................................ True
How old are you?............................................ True
Which country do you live in?............................... True
Which state, province, county do you live in?............... True
 [100 Grand Bar]............................................ True
 [Anonymous brown globs that come in black and orange wrappers] True
 [Any full-sized candy bar]................................. True
 [Black Jacks].............................................. True
 [Bonkers (the candy)]...................................... True
 [Bonkers (the board game)]................................. True
 [Bottle Caps].............................................. True
 [Box'o'Raisins]............................................ True
 [Broken glow stick]........................................ 

##### No NaNs in 'Timestamp' or 'Are you going actually going trick or treating yourself?' columns. All others contain at least 1 NaN.

### Replace NaNs in candy columns with 'UNKNOWN', because survey said to "leave blank if you have no idea what the item is":

In [7]:
startCol = df.columns[6]
endCol = df.columns[105]
df.loc[:, startCol:endCol] = df.loc[:, startCol:endCol].fillna('UNKNOWN')

In [12]:
# df.tail()

### Create a smaller DF with just User and Candy info:

In [9]:
dfLite = df.loc[:, :df.columns[105]]

### Make column names easier to read:

In [10]:
oldCols = list(df.columns[0:6])
newCols = ['Timestamp', 'TrickOrTreat', 'Gender', 'Age', 'Country', 'Locale']
dictCols = dict(zip(oldCols, newCols))

In [11]:
dfLite.rename(columns=dictCols, inplace=True)

### Clean up Country names:

In [17]:
countryDF = dfLite[['Timestamp', 'Country']].copy()

In [18]:
countryDF.head()

Unnamed: 0,Timestamp,Country
0,2016-10-24 05:09:23.033,Canada
1,2016-10-24 05:09:54.798,usa
2,2016-10-24 05:13:06.734,US
3,2016-10-24 05:14:17.192,usa
4,2016-10-24 05:14:24.625,USA


In [30]:
countryDF.Country.unique()

array(['Canada', 'usa', 'US', 'USA', 'UK', 'United States of America',
       'uSA', 'Japan', 'united states', 'USA ', 'canada', 'United States',
       'us', 'france', 'USSA', 'United States of America ', 'U.S.A.',
       'A tropical island south of the equator', 'england', 'uk',
       'Switzerland', 'Murica', 'United Kingdom', 'Neverland', 'USA!',
       'this one',
       "USA (I think but it's an election year so who can really tell)",
       'Korea', 51, 'Usa', nan, 'U.S.', 'Us', 'America ', 'Units States',
       'belgium', 'croatia', 'United states', 'Portugal', 'England',
       'USA USA USA', 'the best one - usa', 'USA! USA! USA!', 47,
       'united states ', 'Cascadia', 'españa', 'u.s.',
       "there isn't one for old men", 'United States ', 'Panama',
       'one of the best ones', 'The Yoo Ess of Aaayyyyyy',
       'United Kindom', 'France', 'America', 'Australia', 'hungary',
       'united states of america', 'UK ', 'Austria', 'Somewhere',
       'New Zealand', 54, 'Germ

#### Remove punctuation
Also replace fields that start wtih 'us' or end with 'usa' with just 'usa'.  
This should cover most instances of repeated USA, or other variants.

In [19]:
countryDF['CountryCleaned'] = countryDF['Country'].str.lower().str.replace('[.,!]', '')
countryDF['CountryCleaned'] = np.where(countryDF['CountryCleaned'].str.startswith('us'), 
                                       'usa', countryDF['CountryCleaned'])
countryDF['CountryCleaned'] = np.where(countryDF['CountryCleaned'].str.endswith('usa'), 
                                       'usa', countryDF['CountryCleaned'])

In [20]:
countryDF.CountryCleaned = countryDF.CountryCleaned.str.strip()

#### Perform replacements for remaining oddballs:
Note: 'uk' changed to 'united kingdom' to avoid conversion to 'Ukraine'.

In [22]:
replacements = {
    'uk': 'united kingdom',
    'units states': 'usa',
    'america': 'usa',
    'the yoo ess of aaayyyyyy': 'usa',
    'españa': 'spain',
    'murica': 'usa',
    'united kindom': 'united kingdom',
    'brasil': 'brazil',
    "god's country": 'usa',
    'united sates': 'usa',
    "sub-canadian north america 'merica": 'usa',
    'trumpistan': 'usa',
    'united stetes': 'usa',
    'united  states of america': 'usa'
}

countryDF['CountryCleaned'].replace(replacements, inplace=True)

In [23]:
countryDF.CountryCleaned = countryDF.CountryCleaned.str.strip()

#### Perform Fuzzy Matching on Countries:

In [24]:
def fuzzyCountry(countryName):
    try:
        return pycountry.countries.search_fuzzy(countryName)[0].name
    except:
#         return ("NOT FOUND")  # for troubleshooting matching
        return(np.nan)

countryDF['FuzzyMatch'] = countryDF.apply(lambda row: fuzzyCountry(row.CountryCleaned), axis=1)

#### A couple more fixes:

In [25]:
countryDF.FuzzyMatch.replace("Korea, Democratic People's Republic of", "North Korea", inplace=True)

In [26]:
countryDF.FuzzyMatch.iloc[725] = 'South Korea'

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


In [27]:
countryDF.FuzzyMatch.unique()

array(['Canada', 'United States', 'United Kingdom', 'Japan', 'France',
       nan, 'Switzerland', 'North Korea', 'Belgium', 'Croatia',
       'Portugal', 'Spain', 'Panama', 'Australia', 'Hungary', 'Austria',
       'New Zealand', 'Germany', 'Mexico', 'Brazil', 'South Korea',
       'Philippines', 'Tonga', 'Sweden', 'Netherlands', 'Finland',
       'American Samoa', 'China', 'Kenya'], dtype=object)

#### Perform a merge for non-null values:

In [28]:
countryDF['Combined'] = countryDF['FuzzyMatch'].combine_first(countryDF['Country'])

In [29]:
countryDF[countryDF['FuzzyMatch'].isnull()]

Unnamed: 0,Timestamp,Country,CountryCleaned,FuzzyMatch,Combined
57,2016-10-24 05:54:40.128,A tropical island south of the equator,a tropical island south of the equator,,A tropical island south of the equator
89,2016-10-24 06:15:22.247,Neverland,neverland,,Neverland
99,2016-10-24 06:21:18.515,this one,this one,,this one
310,2016-10-24 08:50:32.535,Cascadia,cascadia,,Cascadia
411,2016-10-24 10:21:01.946,there isn't one for old men,there isn't one for old men,,there isn't one for old men
444,2016-10-24 11:00:42.575,one of the best ones,one of the best ones,,one of the best ones
612,2016-10-24 14:48:10.100,Somewhere,somewhere,,Somewhere
1071,2016-10-27 10:42:49.888,See above,see above,,See above
1124,2016-10-27 12:23:56.227,The republic of Cascadia,the republic of cascadia,,The republic of Cascadia
1156,2016-10-27 13:53:10.380,Not the USA or Canada,not the usa or canada,,Not the USA or Canada


#### Merge back to dataset:

In [32]:
dfLite['Country'] = countryDF['FuzzyMatch']

In [33]:
dfLite.iloc[0:3, 0:6]

Unnamed: 0,Timestamp,TrickOrTreat,Gender,Age,Country,Locale
0,2016-10-24 05:09:23.033,No,Male,22,Canada,Ontario
1,2016-10-24 05:09:54.798,No,Male,45,United States,il
2,2016-10-24 05:13:06.734,No,Female,48,United States,Colorado


### Clean up Age column:

In [35]:
dfLite['Age'].unique()

array([22, 45, 48, 57, 42, 41, 47, 28, 44, 34, 46, 40, 31, 33, 35, 49, 16,
       60, 30, 51, 38, 54, 43, 50, 37, 55, 58, 32,
       'Old enough to know better ', 64, 61, 65, 26, 36, 78, 39, 52, 29,
       63, 'old enough',
       'As old as my tongue a few years older than my teeth', '50s',
       'old', nan, 10, 62, '0x2A', 23, 20, 24, 17, 27, 53, 18, 13, 56, 66,
       'Fifty.  Nine.  Ish.', 25, 59, 74, 19, 'Ancient',
       'I remember the Nixon administration', 'over retirement age', 14,
       79, 'Old enough', '50+', 70, '55+', 'over 40', 'Hahahahahaha', 68,
       81, 1000000000000000000, 'Old', 'Older than i act', 'really old',
       12, 67, 'blah', 23.2, 11, 'older than I want to be', 21, 71,
       'Not as old as you...', 82, 'Never ask a woman that question.',
       'old ', 'Same as yo mama', 15,
       'Too old to trick or treat without it being creepy', 'ancient',
       142, 7, 'Old enough to not Trick or Treat.', '49 11/12ths'],
      dtype=object)

##### Lots of weird ones here. Let's see what we can do.

In [36]:
dfAge = dfLite.copy()

#### Take a look at the non-numeric values:

In [37]:
dfAge['Age'][dfAge['Age'].str.isnumeric() == False]

57                             Old enough to know better 
99                                             old enough
100     As old as my tongue a few years older than my ...
102                                                   50s
104                                                   old
135                                                  0x2A
265                                   Fifty.  Nine.  Ish.
310                                               Ancient
415                   I remember the Nixon administration
423                                   over retirement age
444                                                   old
478                                            Old enough
495                                                   50+
512                                                   55+
516                                               over 40
524                                          Hahahahahaha
618                                            old enough
636           

#### Estimate ages given as text, rounding ranges down:

In [38]:
replacements = {
    '49 11/12ths': 49,
    '50+': 50,
    'over 40': 40,
    '55+': 55,
    'over retirement age': 65,
    'I remember the Nixon administration': 60, # estimated
    'Fifty.  Nine.  Ish.': 59,
    '0x2A': 42,  # in hexadecimal
    '50s': 50
}

dfAge['Age'].replace(replacements, inplace=True)

#### Replace remaining unknowns with null value:

In [39]:
dfAge[dfAge['Age'].str.isnumeric() == False] = np.nan

In [40]:
dfAge.Age.unique()

array([22, 45, 48, 57, 42, 41, 47, 28, 44, 34, 46, 40, 31, 33, 35, 49, 16,
       60, 30, 51, 38, 54, 43, 50, 37, 55, 58, 32, nan, 64, 61, 65, 26,
       36, 78, 39, 52, 29, 63, 10, 62, 23, 20, 24, 17, 27, 53, 18, 13, 56,
       66, 59, 25, 74, 19, 14, 79, 70, 68, 81, 1000000000000000000, 12,
       67, 23.2, 11, 21, 71, 82, 15, 142, 7], dtype=object)

#### Still some outliers.
Oldest person to ever live was 122; replace anything older with null:

In [41]:
dfAge[dfAge['Age'] > 122] = np.nan

#### Merge back to data set:

In [42]:
dfLite['Age'] = pd.to_numeric(dfAge['Age'])

## Heirarchical Index

In [44]:
dfIndexed = dfLite.copy()

#### Add TrickOrTreat, Gender, and Age indices:

In [45]:
indices = [dfIndexed.columns[1], dfIndexed.columns[2], dfIndexed.columns[3]]
dfIndexed.set_index(indices, inplace=True)

In [46]:
type(dfIndexed.index)

pandas.core.indexes.multi.MultiIndex

In [47]:
dfIndexed.head()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Timestamp,Country,Locale,[100 Grand Bar],[Anonymous brown globs that come in black and orange wrappers],[Any full-sized candy bar],[Black Jacks],[Bonkers (the candy)],[Bonkers (the board game)],[Bottle Caps],...,[Three Musketeers],[Tolberone something or other],[Trail Mix],[Twix],"[Vials of pure high fructose corn syrup, for main-lining into your vein]",[Vicodin],[Whatchamacallit Bars],[White Bread],[Whole Wheat anything],[York Peppermint Patties]
TrickOrTreat,Gender,Age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
No,Male,22.0,2016-10-24 05:09:23.033,Canada,Ontario,JOY,DESPAIR,JOY,MEH,MEH,MEH,MEH,...,JOY,JOY,DESPAIR,JOY,DESPAIR,JOY,DESPAIR,DESPAIR,DESPAIR,JOY
No,Male,45.0,2016-10-24 05:09:54.798,United States,il,MEH,MEH,JOY,JOY,DESPAIR,MEH,JOY,...,JOY,JOY,MEH,JOY,DESPAIR,JOY,JOY,DESPAIR,DESPAIR,JOY
No,Female,48.0,2016-10-24 05:13:06.734,United States,Colorado,JOY,DESPAIR,JOY,MEH,MEH,JOY,JOY,...,JOY,JOY,DESPAIR,JOY,DESPAIR,DESPAIR,MEH,DESPAIR,DESPAIR,JOY
No,Male,57.0,2016-10-24 05:14:17.192,United States,il,JOY,MEH,JOY,MEH,MEH,DESPAIR,DESPAIR,...,JOY,JOY,JOY,JOY,DESPAIR,JOY,JOY,DESPAIR,DESPAIR,JOY
Yes,Male,42.0,2016-10-24 05:14:24.625,United States,South Dakota,MEH,DESPAIR,JOY,DESPAIR,MEH,JOY,MEH,...,JOY,JOY,MEH,JOY,JOY,JOY,MEH,DESPAIR,DESPAIR,MEH


#### See what data we have for 40-year-old Males who go Trick-or-Treating:

In [48]:
dfIndexed.loc[('Yes', 'Male', 40)]

  dfIndexed.loc[('Yes', 'Male', 40)]


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Timestamp,Country,Locale,[100 Grand Bar],[Anonymous brown globs that come in black and orange wrappers],[Any full-sized candy bar],[Black Jacks],[Bonkers (the candy)],[Bonkers (the board game)],[Bottle Caps],...,[Three Musketeers],[Tolberone something or other],[Trail Mix],[Twix],"[Vials of pure high fructose corn syrup, for main-lining into your vein]",[Vicodin],[Whatchamacallit Bars],[White Bread],[Whole Wheat anything],[York Peppermint Patties]
TrickOrTreat,Gender,Age,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
Yes,Male,40.0,2016-10-24 07:02:40.720,United States,Missouri,JOY,DESPAIR,JOY,DESPAIR,DESPAIR,MEH,MEH,...,JOY,JOY,DESPAIR,JOY,DESPAIR,JOY,MEH,DESPAIR,DESPAIR,MEH
Yes,Male,40.0,2016-10-24 16:22:55.590,United States,CA,MEH,DESPAIR,JOY,MEH,MEH,MEH,DESPAIR,...,MEH,JOY,DESPAIR,MEH,DESPAIR,MEH,DESPAIR,DESPAIR,DESPAIR,DESPAIR
Yes,Male,40.0,2016-10-24 16:48:26.992,United States,ca,JOY,DESPAIR,JOY,DESPAIR,MEH,MEH,JOY,...,MEH,MEH,DESPAIR,MEH,MEH,MEH,JOY,DESPAIR,DESPAIR,MEH


## Pivot Tables

In [49]:
pivotDF = dfLite.copy()