In [1]:
import pandas as pd
import numpy as np
import pycountry

In [2]:
candy16url = "https://www.scq.ubc.ca/wp-content/uploads/2016/10/BOING-BOING-CANDY-HIERARCHY-2016-SURVEY-Responses.xlsx"
df = pd.read_excel(candy16url)

### Remove any columns consisting entirely of NaNs:

In [3]:
for header in df.columns:
    if df[header].isnull().all() == True:
        df.drop(header, axis=1, inplace=True)
    else:
        continue

### Check for duplicates in each column:

In [4]:
# print("Duplicates in:")
# for name in df.columns:
#     print("{:.<60}".format(name), any(df[name].duplicated()))

No duplicates in 'Timestamp', all other columns contain duplicates

### Check for columns without NaNs:

In [5]:
# print("NaN's in:")
# for name in df.columns:
#     print("{:.<60} {}".format(name, df[name].isna().any()))

No NaNs in 'Timestamp' or 'Are you going actually going trick or treating yourself?' columns. All others contain at least 1 NaN.

### Replace NaNs in candy columns with 'UNKNOWN', because survey said to "leave blank if you have no idea what the item is":

In [6]:
startCol = df.columns[6]
endCol = df.columns[105]
df.loc[:, startCol:endCol] = df.loc[:, startCol:endCol].fillna('UNKNOWN')

In [7]:
df.tail()

Unnamed: 0,Timestamp,Are you going actually going trick or treating yourself?,Your gender:,How old are you?,Which country do you live in?,"Which state, province, county do you live in?",[100 Grand Bar],[Anonymous brown globs that come in black and orange wrappers],[Any full-sized candy bar],[Black Jacks],...,What is your favourite font?,Please estimate the degree(s) of separation you have from the following celebrities [JK Rowling],Please estimate the degree(s) of separation you have from the following celebrities [JJ Abrams],Please estimate the degree(s) of separation you have from the following celebrities [Beyoncé],Please estimate the degree(s) of separation you have from the following celebrities [Bieber],Please estimate the degree(s) of separation you have from the following celebrities [Kevin Bacon],Please estimate the degree(s) of separation you have from the following celebrities [Francis Bacon (1561 - 1626)],"Which day do you prefer, Friday or Sunday?","Do you eat apples the correct way, East to West (side to side) or do you eat them like a freak of nature, South to North (bottom to top)?","When you see the above image of the 4 different websites, which one would you most likely check out (please be honest)."
1254,2016-10-29 16:53:52.516,No,Female,52.0,USA,TX,JOY,DESPAIR,JOY,MEH,...,Candara,3 or higher,3 or higher,3 or higher,3 or higher,2,3 or higher,Friday,East to West,Science: Latest News and Headlines
1255,2016-10-30 06:53:54.735,No,Male,33.0,united states,minnesota,JOY,DESPAIR,JOY,DESPAIR,...,Trebuchet,"Actually, that's me.",3 or higher,3 or higher,3 or higher,3 or higher,"Actually, that's me.",Friday,Sinusoidally around the equator,Science: Latest News and Headlines
1256,2016-10-30 11:06:10.827,No,Male,,,,JOY,MEH,JOY,UNKNOWN,...,Comic Sans,,,,,,,Sunday,nne to east to nnw to s to n,Science: Latest News and Headlines
1257,2016-10-30 16:07:26.539,No,Male,48.0,canada,BC,UNKNOWN,DESPAIR,JOY,DESPAIR,...,,1,2,3 or higher,3 or higher,2,3 or higher,Sunday,East to West,Science: Latest News and Headlines
1258,2016-10-30 17:06:45.660,Yes,Female,44.0,Us,Nh,JOY,MEH,JOY,JOY,...,,3 or higher,3 or higher,3 or higher,3 or higher,3 or higher,3 or higher,Sunday,East to West,Daily Dish


### Create a smaller DF with just User and Candy info:

In [8]:
dfLite = df.loc[:, :df.columns[105]]

### Clean up Country names:

In [9]:
tempDF = dfLite[['Timestamp', 'Which country do you live in?']].copy()
tempDF.columns = ['Timestamp', 'Country']

In [10]:
tempDF.head()

Unnamed: 0,Timestamp,Country
0,2016-10-24 05:09:23.033,Canada
1,2016-10-24 05:09:54.798,usa
2,2016-10-24 05:13:06.734,US
3,2016-10-24 05:14:17.192,usa
4,2016-10-24 05:14:24.625,USA


In [11]:
tempDF['CountryCleaned'] = tempDF['Country'].str.lower().str.replace('[.,!]', '')
tempDF['CountryCleaned'] = np.where(tempDF['CountryCleaned'].str.startswith('us'), 'usa', tempDF['CountryCleaned'])
tempDF['CountryCleaned'] = np.where(tempDF['CountryCleaned'].str.endswith('usa'), 'usa', tempDF['CountryCleaned'])

In [13]:
replacements = {
    'uk': 'united kingdom',
    'units states': 'usa',
    'america': 'usa',
    'the yoo ess of aaayyyyyy': 'usa',
    'españa': 'spain',
    'murica': 'usa',
    'united kindom': 'united kingdom',
    'brasil': 'brazil',
    "god's country": 'usa',
    'united sates': 'usa',
    "sub-canadian north america 'merica": 'usa',
    'trumpistan': 'usa',
    'united stetes': 'usa',
    'united  states of america': 'usa'
}

tempDF['CountryCleaned'].replace(replacements, inplace=True)

In [14]:
tempDF.CountryCleaned = tempDF.CountryCleaned.str.strip()

In [15]:
def fuzzyCountry(countryName):
    try:
        return pycountry.countries.search_fuzzy(countryName)[0].name
    except:
#         return ("NOT FOUND")  # for troubleshooting matching
        return(np.nan)

tempDF['FuzzyMatch'] = tempDF.apply(lambda row: fuzzyCountry(row.CountryCleaned), axis=1)

In [16]:
tempDF.head()

Unnamed: 0,Timestamp,Country,CountryCleaned,FuzzyMatch
0,2016-10-24 05:09:23.033,Canada,canada,Canada
1,2016-10-24 05:09:54.798,usa,usa,United States
2,2016-10-24 05:13:06.734,US,usa,United States
3,2016-10-24 05:14:17.192,usa,usa,United States
4,2016-10-24 05:14:24.625,USA,usa,United States


In [17]:
# tempDF[tempDF['FuzzyMatch'] == 'NOT FOUND']

In [18]:
tempDF['Combined'] = tempDF['FuzzyMatch'].combine_first(tempDF['Country'])

In [19]:
tempDF[tempDF['FuzzyMatch'].isnull()]

Unnamed: 0,Timestamp,Country,CountryCleaned,FuzzyMatch,Combined
57,2016-10-24 05:54:40.128,A tropical island south of the equator,a tropical island south of the equator,,A tropical island south of the equator
89,2016-10-24 06:15:22.247,Neverland,neverland,,Neverland
99,2016-10-24 06:21:18.515,this one,this one,,this one
310,2016-10-24 08:50:32.535,Cascadia,cascadia,,Cascadia
411,2016-10-24 10:21:01.946,there isn't one for old men,there isn't one for old men,,there isn't one for old men
444,2016-10-24 11:00:42.575,one of the best ones,one of the best ones,,one of the best ones
612,2016-10-24 14:48:10.100,Somewhere,somewhere,,Somewhere
725,2016-10-24 19:21:39.300,South Korea,south korea,,South Korea
1071,2016-10-27 10:42:49.888,See above,see above,,See above
1124,2016-10-27 12:23:56.227,The republic of Cascadia,the republic of cascadia,,The republic of Cascadia
