In [None]:
import os.path
from pyspark.mllib.stat import Statistics
import matplotlib as mpl
import matplotlib.pyplot as plt

In [None]:
# Column names of the survey CSV, in column order, as one comma-separated string.
headers = 'Age,"AttendedBootcamp","BootcampFinish","BootcampFullJobAfter","BootcampLoanYesNo","BootcampMonthsAgo","BootcampName","BootcampPostSalary","BootcampRecommend","ChildrenNumber","CityPopulation","CodeEventBootcamp","CodeEventCoffee","CodeEventConferences","CodeEventDjangoGirls","CodeEventGameJam","CodeEventGirlDev","CodeEventHackathons","CodeEventMeetup","CodeEventNodeSchool","CodeEventNone","CodeEventOther","CodeEventRailsBridge","CodeEventRailsGirls","CodeEventStartUpWknd","CodeEventWomenCode","CodeEventWorkshop","CommuteTime","CountryCitizen","CountryLive","EmploymentField","EmploymentFieldOther","EmploymentStatus","EmploymentStatusOther","ExpectedEarning","FinanciallySupporting","Gender","HasChildren","HasDebt","HasFinancialDependents","HasHighSpdInternet","HasHomeMortgage","HasServedInMilitary","HasStudentDebt","HomeMortgageOwe","HoursLearning","ID.x","ID.y","Income","IsEthnicMinority","IsReceiveDiabilitiesBenefits","IsSoftwareDev","IsUnderEmployed","JobApplyWhen","JobPref","JobRelocateYesNo","JobRoleInterest","JobRoleInterestOther","JobWherePref","LanguageAtHome","MaritalStatus","MoneyForLearning","MonthsProgramming","NetworkID","Part1EndTime","Part1StartTime","Part2EndTime","Part2StartTime","PodcastChangeLog","PodcastCodeNewbie","PodcastCodingBlocks","PodcastDeveloperTea","PodcastDotNetRocks","PodcastHanselminutes","PodcastJSJabber","PodcastJsAir","PodcastNone","PodcastOther","PodcastProgrammingThrowDown","PodcastRubyRogues","PodcastSEDaily","PodcastShopTalk","PodcastTalkPython","PodcastWebAhead","ResourceBlogs","ResourceBooks","ResourceCodeWars","ResourceCodecademy","ResourceCoursera","ResourceDevTips","ResourceEdX","ResourceEggHead","ResourceFCC","ResourceGoogle","ResourceHackerRank","ResourceKhanAcademy","ResourceLynda","ResourceMDN","ResourceOdinProj","ResourceOther","ResourcePluralSight","ResourceReddit","ResourceSkillCrush","ResourceSoloLearn","ResourceStackOverflow","ResourceTreehouse","ResourceUdacity","ResourceUdemy","ResourceW3Schools","ResourceYouTube","SchoolDegree","SchoolMajor","StudentDebtOwe"'
# Strip the double quotes and prefix every name with its zero-based position, so the
# numeric column indices used when indexing parsed rows can be looked up by eye.
# A comprehension is used instead of a tuple-parameter lambda, which only parses on Python 2.
headers = ['{0} {1}'.format(i, header.replace('"', '')) for i, header in enumerate(headers.split(','))]

for h in headers:
    print(h)

In [None]:
# Relative location of the survey CSV: <base_dir>/<input_path>.
base_dir = 'data'
input_path = '2016-FCC-New-Coders-Survey-Data.csv'
file_name = os.path.join(base_dir, input_path)

def split(l):
    """Split one line of CSV text into its fields, character by character.

    Commas that appear between a pair of double quotes are kept as part of
    the field; the double-quote characters themselves are dropped from the
    output. Newlines get no special treatment.

    Args:
        l: a single line of CSV text.

    Returns:
        A list with one string per field. The text after the last comma is
        always included (possibly as an empty string), so a line without any
        comma yields a one-element list.
    """
    mappedLine = []
    tmp = ''
    in_string = False

    for c in l:
        if c == '"':
            # A quote only toggles the "inside a quoted field" state.
            in_string = not in_string
        elif c == ',' and not in_string:
            # Unquoted comma: the current field is complete.
            mappedLine.append(tmp)
            tmp = ''
        else:
            tmp += c

    # The final field has no terminating comma, so it must be flushed here;
    # otherwise the last column of every row is silently lost.
    mappedLine.append(tmp)
    return mappedLine

# Read the CSV as lines of text, drop every line containing the column name
# 'AttendedBootcamp' (presumably only the header row) and parse the rest into fields.
# The result is cached because several separate actions are run on this RDD
# (the count below, plus the statistics/collect cells); without caching each
# action re-reads the file and re-parses every line.
filtered_set = sc.textFile(file_name).filter(lambda l: 'AttendedBootcamp' not in l).map(split).cache()

# Single-argument print(...) behaves the same on Python 2 and Python 3.
print('Count of records : ' + str(filtered_set.count()))

In [None]:
def make_features_gender(line):
    """One-hot encode the gender value found at index 36 of a parsed row.

    The value is compared case-insensitively. The returned 4-element list
    has a single 1 at position 0 for 'male', 1 for 'female', 3 for 'na',
    and 2 for any other value.
    """
    known_slots = {'male': 0, 'female': 1, 'na': 3}
    # Anything that is not one of the known values lands in slot 2.
    slot = known_slots.get(line[36].lower(), 2)
    return [1 if position == slot else 0 for position in range(4)]

# Map every parsed row to its 4-element one-hot gender vector.
featured_gender = filtered_set.map(make_features_gender)

In [None]:
%matplotlib inline

# Larger font so the percentage labels stay readable.
mpl.rcParams.update({'font.size': 20.0})

# Column means of the one-hot vectors = share of rows in each gender category.
summary1 = Statistics.colStats(featured_gender)
mean1 = summary1.mean()
fracs1 = list(mean1[:4])

labels = ['Male', 'Female', 'Other', 'Not answered']
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral']
# Pull every wedge except the first one away from the centre.
explode = (0.0, 0.3, 0.3, 0.3)

fig = plt.figure(figsize=(15, 7))
fig.suptitle('Gender distribution', fontsize=14, fontweight='bold')
ax1 = fig.add_subplot(1, 2, 1)
ax1.pie(
    fracs1,
    explode=explode,
    labels=labels,
    colors=colors,
    autopct='%1.1f%%',
    shadow=True,
    startangle=90
)

plt.show()

In [None]:
"""
Gender calculations
Not meant to be used like this!
"""

from collections import defaultdict

# Tally the raw values of the gender column (index 36).
# collect() pulls every parsed row back to the driver before counting.
genders = defaultdict(int)
collected = filtered_set.collect()
for row in collected:
    genders[row[36]] += 1
for value, count in genders.items():
    print('{} {}'.format(value, count))

In [None]:
"""
Age bucket calculations (features for the age pie chart)
"""
def make_features_age(line):
    """One-hot encode the age value found at index 0 of a parsed row.

    Returns a 5-element list with a single 1 whose position means:
    0 -> the value is the string 'NA', 1 -> age <= 20, 2 -> 20 < age <= 35,
    3 -> 35 < age <= 50, 4 -> age > 50.

    Any value other than 'NA' is converted with int(), so a non-integer
    string raises ValueError.
    """
    one_hot = [0, 0, 0, 0, 0]
    raw_age = line[0]

    if raw_age == 'NA':
        one_hot[0] = 1
        return one_hot

    years = int(raw_age)
    # Each inclusive upper bound that the age exceeds moves it one bucket up.
    upper_bounds = (20, 35, 50)
    slot = 1 + sum(1 for bound in upper_bounds if years > bound)
    one_hot[slot] = 1
    return one_hot

# Map every parsed row to its 5-element one-hot age-bucket vector.
featured_age = filtered_set.map(make_features_age)

#for x in featured_age.collect(): 
#    print x

In [None]:
%matplotlib inline

# Make the percentages bigger! (readability)
mpl.rcParams['font.size'] = 20.0

# Column means of the one-hot vectors = share of rows in each age bucket.
summary_age = Statistics.colStats(featured_age)
mean_age = summary_age.mean()
fracs_age = [mean_age[0], mean_age[1], mean_age[2], mean_age[3], mean_age[4]]

explode = (0.3, 0.3, 0.3, 0.3, 0.3)
# Labels mirror the bucket boundaries in make_features_age, whose upper bounds
# are inclusive (age <= 20, <= 35, <= 50); the last bucket is everything above 50.
labels = ['NA', '<= 20', '20 < ... <= 35', '35 < ... <= 50', '> 50']
colors = ['yellowgreen', 'gold', 'lightskyblue', 'lightcoral', 'green']
fig = plt.figure(figsize=(15, 7))
fig.suptitle('Age distribution', fontsize=14, fontweight='bold')
ax1 = fig.add_subplot(121)
ax1.pie(fracs_age, explode=explode, labels=labels, colors=colors, autopct='%1.1f%%', shadow=True, startangle=90)

plt.show()