In [35]:
import pandas as pd
import re

# Pre-Study Processing

In [36]:
responses = pd.read_csv('pre_study.csv')
responses.head()

Unnamed: 0,Timestamp,Participant ID,I have read and signed the Informed Consent Form.,Age,Gender,"If currently a student, what academic degree and program of study are you currently pursuing? What is your degree progress?","If you've worked in industry, what was your position and how long did you spend in this role?",How many years of experience do you have programming?,About how many hours a week do you engage in programming?,Which describe your programming background and style? (Check all that apply),...,I am comfortable with programming in these domains. Please mark N/A if you don't have experience in these. [Installation Work],Rate the following statements: [I consider myself a member of the CS community.],Rate the following statements: [I feel comfortable writing code from scratch],Rate the following statements: [I need a code skeleton to get started when I code.],Rate the following statements: [I feel comfortable using version control to manage my code and/or work collaboratively.],Rate the following statements: [I feel capable as a programmer.],Rate the following statements: [I feel accepted by my peers.],Rate the following statements: [I feel comfortable sharing my code with others.],What was the last time you used Python? Describe what it was for.,Can you describe a project where you used Jupyter Notebooks? Please describe the process.
0,8/27/2018 13:35:09,11,Yes,20,Male,"4th year B.A. Computer Science, Cognitive Science",Deep learning intern (3 months),6,12,Early-exposure (Middle/high school clubs/class...,...,,Option 1,,,,,,,Yesterday. This study,I wrote a Jupyter Notebook logging extension f...
1,8/28/2018 13:20:17,111,Yes,20,Female,"B.A. in Computer Science, 3rd year",Software Development Engineer Intern for 3 months,3,6,Early-exposure (Middle/high school clubs/class...,...,,4 - Agree,3 - Neutral,3 - Neutral,4 - Agree,3 - Neutral,3 - Neutral,3 - Neutral,Spring 2018 semester for CS 188,EE16A labs used Jupyter Notebooks -- they prov...
2,8/28/2018 14:25:33,112,Yes,20,Male,B.S. Mechanical Engineering,"4 summers doing robotics design, 1 year doing ...",4,3,Light University-exposure (1-2 programming cla...,...,,3 - Neutral,3 - Neutral,4 - Agree,4 - Agree,4 - Agree,4 - Agree,4 - Agree,Summer 2018. I used it to program a 1D underwa...,in EE16A we used them for homeworks and labs
3,8/28/2018 16:20:32,113,Yes,21,Male,"B.A. in Cognitive Science, B.S. in Business Ad...",2 internships: Advertising Design at Spotify; ...,3,0,"DIY Programmer (""Picked it up along the way""),...",...,,2 - Disagree,2 - Disagree,4 - Agree,4 - Agree,3 - Neutral,4 - Agree,4 - Agree,The last time I used Python was in the Spring ...,"In COG SCI 131, I used Jupyter Notebooks to pr..."
4,8/29/2018 9:20:46,211,Yes,18,Female,"B.S. in Mechanical Engineering, 3rd year",,7,8,Early-exposure (Middle/high school clubs/class...,...,,3 - Neutral,4 - Agree,2 - Disagree,4 - Agree,3 - Neutral,4 - Agree,4 - Agree,"Personal project, getting data from an API and...","EE16B labs/homeworks. Open anaconda, open jupy..."


In [37]:
output_features = ['ID', 'Age', 'Gender',
                   'Bachelor_EECS', 'Bachelor_other', 'PhD_EECS', 'PhD_other', 
                   'Industry_Experience','Years_Programming', 'Hours_Programming_Per_Week',
                   'Early_Exposure', 'Light_Exposure', 'Heavy_Exposure', 'DIY_progranner', 'Book_learner', 
                   'Lecture_learner', 'Web_learner', 'PSet_learner', 'Project_learner', 'Group_project_learner',
                   'Collaborative_learner', 'Independent_learner', 'OH_attendee',
                   'Java', 'Python', 'C', 'Ruby', 'Javascript', 'C#', 'PHP', 'Obj-C', 'SQL', 'Swift', 
                   'C++', 'R', 'Scala', 'Go', 'HTML/CSS', 'Markup',
                   'Recursion', 'Iteration', 'Arrays', 'Hash_Map/Table', 'Dictionaries', 'Tree/Graph_Traversal', 
                   'Mem_Mgmt', 'Caches', 'Dynamic_Programming', 'Sorting_Searching', 'Stacks_Queues', 'Bit_twiddling', 'Regex', 
                   'Data_Processing', 'Graphics', 'Web_Apps', 'Web_Dev', 'OS_Mgmt', 'Network_System_Mgmt', 'Interactive_Device_Design', 
                   'Obj/Img/Activity_Recognition', 'Robotics', 'NLP', 'Interface_Prototyping', 'Vision_Sim', 'Software_App', 
                   'AR/VR', 'Installation_Work',
                   'Member_CS_Community', 'Comfortable_from_scratch', 'Need_skeleton', 'Comfortable_version_control',
                   'Capable_programmer', 'Accepted_by_peers', 'Comfortable_sharing']

In [38]:
def Bachelor(text):
    if type(text) is not str:
        return False
    
    return 'B.A.' in text or 'B.S.' in text

def PhD(text):
    if type(text) is not str:
        return False
    
    return 'phd' in text.lower() or "Ph.D." in text

def EECS(text):
    if type(text) is not str:
        return False
    
    return 'EECS' in text or 'Computer Science' in text or 'Electrical Engineering' in text

In [39]:
def process_likert(text):
    if type(text) is float:
        return 0
    else:
        return int(text[0])

In [40]:
def process_features(row):
    feats = {}
    feats['ID'] = row[1]
    feats['Age'] = row[3]
    feats['Gender'] = int(row[4] == 'Female')
    
    feats['Bachelor_EECS'] = int(Bachelor(row[5]) and EECS(row[5]))
    feats['Bachelor_other'] = int(Bachelor(row[5]) and not EECS(row[5]))
    feats['PhD_EECS'] = int(PhD(row[5]) and EECS(row[5]))
    feats['PhD_other'] = int(PhD(row[5]) and not EECS(row[5]))
    
    feats["Industry_Experience"] = int(type(row[6]) is str)
    try:
        feats['Years_Programming'] = float(row[7])
    except:
        feats['Years_Programming'] = float(re.findall(r'[0-9]+',row[7])[0]) 
    feats['Hours_Programming_Per_Week'] = int(row[8])
    
    feats['Early_Exposure'] = int('Early-exposure' in row[9])
    feats['Light_Exposure'] = int('Light University-exposure' in row[9])
    feats['Heavy_Exposure'] = int('Heavy University-exposure' in row[9])
    feats['DIY_progranner'] = int('DIY Programmer' in row[9])
    feats['Book_learner'] = int('Book-learner' in row[9])
    feats['Lecture_learner'] = int('Lecture-learner ' in row[9] or 'Lecture learner' in row[9])
    feats['Web_learner'] = int('Web-learner' in row[9])
    feats['PSet_learner'] = int('Problem set learner' in row[9])
    feats['Project_learner'] = int('Project learner' in row[9])
    feats['Group_project_learner'] = int('Group project learner' in row[9])
    feats['Collaborative_learner'] = int('Collaborative learner' in row[9])
    feats['Independent_learner'] = int('Independent learner ' in row[9])
    feats['OH_attendee'] = int('Office hours attendee' in row[9])
    
    i = 10
    for label in output_features[23:]:
        feats[label] = process_likert(row[i])
        i += 1
    
    return feats

In [41]:
featurized = pd.DataFrame(columns=output_features)
for idx, row in responses.iterrows():
    if idx == 0: # Matt's response
        continue
    features = process_features(row)
    featurized = featurized.append(features, ignore_index=True)

In [42]:
for label in output_features:
    if label != 'Hours_Programming_Per_Week':
        featurized[label] = featurized[label].astype(int)

In [43]:
featurized

Unnamed: 0,ID,Age,Gender,Bachelor_EECS,Bachelor_other,PhD_EECS,PhD_other,Industry_Experience,Years_Programming,Hours_Programming_Per_Week,...,Software_App,AR/VR,Installation_Work,Member_CS_Community,Comfortable_from_scratch,Need_skeleton,Comfortable_version_control,Capable_programmer,Accepted_by_peers,Comfortable_sharing
0,111,20,1,1,0,0,0,1,3,6.0,...,4,4,0,4,3,3,4,3,3,3
1,112,20,0,0,1,0,0,1,4,3.0,...,3,2,0,3,3,4,4,4,4,4
2,113,21,0,0,1,0,0,1,3,0.0,...,0,0,0,2,2,4,4,3,4,4
3,211,18,1,0,1,0,0,0,7,8.0,...,0,0,0,3,4,2,4,3,4,4
4,212,27,0,0,0,0,0,1,6,2.0,...,5,2,4,2,5,1,5,5,5,5
5,214,25,0,0,0,1,0,1,7,20.0,...,5,0,0,4,4,2,5,5,5,5
6,411,19,1,1,0,0,0,1,7,10.0,...,4,2,2,4,4,2,5,4,4,5
7,412,19,0,0,0,0,0,0,2,5.0,...,4,3,1,4,4,2,4,4,4,4
8,413,19,0,0,0,0,0,1,3,10.0,...,5,1,0,4,5,1,5,4,4,5
9,414,18,0,0,1,0,0,0,1,3.0,...,0,0,0,4,3,4,3,3,4,3


# Pre-Study Stats

In [44]:
demographics = output_features[1:3]
degrees = output_features[3:7]
programming_experience = output_features[7:10]
learning_types = output_features[10:23]
programming_languages = output_features[23:38]
programming_concepts = output_features[38:52]
programming_domains = output_features[52:67]
soft_hard = output_features[67:]

In [45]:
# Gender = 1 <--> female
featurized[demographics].describe()

Unnamed: 0,Age,Gender
count,17.0,17.0
mean,22.0,0.470588
std,5.926635,0.514496
min,18.0,0.0
25%,19.0,0.0
50%,20.0,0.0
75%,22.0,1.0
max,43.0,1.0


In [46]:
featurized[degrees].describe()

Unnamed: 0,Bachelor_EECS,Bachelor_other,PhD_EECS,PhD_other
count,17.0,17.0,17.0,17.0
mean,0.294118,0.294118,0.176471,0.0
std,0.469668,0.469668,0.392953,0.0
min,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0
50%,0.0,0.0,0.0,0.0
75%,1.0,1.0,0.0,0.0
max,1.0,1.0,1.0,0.0


In [47]:
featurized[programming_experience].describe()

Unnamed: 0,Industry_Experience,Years_Programming,Hours_Programming_Per_Week
count,17.0,17.0,17.0
mean,0.764706,5.411765,11.764706
std,0.437237,4.458963,9.627626
min,0.0,1.0,0.0
25%,1.0,3.0,5.0
50%,1.0,4.0,8.0
75%,1.0,7.0,20.0
max,1.0,20.0,30.0


In [48]:
featurized[learning_types].describe()

Unnamed: 0,Early_Exposure,Light_Exposure,Heavy_Exposure,DIY_progranner,Book_learner,Lecture_learner,Web_learner,PSet_learner,Project_learner,Group_project_learner,Collaborative_learner,Independent_learner,OH_attendee
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,0.411765,0.058824,0.705882,0.529412,0.294118,0.352941,0.470588,0.352941,0.823529,0.176471,0.529412,0.647059,0.176471
std,0.5073,0.242536,0.469668,0.514496,0.469668,0.492592,0.514496,0.492592,0.392953,0.392953,0.514496,0.492592,0.392953
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
50%,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,1.0,0.0
75%,1.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,0.0
max,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0


In [49]:
featurized[programming_languages].describe()

Unnamed: 0,Java,Python,C,Ruby,Javascript,C#,PHP,Obj-C,SQL,Swift,C++,R,Scala,Go,HTML/CSS
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,3.764706,4.470588,2.705882,0.941176,3.058824,0.882353,1.0,1.117647,2.529412,1.176471,2.058824,1.0,1.117647,0.823529,3.352941
std,1.393261,0.514496,1.759428,1.088037,1.519481,1.268974,1.541104,1.536325,1.419403,1.467791,1.748949,1.224745,1.653872,1.014599,1.538716
min,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,4.0,2.0,0.0,2.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,2.0
50%,4.0,4.0,3.0,1.0,3.0,0.0,0.0,0.0,3.0,1.0,2.0,1.0,0.0,0.0,4.0
75%,5.0,5.0,4.0,2.0,4.0,2.0,1.0,2.0,3.0,2.0,4.0,1.0,1.0,2.0,5.0
max,5.0,5.0,5.0,3.0,5.0,4.0,5.0,4.0,5.0,4.0,5.0,4.0,5.0,3.0,5.0


In [50]:
featurized[programming_concepts].describe()

Unnamed: 0,Markup,Recursion,Iteration,Arrays,Hash_Map/Table,Dictionaries,Tree/Graph_Traversal,Mem_Mgmt,Caches,Dynamic_Programming,Sorting_Searching,Stacks_Queues,Bit_twiddling,Regex
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,1.647059,4.117647,4.470588,4.588235,3.411765,3.764706,3.411765,2.941176,2.411765,2.294118,3.411765,3.529412,2.882353,3.176471
std,1.998161,0.696631,0.71743,0.618347,1.872793,1.393261,1.416811,1.297622,1.277636,1.611083,1.064121,1.545867,1.409005,1.185079
min,0.0,3.0,3.0,3.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
25%,0.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,2.0,2.0,3.0,3.0,2.0,3.0
50%,1.0,4.0,5.0,5.0,4.0,4.0,4.0,3.0,3.0,2.0,4.0,4.0,3.0,3.0
75%,4.0,5.0,5.0,5.0,5.0,5.0,4.0,4.0,3.0,3.0,4.0,4.0,4.0,4.0
max,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0,5.0


In [51]:
featurized[programming_domains].describe()

Unnamed: 0,Data_Processing,Graphics,Web_Apps,Web_Dev,OS_Mgmt,Network_System_Mgmt,Interactive_Device_Design,Obj/Img/Activity_Recognition,Robotics,NLP,Interface_Prototyping,Vision_Sim,Software_App,AR/VR,Installation_Work
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,3.470588,1.705882,2.588235,3.0,1.470588,1.294118,1.941176,1.882353,1.470588,1.764706,2.823529,0.823529,3.235294,1.294118,1.352941
std,1.328422,1.447615,1.970369,1.903943,1.699913,1.686887,1.638238,1.7278,1.504894,1.678147,1.740521,1.131111,1.953504,1.311712,1.497547
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,3.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,3.0,0.0,0.0
50%,4.0,2.0,3.0,4.0,1.0,0.0,2.0,2.0,1.0,2.0,3.0,0.0,4.0,1.0,1.0
75%,4.0,3.0,4.0,4.0,2.0,2.0,4.0,3.0,2.0,3.0,4.0,2.0,5.0,2.0,3.0
max,5.0,4.0,5.0,5.0,5.0,5.0,4.0,5.0,5.0,5.0,5.0,3.0,5.0,4.0,4.0


In [52]:
featurized[soft_hard].describe()

Unnamed: 0,Member_CS_Community,Comfortable_from_scratch,Need_skeleton,Comfortable_version_control,Capable_programmer,Accepted_by_peers,Comfortable_sharing
count,17.0,17.0,17.0,17.0,17.0,17.0,17.0
mean,3.823529,3.941176,2.294118,4.294118,3.764706,4.117647,4.117647
std,0.951006,0.899346,1.046704,0.771744,0.752447,0.600245,0.781213
min,2.0,2.0,1.0,3.0,3.0,3.0,3.0
25%,3.0,3.0,2.0,4.0,3.0,4.0,4.0
50%,4.0,4.0,2.0,4.0,4.0,4.0,4.0
75%,4.0,5.0,3.0,5.0,4.0,4.0,5.0
max,5.0,5.0,4.0,5.0,5.0,5.0,5.0


In [53]:
# Returns the response for each ID and aggregated response statistics
# Use this if the above is not informative enough
def get_stats(columns, df=featurized):
    print(df[['ID'] + columns])
    return df[['ID'] + columns].describe()

In [54]:
get_stats(['Robotics'])

     ID  Robotics
0   111         0
1   112         5
2   113         0
3   211         3
4   212         1
5   214         2
6   411         2
7   412         1
8   413         4
9   414         0
10  512         0
11  613         1
12  614         2
13  711         0
14  713         2
15  813         0
16  811         2


Unnamed: 0,ID,Robotics
count,17.0,17.0
mean,435.882353,1.470588
std,246.364284,1.504894
min,111.0,0.0
25%,212.0,0.0
50%,413.0,1.0
75%,614.0,2.0
max,813.0,5.0


# Post-Study Processing

In [55]:
post_study = pd.read_csv('post_study.csv')
post_study

Unnamed: 0,Timestamp,Participant ID,How would you rate your overall coding session?,"Rate the statement: During the coding session, I programmed like I would normally program.",Rate the statement: The wristband affected my ability to code.,We set up your workstation like this. Can you describe how your workstation is similar/different?,"Is there anything that you do differently that you couldn't do within this setup? e.g. A specific place you program, a specific time, a specific group of people? Would you work longer/shorter?"
0,8/28/2018 14:06:22,111,4,4,5,Trackpad. Pretty similar.,Closed room without people; work at night. Hea...
1,8/28/2018 15:20:55,112,6,5,1,If I'm doing homework I'll often sit on the co...,I usually binge program since as soon as you w...
2,8/28/2018 17:12:09,113,3,6,1,I always program on a laptop using the laptop ...,I tend to work with noise-canceling headphones...
3,8/29/2018 10:02:15,211,5,5,6,I almost entirely work on a laptop so this key...,"no specific places, times, or people. usually ..."
4,8/29/2018 14:11:26,212,7,4,1,"i have vim keybindings in jupyter notebook, an...","eh, i listen to music, otherwise i program pr..."
5,8/29/2018 16:13:24,214,6,6,4,"My desktop has two monitors, although I primar...",I normally listen to music while I program. Th...
6,8/31/2018 11:56:13,411,6,6,2,"I normally use my laptop, which is a Lenovo Th...",One difference is that during the session I fo...
7,8/31/2018 12:49:52,412,2,5,1,"I normally work on a laptop, not with a monito...",I think this was pretty representative of what...
8,8/31/2018 14:25:58,413,4,3,4,"In my workstation, I usually have 2 monitors a...","I typically program later at night, and work o..."
9,8/31/2018 16:11:15,414,3,3,4,My workstation is with a Windows laptop and wi...,Generally I listen to music when I am coding a...


In [56]:
post_study_feats = pd.DataFrame(columns=['ID', 'Overall_Rating', 'Programmed_Normally', 'Wristband'], dtype='float')
for idx, row in post_study.iterrows():
    feats = {'ID': row[1], 
             'Overall_Rating': row[2], 
             'Programmed_Normally': row[3], 
             'Wristband': row[4]}
    post_study_feats = post_study_feats.append(feats, ignore_index=True)

In [57]:
post_study_feats.describe()

Unnamed: 0,ID,Overall_Rating,Programmed_Normally,Wristband
count,17.0,17.0,17.0,17.0
mean,435.882353,4.941176,5.352941,2.352941
std,246.364284,1.599632,1.221739,1.617914
min,111.0,2.0,3.0,1.0
25%,212.0,4.0,5.0,1.0
50%,413.0,5.0,6.0,2.0
75%,614.0,6.0,6.0,4.0
max,813.0,7.0,7.0,6.0


# Predicting Success

In [58]:
f_ = featurized.set_index("ID")
p_ = post_study_feats.set_index("ID")
all_results = f_.join(p_)
all_results

Unnamed: 0_level_0,Age,Gender,Bachelor_EECS,Bachelor_other,PhD_EECS,PhD_other,Industry_Experience,Years_Programming,Hours_Programming_Per_Week,Early_Exposure,...,Member_CS_Community,Comfortable_from_scratch,Need_skeleton,Comfortable_version_control,Capable_programmer,Accepted_by_peers,Comfortable_sharing,Overall_Rating,Programmed_Normally,Wristband
ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
111,20,1,1,0,0,0,1,3,6.0,1,...,4,3,3,4,3,3,3,4.0,4.0,5.0
112,20,0,0,1,0,0,1,4,3.0,0,...,3,3,4,4,4,4,4,6.0,5.0,1.0
113,21,0,0,1,0,0,1,3,0.0,0,...,2,2,4,4,3,4,4,3.0,6.0,1.0
211,18,1,0,1,0,0,0,7,8.0,1,...,3,4,2,4,3,4,4,5.0,5.0,6.0
212,27,0,0,0,0,0,1,6,2.0,0,...,2,5,1,5,5,5,5,7.0,4.0,1.0
214,25,0,0,0,1,0,1,7,20.0,0,...,4,4,2,5,5,5,5,6.0,6.0,4.0
411,19,1,1,0,0,0,1,7,10.0,1,...,4,4,2,5,4,4,5,6.0,6.0,2.0
412,19,0,0,0,0,0,0,2,5.0,0,...,4,4,2,4,4,4,4,2.0,5.0,1.0
413,19,0,0,0,0,0,1,3,10.0,1,...,4,5,1,5,4,4,5,4.0,3.0,4.0
414,18,0,0,1,0,0,0,1,3.0,0,...,4,3,4,3,3,4,3,3.0,3.0,4.0


In [59]:
from sklearn import linear_model as lm
from sklearn.preprocessing import MinMaxScaler
import numpy as np

In [60]:
X = np.asarray(all_results.loc[:, 'Gender':'Comfortable_sharing'])
y = np.asarray(all_results['Overall_Rating'])
scaler = MinMaxScaler()
scaler.fit(X)
scaler.transform(X)
y = (y - np.min(y)) / (np.max(y) - np.min(y))

In [61]:
model = lm.LinearRegression()
model.fit(X, y)
print(model.score(X, y))
columns = all_results.columns[1:-3]
for i in range(1, len(columns)):
    print(columns[i-1], model.coef_[i])

1.0
Gender 0.02401701074206966
Bachelor_EECS -0.0003314529860138152
Bachelor_other -0.010516321261368446
PhD_EECS 2.7755575615628914e-17
PhD_other 0.012025343938313612
Industry_Experience 0.011714901345697065
Years_Programming 0.010167748840261018
Hours_Programming_Per_Week 0.013919520659723959
Early_Exposure 0.00824059478767185
Light_Exposure -0.006427265350101328
Heavy_Exposure -0.005198437603279097
DIY_progranner 0.014156215551839189
Book_learner 0.0033123584002383057
Lecture_learner 0.0022519694730697183
Web_learner -0.0047543372709860105
PSet_learner 0.0076162559613886
Project_learner 0.003168194557235079
Group_project_learner -0.009785867923607996
Collaborative_learner 0.02103605380881994
Independent_learner 0.00810383614335403
OH_attendee 0.0166440126582671
Java -0.009851638529638345
Python -0.01149693038577722
C -0.004334289032368415
Ruby 0.04318261482452068
Javascript -0.04290136030679254
C# 0.015480006022547649
PHP -0.027160006955186302
Obj-C -0.013705957602571544
SQL 0.00651

In [62]:
# Lasso (L1 regularization) encourages sparse feature selection
model = lm.Lasso(alpha=.1)
model.fit(X, y)
print(model.score(X, y))
columns = all_results.columns[1:-3]
for i in range(1, len(columns)):
    print(columns[i-1], model.coef_[i])

0.6011351063587493
Gender 0.0
Bachelor_EECS -0.0
Bachelor_other -0.0
PhD_EECS 0.0
PhD_other 0.0
Industry_Experience 0.0009556115578173016
Years_Programming 0.012714348903000145
Hours_Programming_Per_Week 0.0
Early_Exposure 0.0
Light_Exposure -0.0
Heavy_Exposure 0.0
DIY_progranner 0.0
Book_learner -0.0
Lecture_learner -0.0
Web_learner -0.0
PSet_learner 0.0
Project_learner -0.0
Group_project_learner -0.0
Collaborative_learner 0.0
Independent_learner 0.0
OH_attendee -0.0
Java -0.0
Python -0.0
C -0.0
Ruby 0.03609700866583228
Javascript -0.019736796891004403
C# 0.0
PHP -0.0
Obj-C -0.0
SQL 0.0
Swift 0.0
C++ -0.0
R 0.015203763750403486
Scala 0.0
Go -0.0
HTML/CSS 0.0
Markup 0.0
Recursion 0.0
Iteration 0.0
Arrays -0.0
Hash_Map/Table 0.0
Dictionaries -0.0
Tree/Graph_Traversal 0.0
Mem_Mgmt -0.0
Caches -0.0
Dynamic_Programming -0.0
Sorting_Searching -0.0
Stacks_Queues 0.0
Bit_twiddling -0.0
Regex 0.0
Data_Processing 0.0
Graphics 0.0
Web_Apps 0.0
Web_Dev 0.013943588880857007
OS_Mgmt -0.0
Network_Sy

In [63]:
print(all_results[['Ruby', 'Overall_Rating']])
all_results[['Ruby', 'Overall_Rating']].describe()

     Ruby  Overall_Rating
ID                       
111     3             4.0
112     0             6.0
113     0             3.0
211     0             5.0
212     1             7.0
214     0             6.0
411     3             6.0
412     1             2.0
413     2             4.0
414     2             3.0
512     1             7.0
613     1             5.0
614     0             6.0
711     0             6.0
713     2             7.0
813     0             3.0
811     0             4.0


Unnamed: 0,Ruby,Overall_Rating
count,17.0,17.0
mean,0.941176,4.941176
std,1.088037,1.599632
min,0.0,2.0
25%,0.0,4.0
50%,1.0,5.0
75%,2.0,6.0
max,3.0,7.0
