In [None]:
from datascience import *
import numpy as np

import matplotlib.pyplot as plots
plots.style.use('fivethirtyeight')
%matplotlib inline

np.warnings.filterwarnings('ignore', category=np.VisibleDeprecationWarning)                 

# Case Study Education

## Estimating Exam Scores

Scenario: <br>
In a large course with 2 midterm exams, most students took both midterms.<br>
John was sick for the second midterm. He emailed the instructor who excused him.

Now it’s time to assign John a grade in the course...<br>
The course is graded on an absolute scale that allocates 90 points to midterms:
* 40 for midterm 1
* 50 for midterm 2

There are several options: 
* Give an Incomplete Grade: Next semester, John has to take exam 2.
    * What is good and what is bad about this approach?
* Scale Up Midterm 1 Score
    * E.g., if John scored 21 out of 40 on midterm 1, assign him a score of $21/40*50 = 26.25$ out of 50 on midterm 2. Equivalently, assign him a total score of $21/40*90 = 47.25$ out of 90.
* Use the Midterm 1 Z-Score
    * E.g., if John scored 1 standard deviation below the mean on midterm 1, assign him a midterm 2 score that is 1 standard deviation below the mean.
* Use Midterm 1 Percentile
    * E.g., if John scored in the 30th percentile on midterm 1, assign him a midterm 2 score that is in the 30th percentile.
* Use Linear Regression
    * E.g., if John scored 1 standard deviation below the mean on midterm 1, and the correlation coefficient r between midterm 1 and midterm 2 scores was 0.8 for students who took both, then assign him a midterm 2 score that is 0.8 standard deviations below the mean.

In [None]:
# Load the scores for both midterms.

scores = Table.read_table("scores.csv")
# Preview five rows without the column at index 2
# (presumably the 'Mentored' column used in the tutoring section -- TODO confirm column order).
scores.drop(2).show(5)

In [None]:
# Find the mean and standard deviation for each midterm.
# mt1 and mt2 are reused as the score arrays throughout the rest of the notebook.

mt1 = scores.column('Midterm 1')
mt2 = scores.column('Midterm 2')
print('Midterm 1 avg:', np.average(mt1), 'std dev:', np.std(mt1))
print('Midterm 2 avg:', np.average(mt2), 'std dev:', np.std(mt2))

### Option 1: Scale Up

E.g., if John scored 21 out of 40 on midterm 1, assign him a score of $21/40*50 = 26.25$ out of 50 on midterm 2. Equivalently, assign him a total score of $21/40*90 = 47.25$ out of 90.
   * What is good and what is bad about this approach?

In [None]:
# Assume John got 21 out of 40 on the first midterm.
# Use that proportion to approximate the score he would have earned on the second midterm (out of 50).

mt1_actual = 21
mt2_estimate_1 = mt1_actual / 40 * 50
mt2_estimate_1

In [None]:
# Compare the scaled-up estimate to the average score on midterm 2
# (a negative result means the estimate is below the average).

mt2_estimate_1 - np.average(mt2)

In [None]:
# How many midterm 2 standard deviations from the mean is his assigned score?
# (Negative means below the mean.)

(mt2_estimate_1 - np.average(mt2)) / np.std(mt2)

In [None]:
# Determine his actual z-score on midterm one:
# how many standard deviations his midterm one score is from the midterm one mean.

(mt1_actual - np.average(mt1)) / np.std(mt1)

In [None]:
# What is the median score on midterm 2?

np.median(mt2)

### Option 2: Z-Score
E.g., if John scored 1 standard deviation below the mean on midterm 1, assign him a midterm 2 score that is 1 standard deviation below the mean.
   * What is good and what is bad about this approach?

In [None]:
# Again assume John scored 21 on the first exam.
mt1_actual = 21

# Standardize his midterm one score, then assign the midterm two score that sits
# the same number of standard deviations from the midterm two mean.
mt1_z = (mt1_actual - np.average(mt1)) / np.std(mt1)
mt2_estimate_2 = np.average(mt2) + mt1_z * np.std(mt2)
mt2_estimate_2

In [None]:
# How far (in points) is the z-score estimate from the midterm 2 mean?

mt2_estimate_2 - np.average(mt2)

In [None]:
# How many standard deviations from the midterm 2 mean is the z-score estimate?
# By construction this equals John's midterm one z-score (mt1_z).

(mt2_estimate_2 - np.average(mt2)) / np.std(mt2)

In [None]:
# Review the distribution of the midterm one scores.

scores.hist('Midterm 1', unit='point')

In [None]:
# Review the distribution of the midterm two scores.

scores.hist('Midterm 2', unit='point')

What can you conclude about the two exams by comparing the distributions?



### Option 3: Percentile
Since the distributions are so different, a percentile method may be a better approximation of what John would've actually scored. 

E.g., if John scored in the 30th percentile on midterm 1, assign him a midterm 2 score that is in the 30th percentile.
   * What is good and what is bad about this approach?



In [None]:
# Under the same assumption that John scored 21 on midterm one.
mt1_actual = 21

# Find what percentile he was in for midterm one:
# the percentage of midterm one scores that are less than or equal to his.

mt1_percentile = sum(mt1 <= mt1_actual) / len(mt1) * 100
mt1_percentile

In [None]:
# Verify that this percentile of the midterm one scores gives back John's actual score.

percentile(mt1_percentile, mt1)

In [None]:
# What midterm two score sits at that same percentile?

percentile(mt1_percentile, mt2)

In [None]:
# Find everyone who earned John's midterm one score and see where they scored on midterm two.
# Filter on mt1_actual rather than repeating the literal 21, so this cell stays
# in sync with the assumed score (and with the later histogram cells that use mt1_actual).

scores.where('Midterm 1', mt1_actual).hist('Midterm 2', normed=False)

In [None]:
# What was the midterm two average of those who earned John's score on midterm one?
# Uses mt1_actual instead of a hard-coded 21 so the assumed score is defined in one place.

np.average(scores.where('Midterm 1', mt1_actual).column('Midterm 2'))

### Option 4: Linear Regression

E.g., if John scored 1 standard deviation below the mean on midterm 1, and the correlation coefficient r between midterm 1 and midterm 2 scores was 0.8 for students who took both, then assign him a midterm 2 score that is 0.8 standard deviations below the mean.
   * What is good and what is bad about this approach?

In [None]:
#Familiar functions to find r, the slope, and the intercept for the linear regression line. 
def standard_units(arr):
    """Return `arr` in standard units.

    Each value has the array's average subtracted from it and is then divided
    by the array's standard deviation (np.std).
    """
    deviations = arr - np.average(arr)
    return deviations / np.std(arr)

def correlation(t, x, y):
    """Return the correlation coefficient r between two columns of a table.

    t is a table; x and y are column labels. r is the average of the
    element-wise products of the two columns in standard units.
    """
    products = standard_units(t.column(x)) * standard_units(t.column(y))
    return np.average(products)

def slope(t, x, y):
    """Return the slope of the regression line for predicting y from x.

    t is a table; x and y are column labels. The slope is r times the SD of y,
    divided by the SD of x.
    """
    x_values, y_values = t.column(x), t.column(y)
    return correlation(t, x, y) * np.std(y_values) / np.std(x_values)

def intercept(t, x, y):
    """Return the intercept of the regression line for predicting y from x.

    t is a table; x and y are column labels. The intercept is the mean of y
    minus the regression slope times the mean of x.
    """
    line_slope = slope(t, x, y)
    return np.mean(t.column(y)) - line_slope*np.mean(t.column(x))

def fitted_values(t, x, y):
    """Return an array of regression predictions of y, one per value in column x.

    t is a table; x and y are column labels. Each prediction is
    slope * x + intercept for the regression line of y on x.
    """
    line_slope = slope(t, x, y)
    line_intercept = intercept(t, x, y)
    x_values = t.column(x)
    return line_slope*x_values + line_intercept

In [None]:
# What is the correlation between midterm one and midterm two?
# r is reused by the regression estimate in the next cell.

r = correlation(scores, 'Midterm 1', 'Midterm 2')
r

In [None]:
# Remember, John got 21 on the first midterm.
mt1_actual = 21

# Take the standardized midterm one score, shrink it toward the mean by r,
# and convert it back to midterm two points to get the regression estimate.
# The result is stored as mt2_estimate_4 (Option 4) so it does not overwrite
# mt2_estimate_2, the Option 2 z-score estimate computed earlier.

mt1_z = (mt1_actual - np.average(mt1)) / np.std(mt1)
mt2_estimate_4 = np.average(mt2) + mt1_z * r * np.std(mt2)
mt2_estimate_4

In [None]:
# Regression works best when the association is ...
# Is this a linear association? Run the scatter plot to see.

scores.scatter('Midterm 1', 'Midterm 2')

In [None]:
# Find the predicted values from the slope (a) and intercept (b) of the regression line
# and overlay them on the scatter plot against midterm one.
# Does the line seem to go through the middle of the cloud?
# The column at index 2 is dropped first so it is not plotted
# (presumably 'Mentored' -- TODO confirm column order).

a = slope(scores, 'Midterm 1', 'Midterm 2')
b = intercept(scores, 'Midterm 1', 'Midterm 2')
scores.drop(2).with_column('Fitted', a * mt1 + b ).scatter('Midterm 1')

In [None]:
# Check the scatter of the residuals (actual midterm two score minus the fitted value)
# to see whether the association truly is linear.
# What should the residuals be centered around?
# Does it look like the association is linear?

scores.with_column('Residual', mt2 - (a * mt1 + b)).scatter('Midterm 1', 'Residual')

In [None]:
# Review the histogram of midterm two scores for those whose midterm one score equals mt1_actual (21).

scores.where("Midterm 1", mt1_actual).hist('Midterm 2')

In [None]:
# Widen the histogram to people who scored similarly to John on midterm one
# (within two points either way, not necessarily the same score).
# How did they do on midterm two?

scores.where("Midterm 1", are.between_or_equal_to(mt1_actual-2, mt1_actual+2)).hist('Midterm 2')

In [None]:
# Average midterm two score among students whose midterm one score
# is within two points of a given score.

def avg_mt2(mt1):
    """Return the mean 'Midterm 2' score of students with a similar 'Midterm 1' score.

    mt1 is a single midterm one score; "similar" means between mt1-2 and mt1+2,
    inclusive at both ends. Note that inside this function the parameter name
    shadows the module-level `mt1` array.
    """
    similar_range = are.between_or_equal_to(mt1-2, mt1+2)
    return scores.where("Midterm 1", similar_range).column("Midterm 2").mean()

# Run the function for John's actual midterm one score.
# What is the average midterm two score for those who have similar midterm one scores?

avg_mt2(mt1_actual)

In [None]:
# Create an array with one entry per row of scores: the average midterm two score
# of the students whose midterm one score is within two points of that row's.
mt2_avg = scores.apply(avg_mt2, 'Midterm 1')
mt2_avg

In [None]:
# Plot the averages on the scatter to see if it's a linear association.
# The column at index 2 is dropped first so it is not plotted
# (presumably 'Mentored' -- TODO confirm column order).

scores.drop(2).with_column('Avg', mt2_avg).scatter('Midterm 1')

Does this appear to be a linear association? 

Would the linear regression line approximate well enough?

Of all the methods of approximating a missing grade, which do you think is the fairest and most accurate? Why?


## Tutoring

The scores we were using were from Berkeley CS61A: Program Structures, shortly after they introduced optional small group tutoring. 

Fall 2017 small-group mentoring/tutoring 
* There were 84 mentors available for the 587 students in this course over 140 sections in the mentoring programs. 
* There were 1000 students who did not sign up for the mentoring.  

The question: Does the mentoring actually help students be better prepared for midterm two?

Students were given the option to join the mentoring program after they completed midterm one. <br>
Mentoring sessions ran for several weeks between midterm one and midterm two.  <br>
Students then took midterm two. <br>


In [None]:
# Show the complete table, including the mentoring column.
# True means they joined and False means they did not join.

scores.show(5)


In [None]:
# Scatter midterm two against midterm one, grouped by whether students were mentored or not.

scores.scatter('Midterm 1', 'Midterm 2', group='Mentored')

# What do you notice?


In [None]:
# Histogram of midterm one scores grouped by mentoring,
# using 5-point bins from 0 to 40 and counts rather than densities.

scores.hist('Midterm 1', group='Mentored', bins=np.arange(0, 41, 5), normed=False)

# What do you notice?


In [None]:
# Histogram of midterm two scores grouped by mentoring,
# using 5-point bins from 0 to 50 and counts rather than densities.

scores.hist('Midterm 2', group='Mentored', bins=np.arange(0, 51, 5), normed=False)

# What do you notice?
 

In [None]:
# Build a graph of averages of the midterm two score using only
# the students who did NOT sign up for mentoring (the control group).
no_mentor = scores.where("Mentored", False)

def avg_mt2_no_mentor(mt1):
    """Return the mean 'Midterm 2' score of non-mentored students with a similar 'Midterm 1' score.

    mt1 is a single midterm one score; "similar" means between mt1-2 and mt1+2,
    inclusive at both ends. Only rows of `no_mentor` are considered.
    """
    similar_range = are.between_or_equal_to(mt1-2, mt1+2)
    return no_mentor.where("Midterm 1", similar_range).column("Midterm 2").mean()

# One prediction per row of scores (mentored or not), based on the control group.
predicted_mt2 = scores.apply(avg_mt2_no_mentor, "Midterm 1")

In [None]:
# Add the midterm two scores predicted from the non-mentored students to the scatter.
# The column at index 2 is dropped first so it is not plotted
# (presumably 'Mentored' -- TODO confirm column order).

scores.drop(2).with_column('Predicted Mt2', predicted_mt2).scatter('Midterm 1')

In [None]:
# Compute every student's improvement relative to the control group of those who did not go to mentoring.
# Improvement = actual midterm two score minus the score predicted from non-mentored students
# with a similar midterm one score, i.e. a residual from the predicted values.
# Note: this rebinds `scores` to a table that also has the "Improvement" column.

scores = scores.with_column("Improvement", scores.column('Midterm 2') - predicted_mt2)

scores.hist("Improvement", bins=np.arange(-30, 31, 5), group="Mentored", unit="point")

# What is the shape of the graph for those who did not go to mentoring? Where is it centered?

# What can you say about those who did go to mentoring?

### The Test
Create a confidence interval for how much people tended to improve over what was expected on average.

The Hypothesis Test: Does that confidence interval contain zero?

Null Hypothesis: There is no difference between those who were mentored and those who were not. <br>
Alternative Hypothesis: There is a difference (or those who were mentored scored higher).

In [None]:
def of_at_least_5(values):
    """Return the fraction of entries in `values` that are 5 or more.

    Assumes `values` supports element-wise comparison (e.g. a NumPy array).
    """
    meets_threshold = values >= 5
    return sum(meets_threshold) / len(values)

# For each mentoring group, the proportion of students whose improvement was at least 5 points,
# displayed as a percentage.
scores.select('Mentored', 'Improvement').group('Mentored', of_at_least_5).set_format(1, PercentFormatter)

In [None]:
# How much did students improve, on average, in each mentoring group?
# np.mean is applied to every other column as well, so look at the Improvement column's mean.

scores.group("Mentored", np.mean)

In [None]:
# Create bootstrap confidence intervals (95% by default) for the mean improvement of a group.
def mean_ci(observations, repetitions=2000, level=95):
    """Print the mean improvement and a bootstrap percentile confidence interval for it.

    observations: table with an "Improvement" column; must have at least one row.
    repetitions: number of bootstrap resamples to draw (default 2000).
    level: confidence level in percent (default 95).

    Each resample is `observations.sample()`, and its mean improvement is
    recorded. The interval runs from the (100 - level)/2 percentile to the
    100 - (100 - level)/2 percentile of those resampled means (2.5 and 97.5
    for the default level). Nothing is returned; the results are printed.

    Raises ValueError if `observations` has no rows.
    """
    if observations.num_rows == 0:
        # Fail with a clear message instead of an opaque error from resampling nothing.
        raise ValueError("mean_ci needs at least one observation")
    means = [
        observations.sample().column("Improvement").mean()
        for _ in range(repetitions)
    ]
    tail = (100 - level) / 2
    lower, upper = percentile(tail, means), percentile(100 - tail, means)
    print("Mean improvement:", observations.column("Improvement").mean())
    print(str(level) + "% CI of mean improvement:", lower, "to", upper)

# Confidence interval for the mean improvement of the students who were mentored.
mentored = scores.where("Mentored", True)
mean_ci(mentored)

# Would you reject the null? Why or why not?


In [None]:
# What about mentored students who scored below 20 on midterm one?

mean_ci(mentored.where("Midterm 1", are.below(20)))

# What do you notice?

In [None]:
# What about mentored students who scored at least 20 but below 30 on midterm one?
# (are.between includes the lower bound and excludes the upper bound.)

mean_ci(mentored.where("Midterm 1", are.between(20, 30)))

# What do you notice?

In [None]:
# What about mentored students who scored 30 or above on midterm one?

mean_ci(mentored.where("Midterm 1", are.above_or_equal_to(30)))

# What do you notice?