In [55]:
import pandas as pd

In [56]:
df = pd.read_csv("https://raw.githubusercontent.com/jiaqima/is327-resources/main/data/class_grades.csv")

In [57]:
df.head()

Unnamed: 0,name,homewk1,homewk2,midterm,partic,exam
0,Genevieve Gallegos,95,100,100,100,100
1,Masako Holley,40,55,48,78,62
2,Pearle Goodman,62,80,80,81,85
3,Douglas Schmidt,66,84,74,83,82
4,Russell Mills,8,22,27,72,40


In [58]:
len(df)

50

In [59]:
# Ground-truth label: True when the exam score is strictly above 60.
df["pass"] = df["exam"] > 60

In [60]:
df.head()

Unnamed: 0,name,homewk1,homewk2,midterm,partic,exam,pass
0,Genevieve Gallegos,95,100,100,100,100,True
1,Masako Holley,40,55,48,78,62,True
2,Pearle Goodman,62,80,80,81,85,True
3,Douglas Schmidt,66,84,74,83,82,True
4,Russell Mills,8,22,27,72,40,False


In [61]:
df["homewk1"].mean()

57.14

In [62]:
# create a "clever algorithm" to predict exam pass by homewk1

def predict(homewk1):
    """Predict whether the exam is passed from a homework-1 score.

    The decision threshold is fixed at 57.14, the mean of the training
    ``homewk1`` column. A score strictly above it is predicted to pass.

    ``homewk1`` may be a single number or a pandas Series; the comparison is
    applied as-is, so a Series input yields a boolean Series.
    """
    return homewk1 > 57.14

In [63]:
predict(75)

True

In [64]:
predict(55)

False

In [65]:
predict(df["homewk1"]).head()

0     True
1    False
2     True
3     True
4    False
Name: homewk1, dtype: bool

In [66]:
predictions = predict(df["homewk1"])

In [67]:
# Accuracy = share of rows where the prediction equals the true label
# (the mean of a boolean Series is the fraction of True values).
acc = (predictions == df["pass"]).mean()
acc

0.64

In [68]:
# parameterize the model

def predict(homewk1, cutoff):
    """Predict whether the exam is passed from a homework-1 score.

    Parameters
    ----------
    homewk1 : number or pandas Series
        Homework-1 score(s) to classify.
    cutoff : number
        Decision threshold; this is the model's single learnable parameter.

    Returns
    -------
    bool or boolean Series
        True where ``homewk1`` is strictly greater than ``cutoff``.
    """
    return homewk1 > cutoff

In [71]:
# Same cutoff as the hard-coded version (57.14), now passed in explicitly.
predictions = predict(df["homewk1"], 57.14)
acc = (predictions == df["pass"]).mean()
acc

0.64

In [72]:
# Try a hand-picked cutoff of 50 and measure training accuracy again.
predictions = predict(df["homewk1"], 50)
acc = (predictions == df["pass"]).mean()
acc

0.8

In [74]:
# this is a machine learning algorithm: "learn" the cutoff from the training
# data by trying every integer cutoff 0..99 and keeping the most accurate one

best_acc = 0
# Bind best_cutoff before the loop so the print below can never hit an unbound
# name (fresh kernel) or a stale value left over from an earlier run.
best_cutoff = None

for cutoff in range(0, 100):
    predictions = predict(df["homewk1"], cutoff)
    acc = (predictions == df["pass"]).mean()
    # Strict ">" keeps the smallest cutoff when several tie for best accuracy.
    if acc > best_acc:
        best_acc = acc
        best_cutoff = cutoff

print("Best accuracy:", best_acc)
print("Learned cutoff value:", best_cutoff)

Best accuracy: 0.94
Learned cutoff value: 34


In [75]:
# the learned model is predict(input, best_cutoff)

predict(8, best_cutoff)

False

## But we need to evaluate the model on new data

In [77]:
df_test = pd.read_csv("https://raw.githubusercontent.com/jiaqima/is327-resources/main/data/class_grades_test.csv")

In [78]:
df_test.head()

Unnamed: 0,name,homewk1,homewk2,midterm,partic,exam
0,Amber Mccoy,67,77,88,77,85
1,Francis Tedesco,48,50,42,68,74
2,Jeffrey Doyle,77,94,74,60,94
3,Bertha Rivera,63,76,83,82,92
4,Ronald Anderson,55,60,48,71,76


In [79]:
# Same labelling rule as the training set: pass means exam strictly above 60.
df_test["pass"] = df_test["exam"] > 60

In [80]:
df_test.head()

Unnamed: 0,name,homewk1,homewk2,midterm,partic,exam,pass
0,Amber Mccoy,67,77,88,77,85,True
1,Francis Tedesco,48,50,42,68,74,True
2,Jeffrey Doyle,77,94,74,60,94,True
3,Bertha Rivera,63,76,83,82,92,True
4,Ronald Anderson,55,60,48,71,76,True


In [81]:
df.head()

Unnamed: 0,name,homewk1,homewk2,midterm,partic,exam,pass
0,Genevieve Gallegos,95,100,100,100,100,True
1,Masako Holley,40,55,48,78,62,True
2,Pearle Goodman,62,80,80,81,85,True
3,Douglas Schmidt,66,84,74,83,82,True
4,Russell Mills,8,22,27,72,40,False


In [83]:
# evaluate on testing data: apply the cutoff learned from the training set to
# the separate test set and report the fraction of correct predictions

predictions = predict(df_test["homewk1"], best_cutoff)
acc = (predictions == df_test["pass"]).mean()
acc

0.88

## Generalization: the gap between training and testing accuracy

### Testing accuracy is not as good as training accuracy. Can we do better?

#### Strategy 1: try a simpler model family

In [85]:
list(range(0,100,10))

[0, 10, 20, 30, 40, 50, 60, 70, 80, 90]

In [86]:
# let's only allow multiples of 10 as cutoffs (a smaller, simpler model family)

best_acc = 0
# Reset best_cutoff along with best_acc; otherwise a cutoff learned in an
# earlier cell would silently survive if no candidate here beats best_acc.
best_cutoff = None

for cutoff in range(0, 100, 10):
    predictions = predict(df["homewk1"], cutoff)
    acc = (predictions == df["pass"]).mean()
    # Strict ">" keeps the smallest cutoff when several tie for best accuracy.
    if acc > best_acc:
        best_acc = acc
        best_cutoff = cutoff

print("Best training accuracy:", best_acc)
print("Learned cutoff value:", best_cutoff)

Best training accuracy: 0.94
Learned cutoff value: 40


In [87]:
# evaluate on testing data, using the cutoff most recently stored in best_cutoff

predictions = predict(df_test["homewk1"], best_cutoff)
acc = (predictions == df_test["pass"]).mean()
print("Testing accuracy:", acc)

Testing accuracy: 0.91


In [88]:
# What if we try multiples of 30? (an even smaller set of candidate cutoffs)
print(list(range(0, 100, 30)))

best_acc = 0
# Reset best_cutoff along with best_acc; otherwise a cutoff learned in an
# earlier cell would silently survive if no candidate here beats best_acc.
best_cutoff = None

for cutoff in range(0, 100, 30):
    predictions = predict(df["homewk1"], cutoff)
    acc = (predictions == df["pass"]).mean()
    # Strict ">" keeps the smallest cutoff when several tie for best accuracy.
    if acc > best_acc:
        best_acc = acc
        best_cutoff = cutoff

print("Best training accuracy:", best_acc)
print("Learned cutoff value:", best_cutoff)

[0, 30, 60, 90]
Best training accuracy: 0.9
Learned cutoff value: 30


In [89]:
# evaluate on testing data, using the cutoff most recently stored in best_cutoff

predictions = predict(df_test["homewk1"], best_cutoff)
acc = (predictions == df_test["pass"]).mean()
print("Testing accuracy:", acc)

Testing accuracy: 0.86


#### Strategy 2: get more data

In [90]:
df_more = pd.read_csv("https://raw.githubusercontent.com/jiaqima/is327-resources/main/data/class_grades_more.csv")

In [91]:
df_more.head()

Unnamed: 0,name,homewk1,homewk2,midterm,partic,exam
0,Savannah Goh,70,77,81,76,93
1,Tara Luna,73,80,70,64,96
2,Helen Howes,36,43,40,79,46
3,Margarita Chandler,63,76,74,76,84
4,Justin Carter,67,64,97,83,84


In [94]:
# Same labelling rule as the other sets: pass means exam strictly above 60.
df_more["pass"] = df_more["exam"] > 60

In [95]:
# Stack the original training rows on top of the newly loaded ones.
# ignore_index=True gives the combined frame a fresh 0..n-1 index instead of
# carrying over each frame's own row labels (which would otherwise repeat).
# NOTE(review): this cell rebinds df_more from itself, so re-running it without
# re-running the load cell above would append df a second time.
df_more = pd.concat([df, df_more], ignore_index=True)

In [96]:
len(df_more), len(df)

(100, 50)

In [102]:
# combine the two strategies: keep the multiples-of-10 cutoffs, but learn the
# cutoff from the larger training set (df_more)

best_acc = 0
# Reset best_cutoff along with best_acc; otherwise a cutoff learned in an
# earlier cell would silently survive if no candidate here beats best_acc.
best_cutoff = None

for cutoff in range(0, 100, 10):
    predictions = predict(df_more["homewk1"], cutoff)
    acc = (predictions == df_more["pass"]).mean()
    # Strict ">" keeps the smallest cutoff when several tie for best accuracy.
    if acc > best_acc:
        best_acc = acc
        best_cutoff = cutoff

print("Best training accuracy:", best_acc)
print("Learned cutoff value:", best_cutoff)

Best training accuracy: 0.94
Learned cutoff value: 40


In [103]:
# evaluate on testing data, using the cutoff most recently stored in best_cutoff

predictions = predict(df_test["homewk1"], best_cutoff)
acc = (predictions == df_test["pass"]).mean()
print("Testing accuracy:", acc)

Testing accuracy: 0.91
