# LINGI2262: Assignment 3

## A3.0 - Theory

### Confidence interval from the Normal approximation

In [None]:
import numpy as np
from scipy.stats import norm

In [21]:
# Consider 12 errors out of 40 test examples
n = 40
errors = np.repeat(np.array([1, 0]), [12, n-12], axis=0)
# Make it more realistic by permuting at random
np.random.shuffle(errors)
errors

array([0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0,
       0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 1])

In [4]:
p_hat = np.mean(errors) # Estimate proportion
SE = np.sqrt(p_hat*(1-p_hat)/n) # Estimate deviation
alpha = 0.05 # Set significance level
z_n = norm.ppf(1-alpha/2) # Percentile of the Normal Distribution
CI = (p_hat - z_n*SE, p_hat + z_n*SE) # Confidence interval
CI = norm.interval(1-alpha, p_hat, SE) # Equivalent computation
print(np.around(np.array(CI), 3)) # Round to 3 digits

[0.158 0.442]


### p-value or attained level of significance

In [None]:
from scipy.stats import norm

In [5]:
# Probability under Normal density
Proba = norm.cdf(2.85)-norm.cdf(-2.85)
# p-value
pvalue = 1 - Proba
print("p-value = ", pvalue)

p-value =  0.0043719229098264645


### Comparing classifier performances with the Chi-squarred test

In [15]:
import pandas as pd
from scipy import stats

In [16]:
# Store sample sizes and number of errors
n1 = 1000
m1 = 300
n2 = 1000
m2 = 360
# Store errors and correct classifications in a 2x2 table
perf = pd.DataFrame([[m1, m2], [n1-m1, n2-m2]], index=["Error", "Correct"])
perf.columns = ["S_1", "S_2"]
print(perf)
# Test for equality of error rates
pvalue = stats.chi2_contingency(perf)[1]
print("p-value = ", '{0:.6f}'.format(pvalue))

         S_1  S_2
Error    300  360
Correct  700  640
p-value =  0.005021


## A3.2

### A3.2.1 + A3.2.2

In [27]:
n = 100
errors = np.repeat(np.array([1, 0]), [16, n-16], axis=0)
# Make it more realistic by permuting at random
np.random.shuffle(errors)

In [28]:
p_hat = np.mean(errors) # Estimate proportion
SE = np.sqrt(p_hat*(1-p_hat)/n) # Estimate deviation
alpha = 0.05 # Set significance level
z_n = norm.ppf(1-alpha/2) # Percentile of the Normal Distribution
CI = (p_hat - z_n*SE, p_hat + z_n*SE) # Confidence interval
CI = norm.interval(1-alpha, p_hat, SE) # Equivalent computation
print(np.around(np.array(CI), 3)) # Round to 3 digits

[0.088 0.232]


### A3.2.4

In [47]:
# Store sample sizes and number of errors
n = 1048
n1 = n
m1 = 0.12 * n
n2 = n
m2 = 0.16 * n
# Store errors and correct classifications in a 2x2 table
perf = pd.DataFrame([[m1, m2], [n1-m1, n2-m2]], index=["Error", "Correct"])
perf.columns = ["S_1", "S_2"]
print(perf)
# Test for equality of error rates
pvalue = stats.chi2_contingency(perf)[1]
print("p-value = ", '{0:.6f}'.format(pvalue))

            S_1     S_2
Error    125.76  167.68
Correct  922.24  880.32
p-value =  0.009998


## A3.3

In [137]:
# TO RUN FIRST !!!
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from scipy import stats

In [138]:
#Import of our data sets
train_df = pd.read_csv("CollegeTrain.csv", index_col=0)
test_df = pd.read_csv("CollegeTest.csv", index_col=0)

CollegeTrain = train_df
CollegeTest = test_df

### A3.3.1

In [139]:
# Taking 5% of the training set
train_df_05 = train_df.sample(frac = 0.05, random_state=0)
X_train_05 = train_df_05.drop("Outcome", axis=1)
y_train_05 = pd.DataFrame(data=train_df_05["Outcome"], columns=["Outcome"])

# Training the Decision Tree Classifier
tree = DecisionTreeClassifier(random_state=0)
tree.fit(X_train_05, y_train_05)

# Creating a test set with 100 disctincts examples
test_df_100 = test_df.sample(n=100, random_state=0)
X_test_100 = test_df_100.drop("Outcome", axis=1)
y_test_100 = pd.DataFrame(data=test_df_100["Outcome"], columns=["Outcome"])

# Computing the accuracy of our decision tree on this test set
y_test_100_pred = tree.predict(X_test_100)
score_test = accuracy_score(y_test_100, y_test_100_pred)

### A3.3.2

In [140]:
# Parameters
n = 100

# Creating a confidence interval for the classification rate
p_hat = np.mean(score_test) # Estimate proportion
SE = np.sqrt(p_hat*(1-p_hat)/n) # Estimate deviation
alpha = 0.05 # Set significance level
z_n = norm.ppf(1-alpha/2) # Percentile of the Normal Distribution
CI = (p_hat - z_n*SE, p_hat + z_n*SE) # Confidence interval
CI = norm.interval(1-alpha, p_hat, SE) # Equivalent computation
CI_array = np.around(np.array(CI), 3) # Round to 3 digits
CI_array[1]

0.734

### A3.3.3

In [141]:
test_acc = []
for i in range(100):
    # Creation of a test set of size n=100
    test_df = CollegeTest.sample(n=100, random_state=i)
    X_test = test_df.drop("Outcome", axis=1)
    y_test = pd.DataFrame(data=test_df["Outcome"], columns=["Outcome"])
    
    # Prediction on this test set and accuracy score
    y_test_pred = tree.predict(X_test)
    score_test = accuracy_score(y_test, y_test_pred)
    test_acc.append(score_test)
    
test_accs = test_acc
mean_test_acc = np.mean(test_acc)
mean_test_acc

0.6674

### A3.3.4

In [144]:
test_acc.sort()
index = int(0.025*len(test_acc))
observed_lower_bound = test_acc[index + 1]
observed_upper_bound = test_acc[-index-2]

0.61

### A3.3.5

In [155]:
results = pd.DataFrame(columns=['indiv_test_acc', 'CI_lower_bound', 'CI_upper_bound', 
                                'mean_test_acc', 'observed_lower_bound',
                                'observed_upper_bound'])
indiv_test_acc = []
CI_lower_bound = []
CI_upper_bound = []
mean_test_acc = []
observed_lower_bound = []
observed_upper_bound = []

n = 100

for epoch in range(20):
    # Taking 5% of the training set
    train_df_05 = CollegeTrain.sample(frac=0.05, random_state=epoch)
    X_train_05 = train_df_05.drop("Outcome", axis=1)
    y_train_05 = pd.DataFrame(data=train_df_05["Outcome"], columns=["Outcome"])
    # Training the Decision Tree Classifier
    tree = DecisionTreeClassifier(random_state=epoch)
    tree.fit(X_train_05, y_train_05)
    # Creating a test set with 100 disctincts examples
    test_df_100 = CollegeTest.sample(n=100, random_state=epoch)
    X_test_100 = test_df_100.drop("Outcome", axis=1)
    y_test_100 = pd.DataFrame(data=test_df_100["Outcome"], columns=["Outcome"])
    # Computing the accuracy of our decision tree on this test set
    y_test_100_pred = tree.predict(X_test_100)
    score_test = accuracy_score(y_test_100, y_test_100_pred)
    indiv_test_acc.append(score_test)
    
    # Creating a confidence interval for the classification rate
    p_hat = np.mean(score_test) # Estimate proportion
    SE = np.sqrt(p_hat*(1-p_hat)/n) # Estimate deviation
    alpha = 0.05 # Set significance level
    z_n = stats.norm.ppf(1-alpha/2) # Percentile of the Normal Distribution
    CI = (p_hat - z_n*SE, p_hat + z_n*SE) # Confidence interval
    CI = stats.norm.interval(1-alpha, p_hat, SE) # Equivalent computation
    CI_array = np.around(np.array(CI), 3) # Round to 3 digits
    CI_lower_bound.append(CI_array[0])
    CI_upper_bound.append(CI_array[1])
    
    # 100*test sets of size n=100
    test_acc = []
    for i in range(100):
        # Creation of a test set of size n=100
        test_df = CollegeTest.sample(n=100, random_state=i*epoch)
        X_test = test_df.drop("Outcome", axis=1)
        y_test = pd.DataFrame(data=test_df["Outcome"], columns=["Outcome"])
    
        # Prediction on this test set and accuracy score
        y_test_pred = tree.predict(X_test)
        score_test = accuracy_score(y_test, y_test_pred)
        test_acc.append(score_test)
    
    mean_test_acc.append(np.mean(test_acc))
    test_acc.sort()
    index = int(0.025*len(test_acc))
    observed_lower_bound.append(test_acc[index + 1])
    observed_upper_bound.append(test_acc[-index - 2])
    
    
results['indiv_test_acc'] = indiv_test_acc
results['CI_lower_bound'] = CI_lower_bound
results['CI_upper_bound'] = CI_upper_bound
results['mean_test_acc'] = mean_test_acc
results['observed_lower_bound'] = observed_lower_bound
results['observed_upper_bound'] = observed_upper_bound

In [156]:
results

Unnamed: 0,indiv_test_acc,CI_lower_bound,CI_upper_bound,mean_test_acc,observed_lower_bound,observed_upper_bound
0,0.64,0.546,0.734,0.64,0.64,0.64
1,0.59,0.494,0.686,0.624,0.55,0.69
2,0.62,0.525,0.715,0.6753,0.61,0.73
3,0.61,0.514,0.706,0.6791,0.62,0.73
4,0.62,0.525,0.715,0.6744,0.62,0.73
5,0.75,0.665,0.835,0.6608,0.6,0.72
6,0.6,0.504,0.696,0.5766,0.52,0.64
7,0.67,0.578,0.762,0.7285,0.67,0.77
8,0.67,0.578,0.762,0.7481,0.7,0.8
9,0.7,0.61,0.79,0.6551,0.6,0.72


In [157]:
results.describe()

Unnamed: 0,indiv_test_acc,CI_lower_bound,CI_upper_bound,mean_test_acc,observed_lower_bound,observed_upper_bound
count,20.0,20.0,20.0,20.0,20.0,20.0
mean,0.6495,0.55725,0.74175,0.65774,0.602,0.711
std,0.074585,0.079133,0.070063,0.063305,0.066775,0.061379
min,0.51,0.412,0.608,0.5176,0.46,0.59
25%,0.6075,0.5115,0.7035,0.624,0.5575,0.6775
50%,0.65,0.5565,0.7435,0.65795,0.605,0.72
75%,0.705,0.6155,0.7945,0.687725,0.64,0.7425
max,0.78,0.699,0.861,0.7674,0.72,0.82
