# Initial Data Analysis - Capstone Project

In [171]:
# Imports
import os
import csv
import numpy as np
import pandas as pd
from pandas import Int64Index
import pickle
import sys
import math

In [172]:
def save_as_pkl(object, path):
	pickle.dump(object, open(path, "wb"))

def load_pkl(path):
	obj = pickle.load(open(path, "rb"))
	return obj

### Preprocessing and loading the data

In [173]:
# Processing the .csvs into dataframes and saving them as pickles for easier
# loading on future runs
files = [f for f in os.listdir("data/") if f.split('1')[0] == 'file']
for file in files:
    print ('\n'+file)
    fName = file.split('.')[0]
    df = pd.read_csv('data/' + file, delimiter=',', na_values=['NA'])
#     df["Id"] = df.reset_index().index
#     df.set_index("Id")
    print(df.shape)
    save_as_pkl(df, 'pickles/'+fName+'.pkl')


file12.csv
(4286, 392)

file13.csv
(4638, 392)

file11.csv
(3958, 392)

file10.csv
(3979, 392)

file14.csv
(4836, 392)

file15.csv
(5557, 392)

file17.csv
(5207, 392)

file16.csv
(5456, 392)


In [174]:
## Loading preprocessed dataframes
pklFiles = [f for f in os.listdir("pickles/") if f.split('1')[0] == 'file']
for file in pklFiles:
	df = load_pkl("pickles/" + file)
	print(df.shape)

(4286, 392)
(4638, 392)
(3958, 392)
(3979, 392)
(4836, 392)
(5557, 392)
(5207, 392)
(5456, 392)


# Change Year Loaded Here:

In [175]:
# YEAR:
year = 13
fname = "file"+str(year)+".pkl"
df = load_pkl("pickles/"+fname)

### Starting to analyze the data

In [176]:
# First 30 columns:
firstPart = df.iloc[:,0:30]
# print(firstPart.columns.values)

# Columns 30 to 65: (top 6 course marks)
top6CourseMarks = df.iloc[:,30:66]
# print(top6CourseMarks.columns.values)

# cols = []
# for column in top6CourseMarks[top6CourseMarks.columns[2::3]]:
#     cols.append(df[column])
# hstack = pd.concat([x for x in cols], axis=1)
# hstack.fillna(0, inplace=True)
# hstack['Sum'] = hstack.mean(axis=1)
# print(hstack)

# Columns 66 to 71:
middle = df.iloc[:,66:72]
print(middle["WAVERG1"], middle["WAVERG2"])
# print(middle.columns.values)

# Columns 72 to 391:
choices = df.iloc[:, 72:]
# print(choices.columns.values)

0         0
1         0
2       878
3       925
4         0
       ... 
4281      0
4282      0
4283      0
4284    692
4285    730
Name: WAVERG1, Length: 4286, dtype: int64 0       852
1       908
2       885
3       925
4       823
       ... 
4281    772
4282    832
4283    838
4284    692
4285    730
Name: WAVERG2, Length: 4286, dtype: int64


In [177]:
print(top6CourseMarks.head())
print(top6CourseMarks.columns)

  SECORCOD1  SECORC1 SECORM1 SECORCOD2  SECORC2 SECORM2 SECORCOD3  SECORC3  \
0     CGW4U    100.0      87     CHI4U    100.0      83     ENG4U    100.0   
1     ENG4U    100.0      88     MCV4U    100.0      95     MHF4U    100.0   
2     CHI4U    100.0      92     ENG4U    100.0      76     EWC4U      0.0   
3     ENG4U    100.0      89     MCV4U    100.0      97     MDM4U    100.0   
4     ENG4U    100.0      70     MCV4U    100.0      85     MDM4U    100.0   

  SECORM3 SECORCOD4  ...  SECORM9 SECORCOD10 SECORC10  SECORM10 SECORCOD11  \
0      74     MCV4U  ...      NaN        NaN      NaN       NaN        NaN   
1      92     SBI4U  ...      NaN        NaN      NaN       NaN        NaN   
2      31     HSB4M  ...       84        NaN      NaN       NaN        NaN   
3      92     MHF4U  ...      NaN        NaN      NaN       NaN        NaN   
4      92     MHF4U  ...      NaN        NaN      NaN       NaN        NaN   

  SECORC11  SECORM11  SECORCOD12 SECORC12  SECORM12  
0      N

In [178]:
courseCodeCols = ['SECORCOD' + str(i+1) for i in range(12)]
maxScore = ['SECORC' + str(i+1) for i in range(12)] #I'm assuming that's what this column is? (useless)
studentScoreCols = ['SECORM' + str(i+1) for i in range(12)]
goalCourses = ['MHF4U', 'MCV4U', 'ENG4U', 'SCH4U', 'SPH4U']

# Advanced functions: MHF4U
# Calculus: MCV4U
# English: ENG4U
# Chemistry: SCH4U
# Physics: SPH4U
# Next highest mark, anything

In [179]:
def averageFinder(allStudentMarks):
    studentAverages = []
    for index, row in allStudentMarks.iterrows(): 
        try:
            studentCourseCodes = [row[courseCodeCol] for courseCodeCol in courseCodeCols]
            studentGrades = [row[studentScoreCol] for studentScoreCol in studentScoreCols]
            studentCourseCodes = [val for val in studentCourseCodes if type(val) == str]
            studentGrades = [val for val in studentGrades if str(val) != 'nan']

            necessaryCourses = [int(studentGrades[i]) for i in range(len(studentGrades)) if studentCourseCodes[i] in goalCourses]
            remainder = [int(studentGrades[i]) for i in range(len(studentGrades)) if studentCourseCodes[i] not in goalCourses]

            # Some students don't seem to have 6 courses.
            # Set their average to -1 in this case.
            print(remainder)
            print(necessaryCourses)
            average = (max(remainder) + sum(necessaryCourses))/6
            studentAverages.append(average)        
        except ValueError:
#             print("Student with incorrect number of courses applied.")
            studentAverages.append(-1)
    return studentAverages
    
# print(averageFinder(top6CourseMarks))

In [180]:
# Finding the student's choice preference for the university:
# This data is found in the 'choices' dataframe.
uniChoiceCols = ['DUNI'+str(i+1) for i in range(20)]
programChoiceCols = ['DPRO1' + str(i+1) for i in range(20)]
goalUni = '196'

In [181]:
def uniChoiceFinder(studentUniChoices):
    studentAverages = []
    for index, row in studentUniChoices.iterrows():
        # Filtering student's preference for our university and our specific program.
        studentCourseCodes = [i+1 for i in range(20) if (row['DUNI'+str(i+1)] == 196
                                                                                and (row['DPRO'+str(i+1)] == 'SIA' 
                                                                                or row['DPRO'+str(i+1)] == 'SI'))]
        studentAverages.append(studentCourseCodes[0])
    return studentAverages

In [182]:
def acceptedOurUni(firstPart):
    acceptedArray = []
    for index, row in firstPart.iterrows():
        # Filtering student's preference for our university and our specific program.
        studentAccepted = int(row['CONFUNI'] == 196)
        acceptedArray.append(studentAccepted)
    return acceptedArray

# Determining grade cutoff

### Seperating relevant data
- Splitting data into students who accept and offer from mac and those who don't.
- Isolating 8 initial columns of interest for analysis

In [183]:
## Starting columns of interest:
# RESPROV - Province of Residence
# RESCNTY - County of Residence
# ZIP3 - Residence Postal Code (First 3 Digits)
# CONFUNI - Confirmed University (OurUni='196')
# CONFPR - Confirmed Program (OurProg='SI', OurProg_coop='SIA')
# CONFCHOIC - OUAC Confirmed Choice Preference
# WAVERG1 - Weighted Average (best 6 OAC / Senior Level current year finals)
# WAVERG2 - Weighted Average (best 6 OAC / Senior Level all year finals)
## First 6 columns are in first half, last 2 are in "Middle" dataframe.

COIsFirstHalf = ["GEND", "SCHOOL", "ZIP3", "CONFUNI", "CONFPR", "CONFCHOIC"]
COIsSecondHalf = ["WAVERG2"]

# Finding the student's average from the formula given by Dr. Franek.
averages = averageFinder(top6CourseMarks)
averagesDf = pd.DataFrame(averages, columns=['AVG'])

# Making a dataframe with student's preferences
preferences = uniChoiceFinder(choices)
preferencesDf = pd.DataFrame(preferences, columns=['PREF'])

# Dataframe with 1 for accepted our uni, 0 otherwise.
acceptedArray = acceptedOurUni(firstPart)
acceptedDf = pd.DataFrame(acceptedArray, columns=['ACCEPTED'])

# Adding columns of interest found in first half of the data
columnsOfInterest = pd.concat([firstPart[x] for x in COIsFirstHalf], axis=1)

# Adding the rest of the columns of interest
columnsOfInterest = pd.concat([columnsOfInterest] + [middle[x] for x in COIsSecondHalf] + 
                              [averagesDf] + [preferencesDf] + [acceptedDf],axis=1)
copy = columnsOfInterest.copy()

mcmasterVector = copy["CONFUNI"] == 196
notmcmasterVector = copy["CONFUNI"] != 196
allStudents = copy

[87, 83]
[74, 81, 84, 91, 85]
[91]
[88, 95, 92, 91, 88]
[92, 31, 90, 94, 80]
[76, 80, 91, 84]
[92]
[89, 97, 94, 91, 92]
[92]
[70, 85, 87, 74, 86]
[83, 90]
[81, 91, 89, 91, 83]
[88, 98]
[75, 89, 90, 83, 84]
[37, 82, 70]
[83, 90, 85, 91, 94]
[86]
[83, 95, 96, 96, 90]
[91, 80]
[80, 74, 85, 78, 78]
[75, 45]
[73, 83, 97, 80, 90]
[88]
[70, 78, 73, 79, 80]
[90, 82, 81, 92, 91]
[77, 84, 87, 87, 84]
[79, 87]
[78, 88, 91, 90, 85]
[97]
[79, 84, 88, 93, 82]
[95]
[74, 93, 78, 80, 78]
[94, 86]
[88, 86, 85, 74, 90]
[76, 93]
[83, 84, 88, 80, 88]
[96]
[60, 94, 94, 80, 75]
[92, 86, 99, 86]
[87, 97, 94, 90, 84]
[81]
[75, 81, 88, 86, 85]
[96, 94]
[94, 90, 94, 89, 94]
[80]
[78, 90, 73, 75, 80]
[85, 90, 83]
[78, 85, 87, 83, 88]
[98]
[87, 90, 93, 98, 100]
[96]
[90, 95, 99, 99, 93]
[91, 94, 90, 93, 90]
[93, 83, 80, 83, 81]
[95]
[91, 89, 91, 77, 88]
[85]
[80, 90, 92, 88, 90]
[73, 80]
[67, 86, 85, 75, 86]
[96, 98]
[86, 99, 99, 95, 96]
[93]
[82, 97, 97, 93, 91]
[86]
[85, 91, 90, 91, 86]
[90]
[80, 85, 88, 80, 75]

[62, 81, 68]
[78, 28, 90, 86, 75]
[81, 75, 70, 72]
[68, 74, 73, 72, 66]
[98, 96, 97, 98]
[95, 99, 100, 99, 100]
[76, 87]
[77, 68, 80, 77, 77]
[100, 100]
[93, 93, 96, 86, 91]
[96, 80, 71, 96]
[82, 81, 84, 80, 90]
[84, 97, 97, 97, 92, 95]
[93, 97, 96, 93, 92]
[97, 91, 82]
[85, 93, 90, 86, 88]
[71, 87, 74]
[74, 81, 90, 74, 76]
[70, 98, 92]
[60, 95, 90, 75, 90]
[90, 86]
[84, 82, 84, 81, 87]
[92]
[87, 91, 91, 95, 93]
[93, 86]
[83, 56, 70, 55, 62]
[77, 98, 86]
[85, 92, 81, 77]
[92, 85]
[88, 76, 71, 70, 81]
[95, 92, 98]
[85, 91, 97, 97, 93]
[95, 95, 94, 93, 85]
[85, 85, 94, 83, 87]
[96]
[80, 90, 94, 80, 88]
[93, 96]
[90, 95, 94, 91, 91]
[94, 89, 99, 90, 93]
[92, 97, 100, 96, 95]
[88, 97, 93, 91]
[90, 84, 73, 89, 89]
[88, 88]
[69, 94, 94, 83, 80]
[90, 90, 90]
[78, 85, 90, 94, 88]
[85, 83]
[82, 74, 89, 75, 78]
[90, 83, 87]
[76, 90, 75, 85, 79]
[90, 93, 94]
[86, 95, 97, 89, 90]
[91, 100, 93]
[89, 96, 92, 91]
[88, 93]
[87, 85, 90, 83, 97]
[90, 90, 87, 94, 96]
[90, 87, 92, 87, 91]
[92, 93, 95, 91]

[67, 90]
[80, 93, 94, 92, 82]
[99, 99, 98]
[99, 96, 82, 97, 93]
[86]
[86, 91, 89, 74, 85]
[92, 95, 74]
[89, 89, 93, 79, 79]
[92, 94]
[85, 84, 93, 74, 83]
[86, 88]
[90, 93, 98, 84, 90]
[91, 98]
[85, 80, 86, 70, 65]
[96, 96]
[96, 100, 100, 97, 100]
[90, 92, 91]
[80, 84, 87, 81, 87]
[89, 82]
[90, 82, 85, 80, 88]
[91]
[82, 98, 91, 88, 96]
[95]
[87, 83, 95, 91, 82]
[80, 85]
[83, 78, 90, 87, 80]
[79, 79, 79, 70, 78, 80]
[79, 80, 80, 80, 93]
[87, 85, 90]
[82, 75, 87, 75, 78]
[87, 87, 69]
[78, 83, 90, 83, 74]
[97]
[80, 96, 98, 98, 98]
[87, 90, 96]
[90, 96, 94, 85, 92]
[95, 90]
[86, 92, 97, 95, 95]
[95]
[81, 92, 90, 90, 93]
[80, 84]
[88, 75, 80, 81, 80]
[80]
[83, 84, 87, 68, 81]
[95, 90]
[81, 65, 77, 70, 61]
[76, 87]
[78, 88, 95, 80, 84]
[96]
[82, 91, 92, 86, 88]
[66, 80, 84]
[85, 55, 70, 68, 75]
[94, 94, 96]
[88, 92, 95, 93, 94]
[86, 97]
[76, 93, 94, 76, 84]
[87, 85]
[81, 90, 90, 85, 85]
[92, 82, 85, 93]
[85, 92, 89, 88, 88]
[90, 92, 90]
[83, 80, 93, 80, 87]
[86, 75, 94]
[87, 96, 93, 82, 91]
[

[84, 98, 97, 98, 91]
[94, 98, 98]
[94, 97, 98, 98, 97]
[85]
[84, 80, 80, 80, 80]
[66, 88, 90, 80, 93]
[81, 93, 92, 80, 88]
[95, 95, 94]
[91, 94, 91, 92, 78]
[89, 86, 85, 82, 90, 98, 97, 97, 92, 99]
[]
[98, 98, 92, 93]
[82, 90, 88, 90, 87]
[77, 60, 90, 93, 85, 85, 92]
[]
[94]
[90, 98, 98, 100, 96]
[81, 61]
[51, 72, 81, 76, 83]
[93, 92]
[87, 93, 93, 87, 90]
[78]
[87, 80, 86, 74, 83]
[89, 99, 75, 82]
[87, 98, 94, 88, 96]
[75, 69, 90]
[70, 98, 100, 81, 92]
[95, 97]
[78, 88, 90, 91, 85]
[83, 80]
[86, 77, 88, 73, 72]
[90, 90, 95]
[94, 90, 92, 82, 90]
[86, 81, 75]
[77, 88, 86, 75, 86]
[70, 82]
[83, 92, 81, 81, 83]
[85, 90]
[85, 90, 92, 83, 95]
[58, 87]
[71, 91, 88, 86, 81]
[40, 50]
[85, 50, 57, 60, 71]
[95, 90]
[93, 88, 90, 71, 88]
[85]
[80, 71, 85, 71, 76]
[80, 96]
[80, 96, 93, 80, 71]
[82, 94, 84]
[79, 90, 92, 80, 82]
[93, 100, 93]
[91, 99, 96, 91, 93]
[91, 84]
[87, 84, 82, 72, 73]
[91, 92]
[70, 92, 77, 86]
[85, 85]
[82, 90, 85, 81, 81]
[50, 99, 88]
[88, 94, 97, 74, 93]
[97, 82, 95, 94, 99]

[88, 99, 96, 92, 92]
[92, 94]
[92, 99, 98, 95, 93]
[94, 95]
[88, 94, 92, 92, 85]
[84]
[81, 81, 86, 73, 87]
[95, 74, 62]
[82, 82, 82, 81, 86]
[94]
[90, 95, 93, 88, 88]
[93, 95, 97]
[82, 86, 96, 97, 95]
[90]
[80, 84, 92, 87, 94]
[77]
[90, 90, 80, 89, 70]
[91, 92, 86, 90]
[93, 87, 90, 85, 88]
[82, 81, 76, 99, 77]
[77, 72, 81, 80, 81]
[94, 90]
[94, 98, 96, 97, 95]
[40, 94]
[80, 75, 94, 87, 84]
[88, 71, 64, 72]
[76, 69, 73, 62, 67]
[78, 76, 97]
[85, 76, 82, 90, 63]
[86]
[85, 96, 96, 78, 83]
[89, 89, 97]
[88, 99, 96, 84, 89]
[95, 87]
[78, 90, 87, 78, 80]
[94, 98]
[97, 98, 98, 95]
[97]
[90, 98, 98, 92, 90]
[97, 91]
[83, 96, 97, 81, 86]
[99, 85]
[86, 85, 88, 83, 75]
[83, 82]
[82, 63, 76, 70, 75]
[86, 67]
[85, 79, 83, 78, 81]
[90, 98]
[90, 90, 94, 92, 91]
[92, 85]
[88, 75, 90, 85, 92]
[65, 83, 78, 96, 85]
[85, 92, 90, 81]
[80, 66]
[74, 65, 79, 71, 65]
[98]
[85, 92, 93, 92, 92]
[93, 94]
[81, 90, 96, 82, 91]
[85, 87, 80]
[86, 81, 84, 78, 80]
[84, 85, 87, 88]
[87, 89, 87, 90, 92]
[90, 92]
[72, 98,

[94, 88]
[78, 88, 90, 78, 85]
[88, 90]
[83, 82, 83, 80, 88]
[88, 87]
[74, 71, 77, 75, 80]
[82, 85, 69, 91, 87]
[85, 91, 91, 87, 92]
[99]
[91, 96, 97, 96, 91]
[96, 70, 90, 94]
[86, 95, 97, 96, 94]
[80, 81, 44, 90, 35, 70]
[72, 81, 90, 80, 50]
[80, 94]
[91, 92, 93, 90, 92]
[98]
[77, 91, 86, 76, 78]
[90, 92]
[81, 82, 83, 87, 80]
[95, 93]
[90, 89, 92, 87, 85]
[90, 85]
[80, 83, 90, 83, 84]
[80, 80]
[75, 40, 70, 70, 57]
[86, 80, 91, 91, 92]
[80, 98, 98, 92, 97]
[83, 80]
[82, 66, 75, 76, 72]
[89, 78, 85]
[72, 60, 73, 70, 67]
[80, 65]
[85, 82, 87, 86, 70]
[91, 70]
[73, 85, 95, 69, 75]
[90, 87]
[88, 95, 96, 92, 92]
[76, 92]
[76, 82, 87, 92, 82]
[85, 82, 85, 96]
[83, 89, 90, 75, 87]
[90, 88, 92]
[81, 90, 78, 88, 95]
[86, 89, 90, 98, 90]
[86, 90, 95, 94, 93]
[80, 89]
[80, 83, 84, 80, 75]
[87, 87]
[]
[94, 82]
[83, 95, 80, 87, 87]
[90]
[73, 85, 87, 78, 86]
[90, 86]
[95, 91, 92, 90, 92]
[77, 96]
[82, 70, 50, 84, 90]
[62]
[77, 42, 60, 60, 62]
[92, 92, 86, 96]
[75, 87, 91, 90, 87]
[88]
[80, 82, 70, 75

[75]
[67, 90, 80, 78, 77]
[96, 87]
[88, 99, 98, 96, 91]
[95, 90]
[86, 80, 84, 90, 80]
[86, 95]
[86, 91, 91, 96, 89]
[83, 88]
[87, 71, 73, 75, 72]
[96, 96, 97]
[90, 98, 95, 89, 94]
[77, 82, 90, 89, 90, 83]
[82, 92, 92, 83, 90]
[91, 90]
[66, 77, 91, 68, 81]
[88]
[76, 88, 90, 90, 76]
[80, 75, 63]
[74, 74, 77, 83, 77]
[73, 70]
[87, 66, 83, 77, 74]
[97, 88]
[86, 87, 76, 86, 82]
[78, 81, 86]
[88, 77, 88, 80, 73]
[84]
[72, 88, 90, 74, 92]
[83, 77, 88]
[80, 79, 82, 71, 73]
[92, 83]
[68, 87, 93, 58, 73]
[90, 95, 83]
[97, 76, 87, 84, 81]
[94, 95]
[84, 96, 98, 98, 95]
[81, 76]
[89, 65, 81, 72, 81]
[86]
[80, 93, 84, 74, 90]
[90]
[88, 75, 86, 83, 77]
[78, 88, 86, 86, 89, 88, 93]
[86, 97, 97, 93, 97]
[84, 91, 64]
[80, 86, 87, 82, 83]
[74, 85]
[78, 80, 85, 82, 80]
[78, 88, 99]
[82, 93, 92, 86, 76]
[95, 93]
[90, 91, 83, 83, 85]
[81, 86, 95, 80]
[76, 75, 80, 87, 91]
[62, 75, 77]
[87, 80, 80, 70]
[63, 44]
[75, 69, 77, 73, 70]
[94, 97]
[84, 92, 84, 87, 90]
[80, 85]
[70, 82, 80, 69, 81]
[77, 70]
[75, 85, 

[82, 83, 87, 82, 83]
[90]
[80, 91, 95, 83, 83]
[86]
[88, 92, 95, 80, 83]
[83, 83]
[72, 71, 68, 71, 81]
[95, 93, 75, 83, 74, 64]
[87, 60, 60, 70, 68]
[92, 90, 94, 84]
[81, 88, 92, 80, 85]
[86, 76, 66]
[67, 78, 82, 65, 80]
[77, 92]
[82, 74, 78, 85, 86]
[90]
[74, 72, 90, 76, 78]
[83]
[73, 81, 82, 78, 97]
[80, 82]
[76, 68, 57, 66]
[96, 97]
[90, 96, 95, 97, 96]
[83]
[73, 85, 88, 85, 90]
[86, 95, 85, 80]
[85, 80, 83, 82, 86]
[76, 98, 90]
[80, 86, 85, 82, 78]
[70]
[73, 57, 59, 50, 51]
[74, 90]
[81, 75, 81, 78, 80]
[83, 32]
[70, 90, 74, 76, 96]
[81, 65, 12]
[70, 87, 91, 75, 78]
[95, 95, 89, 95, 77, 98, 95]
[89, 98, 98, 95, 91]
[85]
[77, 85, 83, 85, 87]
[71]
[75, 84, 85, 88, 77]
[71, 80, 84]
[80, 63, 90, 72, 76]
[89]
[78, 90, 90, 82, 77]
[80, 70]
[75, 57, 87, 71, 76]
[64, 90]
[82, 83, 80, 72, 83]
[88]
[74, 93, 80, 80, 91]
[82]
[83, 73, 80, 75, 87]
[86]
[65, 85, 77, 68, 68]
[87, 79, 90]
[70, 94, 74, 82, 71]
[82, 36, 15, 74, 83]
[85, 94, 67, 94]
[94]
[78, 99, 96, 85, 91]
[65, 86, 81]
[75, 85, 94,

[81, 72, 76, 73, 80]
[81, 85, 91]
[78, 85, 85, 78, 75]
[74, 70, 87, 70]
[63, 63, 76, 80, 72]
[78]
[77, 94, 98, 80, 88]
[80]
[71, 66, 78, 75, 68]
[93, 80, 70, 76]
[80, 88, 86, 90, 76]
[92, 84, 81]
[82, 84, 65, 76, 78]
[87, 81]
[80, 96, 90, 80, 80]
[89, 81, 83, 82, 86]
[81, 88, 75, 72]
[81, 76]
[80, 80, 78, 61, 77]
[91, 77]
[81, 81, 83, 78, 75]
[90, 88]
[81, 83, 73, 77, 75]
[80, 75, 41]
[52, 72, 80, 57, 63]
[60, 65, 64]
[67, 62, 71, 33, 50]
[90, 91, 86, 90]
[87, 77, 87, 83, 86]
[70]
[70, 80, 89, 87, 77]
[19, 5, 82]
[89, 90, 80, 88, 77]
[97, 84, 93]
[95, 95, 80, 90, 94]
[90, 64, 96, 84]
[56, 94, 84, 39, 62]
[96, 96, 97]
[90, 94, 99, 96, 95]
[95, 95, 89, 95, 90, 98, 97]
[89, 98, 98, 97, 94]
[96, 95]
[87, 87, 86, 92, 92]
[94, 83]
[67, 90, 80, 85]
[73, 87, 80]
[70, 83, 84, 76, 72]
[88, 98]
[87, 88, 90, 90, 86]
[61, 74]
[92, 75, 80, 78, 67]
[85, 80, 92]
[75, 77, 82, 86, 85]
[90, 87, 83]
[88, 82, 83, 86, 88]
[95, 90, 60, 59]
[90, 95, 95, 90, 98]
[78]
[73, 90, 87, 77, 78]
[93]
[82, 80, 81, 73, 

[75, 80, 76, 71, 76]
[82, 76, 68]
[72, 68, 70, 70, 50]
[89]
[57, 87, 95, 85, 80]
[72, 90, 85]
[73, 75, 93, 91, 91]
[78, 76]
[72, 72, 87, 75, 61]
[72, 88]
[73, 68, 73, 71, 77]
[90, 99]
[80, 87, 92, 92, 88]
[88, 87]
[75, 87, 89, 81, 81]
[90, 98]
[74, 91, 90, 82, 90]
[77, 85]
[75, 80, 77, 73, 81]
[85, 89]
[71, 67, 70, 62, 72]
[80, 90]
[79, 85, 88, 84, 97]
[76, 89]
[80, 66, 85, 82, 83]
[87, 88]
[80, 72, 72, 70, 71]
[88, 67]
[72, 77, 71, 68, 81]
[66, 64, 88]
[80, 89, 87, 85, 87]
[85, 71, 88]
[82, 61, 70, 80, 80]
[87, 70, 65]
[72, 55, 71, 81, 55]
[84, 96, 89]
[80, 91, 92, 90, 81]
[91, 86, 72]
[87, 80, 86, 76]
[85]
[81, 75, 80, 70, 80]
[73]
[62, 70, 77, 60, 93]
[81]
[75, 63, 76, 90, 72]
[93, 87]
[86, 74, 75, 73, 75]
[82]
[65, 88, 85, 70]
[71, 81]
[71, 81, 78, 60, 78]
[88, 75, 94, 85]
[80, 80, 91, 70, 78]
[80]
[72, 75, 86, 65, 70]
[94]
[70, 90, 89, 91, 82]
[80, 83]
[78, 82, 85, 74, 72]
[81, 86]
[75, 60, 63, 65]
[85, 77, 71, 88]
[74, 85, 90, 94, 88]
[92]
[79, 92, 95, 76, 75]
[90, 85]
[74, 82, 8

[63, 77, 71]
[70, 72, 80, 70, 91]
[94, 97, 93, 84, 87]
[92, 88, 83, 90, 87]
[95, 90, 92]
[88, 100, 93, 88, 93]
[90, 90, 100]
[88, 83, 89, 85]
[93, 88, 90, 92]
[83, 88, 90, 90, 83]
[97]
[88, 100, 97, 98, 96]
[92, 96, 82]
[87, 88, 80, 77, 81]
[83, 83, 84, 95, 92]
[83, 83, 83, 81, 81]
[95]
[84, 93, 95, 90, 86]
[93, 88]
[75, 76, 82, 80, 77]
[88, 86, 81, 90]
[76, 82, 80, 78, 80]
[82, 85]
[72, 88, 90, 84, 80]
[82, 85, 100]
[56, 97, 99, 70, 76]
[94, 94, 85]
[73, 81, 85, 94, 80]
[85, 89, 54]
[85, 84, 85]
[91, 94, 91, 94, 87]
[93, 80, 82, 84, 80]
[85, 90, 90, 81]
[82, 63, 86, 82, 75]
[98]
[92, 84, 88, 90, 96]
[91]
[84, 87, 85, 82, 84]
[96]
[89, 96, 98, 94, 92]
[90, 93, 78]
[78, 87, 90, 83, 78]
[80]
[69, 89, 91, 90, 92]
[95, 86, 97]
[90, 97, 97, 95, 95]
[54]
[50, 85, 65]
[98, 70, 83, 67]
[68, 87, 92, 85, 83]
[70, 50, 60, 53, 88]
[90, 70, 67, 84, 69]
[96, 94, 94]
[88, 89, 92, 92, 94]
[82, 84, 73]
[84, 65, 75, 82, 71]
[50, 91]
[73, 73, 94, 77, 72]
[22, 97]
[85, 56, 85, 65, 78]
[57, 85]
[82, 92, 83

[85]
[69, 82, 78, 85, 82]
[90, 72, 72, 73, 54, 80, 53, 62, 65, 65]
[92]
[90]
[73, 84, 82, 80, 85]
[80, 83, 86]
[89, 93, 80, 84, 91]
[93, 77]
[85, 75, 88, 77, 75]
[92, 88]
[70, 87, 76, 70]
[81, 84]
[79, 89, 92, 81, 92]
[79, 84]
[61, 86, 88, 73, 68]
[86, 87]
[86, 91, 94, 77, 94]
[50, 24]
[66, 44, 69, 20]
[91, 98]
[95, 94, 96, 91, 90]
[94, 96]
[85, 90, 87, 84, 79]
[58]
[68, 55, 67, 57, 70]
[80, 78]
[84, 92, 80, 73, 72]
[90, 90, 88]
[90, 90, 90, 86, 82]
[77, 77, 79, 69, 82, 79]
[79, 80, 80, 74, 79]
[84]
[86, 90, 87, 86, 84]
[55, 94]
[80, 80, 83, 72, 75]
[74, 91]
[78, 92, 89, 80, 88]
[70, 86]
[78, 74, 85, 83, 70]
[88]
[70, 83, 83, 71, 65]
[65, 94, 80, 75, 72]
[82, 65, 77, 81, 88]
[98, 89]
[90, 85, 94, 85, 90]
[70, 71]
[67, 65, 66, 66, 70]
[85, 90, 90, 82, 93]
[80, 81, 77, 76, 78]
[75, 96]
[78, 65, 70, 88, 76]
[73, 96, 91]
[87, 90, 95, 69, 82]
[85]
[70, 68, 76, 77, 75]
[92, 87]
[84, 84, 87, 84, 84]
[80, 72, 86]
[70, 77, 91, 65, 75]
[85, 65]
[80, 51, 81, 56, 72]
[70]
[90, 82, 88, 87, 83]
[85]

[17, 75, 68, 80]
[82, 45, 61, 71, 10]
[58, 83]
[60, 64, 93, 80, 80]
[93, 94]
[84, 93, 91, 77, 86]
[78, 83, 87, 70, 70]
[82, 87, 85, 90, 85]
[77]
[30, 40, 50, 40, 51]
[78]
[62, 86, 90, 70, 83]
[82]
[70, 65, 94, 85, 80]
[89, 90, 63]
[76, 90, 92, 84, 83]
[73, 77, 28]
[71, 20, 51, 20]
[80, 73, 97]
[79, 76, 76, 85, 76]
[51, 77]
[63, 52, 55, 78, 76]
[85, 73]
[78, 83, 68, 84, 80]
[81]
[78, 76, 86, 70, 71]
[75, 83, 85, 78, 88]
[75, 80, 93, 82, 74]
[90, 60, 93]
[80, 68, 84, 85, 81]
[86, 84, 80]
[79, 86, 70, 71]
[88]
[80, 84, 85, 67, 85]
[71, 80, 100, 93, 88]
[88, 98, 94, 95, 90]
[79]
[88, 92, 93, 85, 90]
[52, 84]
[80, 60, 75, 75, 65]
[90]
[91, 93, 92]
[90, 86]
[83, 89, 85, 80, 88]
[53]
[70, 62, 61, 60, 76]
[90, 77]
[74, 89, 89, 83, 94]
[66]
[70, 55, 63, 77, 75]
[90, 85]
[85, 96, 85, 83, 86]
[96, 85]
[80, 80, 83, 70, 81]
[50]
[69, 0, 61]
[72]
[79, 86, 80, 85, 95]
[67, 86, 55]
[62, 78, 83, 74, 67]
[80, 90]
[68, 36, 43, 60, 78]
[60, 63, 50, 70]
[68, 75, 82, 67, 70]
[90, 67]
[80, 83, 73, 65, 74]
[7

### Some min/max/average output for both student types:

In [184]:
# Students that received and accepted a McMaster offer:
acceptedMcMaster = copy[mcmasterVector]
# print(acceptedMcMaster)
# print(acceptedMcMaster[(acceptedMcMaster["WAVERG2"] > 0) & (acceptedMcMaster["CONFPR"] == "SIA")].min())
print("Students who accepted an offer from McMaster:")
print("Mean values for each column:")
print(acceptedMcMaster[(acceptedMcMaster["AVG"] > 0) & (acceptedMcMaster["CONFPR"] == "SIA")].mean())
print("\nMax values for each column:")
print(acceptedMcMaster[(acceptedMcMaster["AVG"] > 0) & (acceptedMcMaster["CONFPR"] == "SIA")].max())
print("\nLowest 5 weighted average:")
print(acceptedMcMaster[(acceptedMcMaster["AVG"] > 0) & (acceptedMcMaster["CONFPR"] == "SI")].nsmallest(20,"WAVERG2"))
print("\nShape of acceptedMcMaster df (number of students that accepted):")
print(acceptedMcMaster.shape)

# Students that didn't accept a mcmaster offer:
print('\n\n\n\n')
notMcMaster = copy[notmcmasterVector]
# print(notMcMaster)
print("Min average of students who didn't accept mac offer")
print(notMcMaster[notMcMaster["AVG"] > 0].min())
print("\nMax average of students who didn't accept mac offer")
print(notMcMaster[notMcMaster["AVG"] > 0].max())
print("\nAverage average of students who didn't accept mac offer")
print(notMcMaster[notMcMaster["AVG"] > 0].mean())


Students who accepted an offer from McMaster:
Mean values for each column:
GEND           119.842932
SCHOOL       10697.694590
CONFUNI        196.000000
CONFCHOIC        2.246073
WAVERG2        864.694590
AVG             83.816172
PREF             2.246073
ACCEPTED         1.000000
dtype: float64

Max values for each column:
GEND             120
SCHOOL         11245
CONFUNI          196
CONFPR           SIA
CONFCHOIC         10
WAVERG2          995
AVG          98.6667
PREF              10
ACCEPTED           1
dtype: object

Lowest 5 weighted average:
      GEND  SCHOOL ZIP3  CONFUNI CONFPR  CONFCHOIC  WAVERG2        AVG  PREF  \
1609   120   10698  N4D    196.0     SI        1.0      800  80.000000     1   
2484   120   10780  M6F    196.0     SI        3.0      800  80.000000     3   
2835   120   10816  M6O    196.0     SI        3.0      800  80.000000     3   
3093   120   10267  M8D    196.0     SI        4.0      800  80.000000     4   
3695   120   10719  N7I    196.0     SI   

In [185]:
# print(acceptedMcMaster.shape)
# print(notMcMaster.shape)
print('McMaster students sample:')
print(acceptedMcMaster.head(10))
print('\n\n')
print('All Students sample:')
print(allStudents.head(20))
# print(notMcMaster.head(50))

McMaster students sample:
    GEND  SCHOOL ZIP3  CONFUNI CONFPR  CONFCHOIC  WAVERG2        AVG  PREF  \
0    120   11140  N4I    196.0    SIA        5.0      852  83.666667     5   
16   120   10302  M7N    196.0     SI        2.0      882  86.166667     2   
19   119   10214  M1Q    196.0     SI        2.0      932  91.833333     2   
27   119   11148  M6N    196.0    SIA        1.0      885  88.500000     1   
39   120   11140  N4I    196.0    SIA        7.0      858  85.166667     7   
40   120   11140  N4I    196.0    SIA        5.0      853  85.333333     5   
45   120   10737  O2F    196.0     SI        2.0      878  86.500000     2   
59   120   11219  O4B    196.0     SI        2.0      842  84.166667     2   
60   119   11219  O4B    196.0    SIA        2.0      828  82.833333     2   
69   120   11090  M5K    196.0     SI        4.0      813  68.500000     4   

    ACCEPTED  
0          1  
16         1  
19         1  
27         1  
39         1  
40         1  
45        

## Exporting the dataframes to CSV

In [186]:
allStudents.to_csv('cleaned_data/allStudents_'+str(year)+'.csv',index=False)
acceptedMcMaster.to_csv('cleaned_data/acceptedOurUni_'+str(year)+'.csv',index=False)
notMcMaster.to_csv('cleaned_data/didntAccept_'+str(year)+'.csv',index=False)
print(year)

12
