# Initial Data Analysis - Capstone Project

In [33]:
# Imports
import os
import csv
import numpy as np
import pandas as pd
from pandas import Int64Index
import pickle
import sys
import math

In [34]:
def save_as_pkl(object, path):
	pickle.dump(object, open(path, "wb"))

def load_pkl(path):
	obj = pickle.load(open(path, "rb"))
	return obj

### Preprocessing and loading the data

In [35]:
# Processing the .csvs into dataframes and saving them as pickles for easier
# loading on future runs
files = [f for f in os.listdir("data/") if f.split('1')[0] == 'file']
for file in files:
    print ('\n'+file)
    fName = file.split('.')[0]
    df = pd.read_csv('data/' + file, delimiter=',', na_values=['NA'])
#     df["Id"] = df.reset_index().index
#     df.set_index("Id")
    print(df.shape)
    save_as_pkl(df, 'pickles/'+fName+'.pkl')


file12.csv
(4286, 392)

file13.csv
(4638, 392)

file11.csv
(3958, 392)

file10.csv
(3979, 392)

file14.csv
(4836, 392)

file15.csv
(5557, 392)

file17.csv
(5207, 392)

file16.csv
(5456, 392)


In [36]:
## Loading preprocessed dataframes
pklFiles = [f for f in os.listdir("pickles/") if f.split('1')[0] == 'file']
for file in pklFiles:
	df = load_pkl("pickles/" + file)
	print(df.shape)

(4286, 392)
(4638, 392)
(3958, 392)
(3979, 392)
(4836, 392)
(5557, 392)
(5207, 392)
(5456, 392)


# Change Year Loaded Here:

In [37]:
# YEAR:
year = 17
fname = "file"+str(year)+".pkl"
df = load_pkl("pickles/"+fname)

### Starting to analyze the data

In [38]:
# First 30 columns:
firstPart = df.iloc[:,0:30]
# print(firstPart.columns.values)

# Columns 30 to 65: (top 6 course marks)
top6CourseMarks = df.iloc[:,30:66]
# print(top6CourseMarks.columns.values)

# cols = []
# for column in top6CourseMarks[top6CourseMarks.columns[2::3]]:
#     cols.append(df[column])
# hstack = pd.concat([x for x in cols], axis=1)
# hstack.fillna(0, inplace=True)
# hstack['Sum'] = hstack.mean(axis=1)
# print(hstack)

# Columns 66 to 71:
middle = df.iloc[:,66:72]
print(middle["WAVERG1"], middle["WAVERG2"])
# print(middle.columns.values)

# Columns 72 to 391:
choices = df.iloc[:, 72:]
# print(choices.columns.values)

0         0
1         0
2         0
3         0
4         0
       ... 
5202    813
5203    820
5204      0
5205      0
5206      0
Name: WAVERG1, Length: 5207, dtype: int64 0       895
1       833
2       830
3       965
4       913
       ... 
5202    813
5203    820
5204    905
5205      0
5206    890
Name: WAVERG2, Length: 5207, dtype: int64


In [39]:
print(top6CourseMarks.head())
print(top6CourseMarks.columns)

  SECORCOD1  SECORC1  SECORM1 SECORCOD2  SECORC2  SECORM2 SECORCOD3  SECORC3  \
0     CGW4U      100     82.0     CHI4U    100.0     58.0     ENG4U    100.0   
1     CGW4U      100     83.0     CHI4U    100.0     71.0     ENG4U    100.0   
2     ENG4U      100     82.0     MCV4U    100.0     80.0     MDM4U    100.0   
3     BAT4M      100     96.0     ENG4U    100.0     91.0     HRE4M    100.0   
4     ENG4U      100     86.0     MCV4U    100.0     92.0     MDM4U    100.0   

   SECORM3 SECORCOD4  ...  SECORM9  SECORCOD10 SECORC10  SECORM10  SECORCOD11  \
0     80.0     MCV4U  ...      NaN         NaN      NaN       NaN         NaN   
1     76.0     MCV4U  ...      NaN         NaN      NaN       NaN         NaN   
2     84.0     MHF4U  ...      NaN         NaN      NaN       NaN         NaN   
3     90.0     MCV4U  ...     95.0         NaN      NaN       NaN         NaN   
4     93.0     MHF4U  ...      NaN         NaN      NaN       NaN         NaN   

  SECORC11  SECORM11  SECORCOD12

In [40]:
courseCodeCols = ['SECORCOD' + str(i+1) for i in range(12)]
maxScore = ['SECORC' + str(i+1) for i in range(12)] #I'm assuming that's what this column is? (useless)
studentScoreCols = ['SECORM' + str(i+1) for i in range(12)]
goalCourses = ['MHF4U', 'MCV4U', 'ENG4U', 'SCH4U', 'SPH4U']

# Advanced functions: MHF4U
# Calculus: MCV4U
# English: ENG4U
# Chemistry: SCH4U
# Physics: SPH4U
# Next highest mark, anything

In [41]:
def averageFinder(allStudentMarks):
    studentAverages = []
    for index, row in allStudentMarks.iterrows(): 
        try:
            studentCourseCodes = [row[courseCodeCol] for courseCodeCol in courseCodeCols]
            studentGrades = [row[studentScoreCol] for studentScoreCol in studentScoreCols]
            studentCourseCodes = [val for val in studentCourseCodes if type(val) == str]
            studentGrades = [val for val in studentGrades if str(val) != 'nan']

            necessaryCourses = [int(studentGrades[i]) for i in range(len(studentGrades)) if studentCourseCodes[i] in goalCourses]
            remainder = [int(studentGrades[i]) for i in range(len(studentGrades)) if studentCourseCodes[i] not in goalCourses]

            # Some students don't seem to have 6 courses.
            # Set their average to -1 in this case.
            print(remainder)
            print(necessaryCourses)
            average = (max(remainder) + sum(necessaryCourses))/6
            studentAverages.append(average)        
        except ValueError:
#             print("Student with incorrect number of courses applied.")
            studentAverages.append(-1)
    return studentAverages
    
# print(averageFinder(top6CourseMarks))

In [42]:
# Finding the student's choice preference for the university:
# This data is found in the 'choices' dataframe.
uniChoiceCols = ['DUNI'+str(i+1) for i in range(20)]
programChoiceCols = ['DPRO1' + str(i+1) for i in range(20)]
goalUni = '196'

In [43]:
def uniChoiceFinder(studentUniChoices):
    studentAverages = []
    for index, row in studentUniChoices.iterrows():
        # Filtering student's preference for our university and our specific program.
        studentCourseCodes = [i+1 for i in range(20) if (row['DUNI'+str(i+1)] == 196
                                                                                and (row['DPRO'+str(i+1)] == 'SIA' 
                                                                                or row['DPRO'+str(i+1)] == 'SI'))]
        studentAverages.append(studentCourseCodes[0])
    return studentAverages

In [44]:
def acceptedOurUni(firstPart):
    acceptedArray = []
    for index, row in firstPart.iterrows():
        # Filtering student's preference for our university and our specific program.
        studentAccepted = int(row['CONFUNI'] == 196)
        acceptedArray.append(studentAccepted)
    return acceptedArray

# Determining grade cutoff

### Seperating relevant data
- Splitting data into students who accept and offer from mac and those who don't.
- Isolating 8 initial columns of interest for analysis

In [45]:
## Starting columns of interest:
# RESPROV - Province of Residence
# RESCNTY - County of Residence
# ZIP3 - Residence Postal Code (First 3 Digits)
# CONFUNI - Confirmed University (OurUni='196')
# CONFPR - Confirmed Program (OurProg='SI', OurProg_coop='SIA')
# CONFCHOIC - OUAC Confirmed Choice Preference
# WAVERG1 - Weighted Average (best 6 OAC / Senior Level current year finals)
# WAVERG2 - Weighted Average (best 6 OAC / Senior Level all year finals)
## First 6 columns are in first half, last 2 are in "Middle" dataframe.

COIsFirstHalf = ["GEND", "SCHOOL", "ZIP3", "CONFUNI", "CONFPR", "CONFCHOIC"]
COIsSecondHalf = ["WAVERG2"]

# Finding the student's average from the formula given by Dr. Franek.
averages = averageFinder(top6CourseMarks)
averagesDf = pd.DataFrame(averages, columns=['AVG'])

# Making a dataframe with student's preferences
preferences = uniChoiceFinder(choices)
preferencesDf = pd.DataFrame(preferences, columns=['PREF'])

# Dataframe with 1 for accepted our uni, 0 otherwise.
acceptedArray = acceptedOurUni(firstPart)
acceptedDf = pd.DataFrame(acceptedArray, columns=['ACCEPTED'])

# Adding columns of interest found in first half of the data
columnsOfInterest = pd.concat([firstPart[x] for x in COIsFirstHalf], axis=1)

# Adding the rest of the columns of interest
columnsOfInterest = pd.concat([columnsOfInterest] + [middle[x] for x in COIsSecondHalf] + 
                              [averagesDf] + [preferencesDf] + [acceptedDf],axis=1)
copy = columnsOfInterest.copy()

mcmasterVector = copy["CONFUNI"] == 196
notmcmasterVector = copy["CONFUNI"] != 196
allStudents = copy

[82, 58]
[80, 95, 94, 96, 90]
[83, 71]
[76, 85, 85, 82, 89]
[84]
[82, 80, 86, 82, 84]
[96, 90, 98, 93]
[91, 97, 97, 96, 95]
[93]
[86, 92, 95, 93, 89]
[97, 99, 99]
[90, 98, 98, 99, 98]
[98]
[86, 97, 99, 98, 99]
[85, 86, 85, 92]
[83, 91, 93, 84, 87]
[82, 84]
[91, 81, 84, 77, 85]
[93, 97]
[91, 99, 100, 96, 99]
[98, 78]
[77, 98, 94, 99, 97]
[98, 98]
[93, 93, 98, 96, 95]
[70]
[78, 87, 92, 93, 85]
[77, 81]
[78, 76, 86, 80, 88]
[94]
[93, 95, 97, 96, 95]
[92, 84]
[90, 94, 92, 91, 87]
[91]
[85, 88, 89, 83, 90]
[99]
[88, 85, 86, 80, 85]
[90, 97, 99]
[100, 95, 94, 92, 93]
[95, 90]
[85, 97, 84, 94, 95]
[93, 93, 94, 92, 94]
[94, 94, 94, 94, 88]
[99, 87]
[93, 94, 97, 90, 98]
[94, 93, 88]
[94, 94, 92, 87, 86]
[91, 70, 72]
[70, 90, 94, 92, 83]
[90, 86]
[95, 89, 74, 80, 74]
[96, 94, 93]
[85, 97, 99, 93, 95]
[98, 99]
[92, 98, 100, 97, 95]
[87, 96]
[90, 82, 93, 91, 85]
[90]
[72, 73, 83, 78, 75]
[74]
[74, 67, 69, 74, 80]
[95, 98, 93]
[93, 98, 96, 94, 90]
[99, 96]
[90, 95, 90, 95, 95]
[98, 95]
[100, 100, 1

[97, 95, 93]
[94, 96, 97, 96, 97]
[91, 89, 88, 94, 88, 90, 95, 88, 94]
[]
[97, 86, 100, 99]
[90, 96, 94, 94, 92]
[95, 88, 95]
[92, 95, 94, 93, 96]
[98]
[86, 93, 94, 94, 99]
[83, 96]
[83, 93, 86, 90, 90]
[90, 95]
[86, 97, 91, 85, 88]
[83, 90]
[92, 84, 82, 92, 92]
[77, 91, 85]
[83, 90, 93, 90, 86]
[95, 96, 95, 99]
[94, 96, 95, 91, 98]
[98, 96, 91]
[89, 93, 93, 91, 94]
[96]
[78, 95, 98, 80, 91]
[92, 90]
[80, 91, 91, 80, 86]
[94, 78]
[82, 83, 88, 88, 80]
[98, 95, 84, 98, 99, 98, 97]
[95, 94, 97, 98, 98]
[78, 85, 99]
[80, 98, 96, 92, 92]
[96, 94]
[87, 90, 92, 93, 92]
[95, 100]
[90, 98, 100, 99, 98]
[90]
[82, 92, 90, 80, 91]
[95, 95, 96, 76]
[91, 83, 90, 82, 82]
[98, 96, 91]
[93, 95, 95, 93, 89]
[93, 100, 97, 90]
[90, 98, 98, 96, 97]
[87, 81, 86, 89, 91, 85, 84]
[]
[90, 96, 97, 90]
[84, 94, 97, 99, 97]
[85, 78]
[75, 82, 81, 87]
[94]
[85, 88, 91, 90, 84]
[92, 92]
[86, 87, 93, 89, 97]
[84]
[80, 96, 96, 85, 93]
[91, 96, 84, 88]
[90, 88, 91, 80, 68]
[99]
[75, 94, 92, 94, 81]
[92, 84, 79, 91, 94]

[92, 90, 93, 77, 93]
[98, 88, 91]
[80, 84, 90, 86, 90]
[85]
[78, 86, 89, 80, 86]
[85, 90, 93]
[92, 74, 78, 90, 92]
[90, 84, 92, 92, 95]
[75, 93, 95, 90, 86]
[92, 80, 99]
[86, 90, 94, 86, 90]
[90, 94, 96, 88]
[90, 90, 87, 80, 86]
[97]
[91, 97, 96, 97, 94]
[95, 96]
[70, 78, 93, 78, 82]
[94, 96]
[86, 90, 95, 95, 90]
[84]
[90, 83, 99, 90, 88]
[98, 94]
[92, 91, 96, 97, 96]
[97, 89, 95, 95]
[90, 96, 96, 95, 97]
[97, 97, 92, 83, 94, 99]
[94, 95, 96, 96, 96]
[86, 90, 93]
[90, 83, 76, 85, 96]
[90, 94, 95]
[80, 94, 93, 86, 83]
[94]
[78, 67, 86, 80, 84]
[96, 84, 84, 83, 84, 93]
[92, 99, 99, 92, 97]
[92, 92, 84]
[81, 84, 90, 84, 80]
[85, 98, 98]
[92, 99, 98, 96, 96]
[12, 89]
[75, 94, 88, 86, 91]
[93, 91]
[85, 74, 86, 78, 81]
[92]
[94, 91, 89, 85, 90]
[93]
[80, 84, 93, 85, 87]
[96, 91, 94, 98, 96, 95]
[95, 96, 97, 96, 98]
[95, 90, 82, 90]
[83, 88, 85, 82, 83]
[93, 96, 99]
[90, 92, 95, 94, 91]
[90, 90, 92]
[90, 87, 93, 88, 86]
[91, 98, 90]
[86, 90, 97, 88, 91]
[88, 91, 88]
[85, 94, 93, 90, 91]
[83, 

[87, 97, 95, 94, 94]
[86, 88, 96, 87, 95]
[96, 98, 80, 87]
[90, 98]
[90, 82, 87, 90, 83]
[87]
[80, 98, 98, 97, 89]
[96, 94, 99]
[90, 93, 91, 90]
[80]
[88, 93, 88, 90, 91]
[94]
[85, 96, 96, 97, 96]
[95, 93, 90, 94]
[88, 99, 99, 93, 92]
[92, 100]
[93, 100, 99, 98, 91]
[95, 94, 98]
[95, 96, 98, 94, 99]
[92, 98]
[88, 90, 86, 86, 87]
[93, 96, 95, 95, 86]
[95, 94, 76, 91]
[70, 83]
[68, 83, 85, 86]
[88, 91, 90, 88]
[88, 93, 96, 93, 91]
[87, 91]
[80, 88, 95, 73, 90]
[97]
[96, 99, 97, 97, 95]
[78, 93]
[87, 97, 95, 93, 95]
[96, 87]
[90, 92, 91, 84, 83]
[90, 86]
[81, 82, 94, 90, 75]
[80, 96, 86]
[81, 87, 80, 83, 88]
[95, 98, 86, 95]
[91, 93, 91, 92, 90]
[96, 92]
[90, 92, 98, 93, 95]
[92, 77]
[81, 81, 88, 84, 81]
[72]
[90, 99, 99, 99, 93]
[91, 91]
[80, 82, 86, 74, 85]
[95, 90, 97, 83]
[90, 92, 94, 91, 87]
[84, 95, 94]
[87, 90, 83, 88, 90]
[80, 86]
[81, 80, 86, 83, 76]
[90, 99, 82, 94]
[95, 90, 98, 96, 90]
[87, 91, 90, 94]
[89, 90, 89, 92, 83]
[90, 93]
[90, 90, 92, 90, 83]
[94, 89]
[84, 82, 90, 86,

[88, 90, 80, 88]
[86, 85, 90, 90, 82]
[97, 90, 94]
[91, 94, 90, 86, 82]
[82, 90]
[86, 85, 92, 90, 90]
[99, 99, 93, 90]
[88, 98, 96, 94, 99]
[92, 95]
[81, 88, 86, 87, 92]
[93, 98, 99]
[81, 92, 99, 79, 96]
[87, 88]
[80, 81, 87, 71, 74]
[90, 98]
[97, 90, 99, 88, 98]
[92, 96]
[95, 93, 98, 93, 95]
[86, 90, 85]
[88, 90, 93, 80, 87]
[95]
[84, 92, 90, 78, 92]
[97, 97]
[88, 81, 85, 90, 84]
[72]
[75, 80, 88, 80, 87]
[88, 91]
[90, 93, 98, 94, 95]
[95, 94]
[87, 96, 94, 92, 77]
[97, 86, 83]
[96, 95, 99, 93, 83]
[91]
[90, 93, 96, 90, 95]
[98, 94, 97, 91, 99, 98, 97]
[94, 96, 97, 97, 86]
[90, 86]
[90, 71, 80, 86, 78]
[98, 97]
[83, 98, 99, 93, 96]
[89, 94, 91]
[91, 96, 98, 94, 97]
[93, 92, 92, 94]
[96, 97, 95, 93, 95]
[84, 87, 93, 81]
[80, 90, 92, 84, 90]
[95, 95, 90, 93]
[90, 96, 98, 97, 98]
[85, 91, 91, 90, 94, 92]
[87, 87, 93, 95, 93]
[92, 97, 95, 96, 94]
[94, 94, 90, 95, 91]
[90, 92, 92, 95]
[85, 93, 99, 93, 97]
[98, 90, 88]
[82, 94, 90, 87, 84]
[97, 91, 94]
[94, 95, 98, 96, 96]
[91, 90, 92, 81, 8

[91, 92]
[88, 91, 97, 87, 96]
[93, 90, 85]
[92, 80]
[92]
[72, 83, 86, 70, 89]
[90, 99]
[76, 66, 77, 83, 84]
[84, 90]
[82, 83, 91, 81, 96]
[93, 90, 88]
[85, 96, 93, 93, 94]
[59, 93, 88]
[81, 80, 84, 82, 80]
[93, 95, 94]
[90, 94, 95, 95, 95]
[91]
[81, 94, 99, 92, 93]
[84, 88, 87]
[87, 88, 91, 87, 92]
[99, 93, 98]
[92, 95, 98, 90, 93]
[91, 98, 90]
[77, 81, 81, 81, 70]
[88]
[80, 83, 85, 83, 88]
[90, 51]
[80, 91, 95, 90, 80]
[95, 85]
[83, 88, 88, 83, 90]
[90, 85]
[93, 80, 88, 97, 84]
[96]
[80, 98, 99, 90, 90]
[70, 95]
[96, 71, 99, 60, 96]
[85, 80]
[84, 76, 80, 75]
[97, 87, 100]
[86, 94, 97, 81, 86]
[74, 90]
[83, 63, 88, 94, 70]
[80, 93]
[71, 92, 99, 82, 85]
[96, 94, 81]
[86, 85, 91, 84]
[94, 97]
[80, 93, 93, 86, 87]
[99, 97]
[88, 98, 98, 98, 90]
[87, 86]
[86, 76, 75, 80, 80]
[98, 90, 91, 91]
[88, 90, 88, 83, 87]
[97, 92, 94, 99, 96]
[92, 98, 98, 99, 100]
[90, 88]
[93, 95, 96, 91, 87]
[90, 83]
[90, 87, 90, 83, 85]
[94, 68, 73]
[80, 90, 82, 90, 81]
[98, 91, 80]
[85, 86, 90, 82, 91]
[95, 99, 9

[90, 70, 90, 93]
[90, 93, 91, 80, 86]
[93, 93, 96]
[87, 93, 96, 90, 90]
[93]
[87, 88, 92, 94, 93]
[96, 95]
[88, 87, 81, 82, 91]
[96, 97, 96]
[91, 92, 98, 90, 95]
[95, 90, 90]
[82, 80, 86, 65]
[90, 91]
[77, 82, 91, 90, 90]
[90, 90]
[78, 81, 81, 73, 85]
[76, 93]
[81, 86, 92, 95, 88]
[98, 93, 97, 95, 98, 88, 98, 98]
[97, 95, 95, 87]
[96, 85, 92, 98]
[100, 98, 100, 93, 90]
[86, 82, 84, 77]
[85, 82, 88, 66, 71]
[70]
[64, 51, 50, 50, 51]
[96, 90]
[90, 91, 91, 84, 83]
[96]
[75, 88, 97, 96, 98]
[86, 99, 94, 83, 92, 92, 84]
[94, 96, 96, 94, 97]
[90, 94]
[93, 96, 98, 96, 93]
[97, 54, 71]
[91, 91, 86, 95, 76]
[93, 92]
[92, 90, 88, 87, 90]
[90, 74, 88]
[85, 80, 87, 80, 88]
[88, 99, 96]
[86, 99, 99, 97, 94]
[90, 91, 83]
[75, 86, 90, 78, 91]
[83, 81]
[91, 85, 83, 87, 93]
[99]
[93, 80, 90, 89, 75]
[70, 80]
[80, 81, 85, 75, 86]
[98, 90, 93, 92]
[87, 96, 94, 91, 91]
[94, 93, 92]
[93, 91, 91, 91, 90]
[82, 82, 98]
[90, 98, 97, 98, 98]
[95, 95, 96]
[80, 92, 91, 83, 82]
[95, 87, 84, 97]
[88, 90, 94, 90, 90

[97]
[94, 99, 98, 93, 70]
[97, 96, 96, 97, 90, 98, 98]
[96, 99, 95, 98, 95]
[93, 92, 82]
[85, 76, 77, 80, 82]
[95, 97, 96, 90]
[86, 97, 97, 95, 93]
[91, 95]
[90, 88, 87, 83, 85]
[94, 89, 95]
[88, 91, 95, 90, 95]
[91, 93, 83]
[93, 98, 98, 95, 90]
[97, 87, 97]
[96, 80, 85, 80, 79]
[91, 94, 86, 97]
[93, 95, 96, 83, 90]
[73, 80, 86, 76, 77]
[80, 93, 86, 77, 71]
[55, 84]
[75, 88, 80, 81, 85]
[88, 94, 100, 82]
[82, 88, 94, 76, 90]
[91, 98, 92]
[98, 90, 91, 92, 91]
[96, 96, 85, 99, 100, 98]
[96, 98, 97, 98, 99]
[80]
[71, 90, 90, 88, 80]
[98, 91, 91, 90]
[81, 93, 96, 92, 94]
[95, 92, 93, 96, 96, 95]
[95, 94, 93, 91, 90]
[94, 51]
[81, 87, 92, 95, 96]
[86, 81, 79]
[90, 80, 90, 96, 81]
[93, 93, 97, 93]
[96, 94, 98, 95, 96]
[84, 90]
[92, 85, 87, 90, 88]
[96, 96, 92, 99, 96, 99, 95]
[92, 98, 98, 95, 95]
[85, 95, 87]
[80, 89, 95, 88, 87]
[99, 90, 97]
[90, 95, 90, 94, 92]
[95, 86, 94, 86, 94, 95]
[90, 87, 85, 84, 95]
[99, 97, 99]
[97, 97, 100, 99, 99]
[67, 86, 97, 73, 86, 82, 80]
[92, 94, 94, 76]
[96

[90, 88, 91]
[87, 98, 92, 86, 82]
[90, 86, 93]
[85, 83, 80, 86, 83]
[97, 90, 100, 93]
[90, 97, 99, 97, 95]
[91, 72, 86]
[94, 87, 90, 90, 71]
[90, 86]
[86, 97, 94, 94, 93]
[86, 80]
[70, 91, 93, 71, 91]
[91]
[90, 95, 92, 91, 93]
[90, 83]
[84, 86, 92, 81, 80]
[83]
[82, 90, 90, 90, 83]
[82, 95, 59, 84]
[80, 83, 85, 74, 88]
[88]
[86, 93, 91, 88, 85]
[98, 93, 100, 95]
[97, 99, 99, 95, 97]
[71, 79, 87]
[85, 80, 77, 82]
[84, 72, 78, 90, 90]
[84, 93, 88, 93, 97]
[80, 90, 83]
[86, 93, 83, 88, 90]
[95, 93, 85]
[86, 100, 99, 90, 95]
[92, 97, 87, 88]
[97, 96, 88, 77, 90]
[98]
[88, 97, 98, 98, 96]
[80]
[66, 85, 75, 72, 76]
[56, 86, 90, 93]
[90, 90, 98, 91, 90]
[80, 90]
[78, 93, 94, 92, 90]
[83, 76, 75]
[96, 84, 70, 91]
[92]
[96, 95, 88, 85, 89]
[96, 98, 95]
[84, 85, 95, 92, 90]
[88, 92, 95, 86]
[93, 91, 97, 93, 82]
[77]
[85, 85, 79, 82, 89]
[91, 93, 92, 92]
[91, 90, 81, 95, 90]
[93, 98, 85]
[93, 94, 95, 92, 90]
[91, 95, 95]
[93, 92, 88, 87, 94]
[93, 90, 93, 85]
[94, 88, 91, 90, 84]
[94, 89, 83, 92, 

[90, 82]
[78, 92, 92, 86, 87]
[87, 93, 91, 80]
[93, 91, 92, 88, 90]
[88, 94, 90, 95]
[88, 100, 74, 89, 87]
[83]
[85, 90, 85, 92, 82]
[83]
[78, 87, 95, 90, 90]
[92, 92, 92, 95, 85, 96]
[90, 97, 97, 96, 100]
[98, 96, 98, 94]
[91, 91, 95, 86, 93]
[93, 94, 88]
[81, 92, 99, 91, 68]
[96, 60, 80, 82]
[87, 76, 80, 50, 85]
[72, 52]
[79, 75, 73, 80]
[70, 87]
[76, 91, 87, 92, 86]
[88, 88, 96]
[90, 91, 87, 78, 87]
[91, 91, 88]
[88, 98, 95, 95, 91]
[86, 80, 93]
[88, 94, 95, 90, 77]
[88, 95, 99]
[71, 97, 92, 84, 84]
[90, 94, 87, 90, 97]
[93, 92, 91, 91, 90]
[92, 88]
[90, 97, 94, 90, 92]
[99, 98]
[87, 92, 95, 90, 88]
[94]
[80, 99, 94, 93, 99]
[88, 81, 96, 83]
[85, 90, 93, 93, 90]
[96, 98, 95]
[91, 93, 92, 91, 91]
[85, 90, 97]
[81, 88, 80, 70, 82]
[100]
[80, 90, 93, 83, 89]
[88, 94, 96, 87, 98, 98, 97]
[95, 98, 95, 97, 89]
[90, 88, 95, 94]
[92, 92, 93, 95, 94]
[85, 86, 86, 90]
[80, 83, 88, 84, 78]
[98, 94, 90]
[88, 92, 91, 84, 91]
[78]
[78, 73, 70, 77, 75]
[99, 94, 92]
[87, 98, 94, 92, 87]
[88, 92]
[8

[87]
[85, 95, 93, 83, 87]
[88, 83, 80, 91]
[93, 74, 87, 80, 90]
[81, 93, 89]
[90, 85, 84, 82, 81]
[98, 95, 96]
[90, 99, 99, 94, 97]
[56, 60]
[90, 85, 83, 85, 80]
[95, 80]
[90, 82, 98, 91, 85]
[95, 90, 78, 88, 96]
[96, 87, 94, 90, 82]
[93, 86, 93, 94]
[82, 87, 90, 91, 90]
[97, 96, 88, 95, 73, 90]
[88, 95, 95, 94, 91]
[91, 96]
[85, 95, 96, 90, 87]
[85, 86]
[83, 83, 76, 80, 78]
[74, 73]
[76, 55, 75, 81, 86]
[88]
[87, 69, 78, 80, 78]
[91, 94, 94]
[75, 72, 84, 72]
[90]
[90, 90, 90, 90, 92]
[96, 96]
[91, 97, 92, 94, 99]
[97, 97]
[90, 95, 93, 97, 97]
[95, 91, 92, 92, 93, 93, 87, 100, 100, 95, 100]
[]
[88, 97, 93]
[88, 94, 97, 91, 87]
[95, 95]
[85, 87, 95, 81, 85]
[91, 70]
[87, 87, 90, 82, 92]
[91]
[88, 83, 81, 87, 84]
[98, 95]
[90, 86, 82, 85, 83]
[83, 95, 82, 96]
[90, 98, 94, 97, 96]
[92, 92, 93, 78, 78, 92, 97]
[88, 99, 99, 96, 99]
[90, 98, 95]
[90, 97, 97, 93, 95]
[91, 95]
[88, 97, 95, 91, 93]
[95, 88]
[86, 94, 92, 93, 83]
[90, 70]
[71, 68, 82, 76, 74]
[96, 94, 61, 78, 88, 78]
[78, 83, 83,

[93, 80, 84]
[77, 75, 77, 70, 85]
[90, 80]
[80, 88, 91, 85, 94]
[95]
[80, 94, 90, 77, 85]
[91]
[72, 88, 85, 85, 86]
[85, 84, 95]
[80, 73, 70, 80, 81]
[75, 90, 64, 83]
[87, 73, 94, 83, 81]
[90, 74]
[86, 94, 92, 92, 86]
[87, 90, 96]
[87, 89, 89, 93, 88]
[92]
[80, 78, 83, 80, 90]
[70, 86, 84]
[83, 62, 83, 85, 67]
[92, 91]
[86, 89, 91, 88, 88]
[81, 90]
[88, 76, 82, 88, 81]
[95, 92, 94]
[95, 95, 96, 87, 93]
[91, 82, 85, 85, 90]
[81, 91, 88, 88, 86]
[96, 96]
[87, 91, 96, 95, 98]
[91, 92]
[80, 90, 87, 80, 82]
[62, 77, 80]
[92, 84, 86, 92, 91]
[94, 86, 91, 58, 95, 82, 95, 94]
[83, 84, 85, 94]
[95, 95, 95, 88, 88, 89, 100, 100, 97, 100]
[]
[91]
[71, 67, 83, 75, 75]
[99, 96, 99]
[85, 90, 95, 90, 91]
[90]
[90, 80, 81, 91, 95]
[56, 91]
[74, 92, 90, 80, 80]
[82]
[73, 64, 85, 72, 70]
[92, 87]
[83, 80, 84, 81, 87]
[88, 71, 79]
[90, 76, 88, 67, 80]
[93, 84, 87]
[84, 87, 91, 75, 82]
[100, 97, 91]
[93, 88, 86, 91, 85]
[90, 90, 85]
[85, 78, 88, 87, 84]
[90, 94, 96]
[92, 99, 97, 98, 95]
[82, 82, 92]
[86, 

[87, 74, 87, 86]
[72, 72, 75, 90, 70]
[98]
[85, 80, 90, 74, 88]
[98, 95]
[75, 98, 96, 92, 94]
[88, 95]
[93, 92, 92, 94, 96]
[74, 98]
[90, 90, 90, 90, 83]
[87, 84]
[80, 81, 80, 78, 84]
[97, 100, 95, 93]
[95, 100, 100, 87, 98]
[78, 66]
[80, 69, 79, 74, 80]
[84]
[75, 96, 97, 90, 96]
[65, 74, 76, 92, 86, 90, 83]
[76, 88, 68, 83]
[89, 90, 92, 97, 92]
[93, 92, 94, 93, 91]
[93]
[83, 86, 82, 80, 80]
[86, 93]
[88, 89, 95, 85, 90]
[86, 80, 93, 87, 58, 58]
[84, 92, 95, 91, 84]
[90, 84, 74, 85]
[80, 89, 97, 89, 93]
[99, 95, 99]
[86, 96, 99, 97, 97]
[91, 91, 96]
[82, 96, 93, 88, 91]
[86, 83]
[88, 98, 76, 82, 93]
[83, 92]
[80, 87, 93, 87, 87]
[97, 87, 90, 97]
[90, 97, 95, 97, 94]
[92, 83, 85]
[91, 80, 86, 76, 76]
[85, 97]
[88, 95, 96, 84, 81]
[88, 93]
[80, 78, 80, 62, 74]
[96, 86, 94]
[85, 81, 90, 80, 80]
[99, 92]
[81, 94, 95, 77, 87]
[90]
[80, 90, 92, 87, 91]
[94, 88, 95, 96, 99]
[88, 88, 87, 84, 90]
[100]
[85, 94, 90, 97, 90]
[85, 87, 92]
[85, 82, 88, 54, 74]
[68, 95]
[80, 83, 74, 91]
[93, 95]
[97

[96, 95, 98]
[93, 96, 99, 80, 87]
[94, 88, 90]
[85, 90, 90, 87, 80]
[85, 90]
[78, 86, 85, 90, 94]
[95, 94, 97]
[82, 97, 94, 93, 97]
[90]
[80, 94, 95, 91, 87]
[66, 90, 65]
[68, 81, 50]
[70, 78, 87, 92, 84, 90, 86]
[]
[82, 71, 85]
[82, 80, 83, 88, 79]
[88, 98, 96]
[90, 96, 100, 95, 91]
[100, 88]
[70, 98, 99, 94, 91]
[90, 86, 80]
[91, 80, 83, 80, 83]
[74, 83, 85, 80]
[92, 82, 91, 77, 83]
[96, 95, 94]
[81, 92, 93, 85, 88]
[98, 97, 96, 93]
[92, 85, 99, 91, 96]
[90]
[94, 86, 87, 87, 85]
[63, 68]
[82, 82, 93, 91, 78]
[60, 86, 78]
[70, 87, 88, 83, 80]
[89, 96]
[85, 89, 98, 90, 89]
[82, 81, 96]
[81, 72, 82, 74, 78]
[90, 76]
[85, 78, 83, 73, 73]
[85]
[75, 90, 90, 88, 70]
[94, 98, 92, 92, 95]
[91, 91, 90, 83, 93]
[99]
[65, 89, 97, 80, 84]
[86, 85, 92, 91]
[90, 92, 95, 93, 82]
[91, 96, 96]
[88, 96, 98, 97, 93]
[61, 85]
[70, 90, 78, 84]
[97, 93, 98]
[88, 99, 100, 86, 98]
[91, 99, 91, 93]
[77, 98, 95, 90, 71]
[92, 85, 88]
[83, 85, 90, 90, 85]
[91]
[83, 88, 97, 75, 91]
[84]
[73, 64, 87, 80, 79]
[81, 

[93, 97, 97, 92]
[86, 92, 96, 90, 89]
[93, 97]
[78, 90, 93, 91, 92]
[92, 98, 97, 92, 100, 96, 99]
[96, 98, 99, 96, 100]
[96, 93, 93, 96]
[89, 93, 95, 80, 80]
[98, 97, 100]
[95, 98, 100, 95, 99]
[81]
[83, 84, 85, 84, 80]
[88, 100]
[85, 85, 93, 86, 91]
[82, 90, 78, 65]
[90, 78, 76, 74, 87]
[98, 97, 96, 95]
[77, 83, 88, 86, 90]
[93, 91]
[80, 98, 95, 88, 97]
[80, 85]
[82, 88, 86, 80, 80]
[96, 85]
[88, 97, 97, 91, 92]
[90]
[84, 73, 92, 81, 76]
[88, 80, 77]
[82, 97, 96, 71, 80]
[80, 81]
[80, 91, 96, 94, 92]
[87]
[86, 78, 82, 79, 81]
[95, 96]
[95, 94, 83, 88]
[96]
[88, 90, 89, 93, 87]
[95, 95, 86]
[75, 83, 84, 74, 81]
[91, 95, 95]
[74, 93, 95, 84, 92]
[97, 90, 97]
[86, 80, 78, 87, 96]
[85, 84]
[88, 83, 79, 75, 86]
[98, 92, 95]
[94, 90, 92, 90, 91]
[91, 73]
[80, 91, 89, 78, 70]
[87, 75, 98, 89]
[80, 85, 82, 87, 92]
[83, 92]
[84, 87, 94, 91, 92]
[80, 82]
[87, 84, 90, 88, 84]
[93]
[87, 82, 91]
[91]
[83, 80, 80, 83, 86]
[93, 94, 96]
[92, 97, 97, 95, 97]
[76]
[81, 80, 90, 74, 72]
[73, 82, 94, 75]


[91, 95, 95]
[85, 93, 87, 90, 94]
[95, 94]
[81, 88, 90, 86, 85]
[99, 93, 96, 83]
[81, 80, 87, 83, 75]
[88, 83]
[73, 81, 80, 65, 71]
[92, 84, 82]
[83, 84, 83, 83, 88]
[94, 88, 80]
[92, 83, 91, 86]
[92, 86]
[88, 84, 85, 85, 90]
[94, 82, 85, 97, 92, 87, 94, 85]
[]
[90]
[89]
[94]
[81, 91, 96, 88, 90]
[91, 92]
[82, 90, 94, 91, 91]
[71, 90, 87, 81]
[80, 88, 95, 93, 90]
[84, 84, 90]
[80, 89, 87, 82, 80]
[94, 87]
[85, 88, 83, 84, 88]
[91]
[68, 70, 68, 78]
[70, 71]
[75, 75, 86, 70, 80]
[98, 97, 85]
[76, 93, 93, 75, 87]
[87, 80]
[90, 70, 75, 84, 72]
[85, 97]
[85, 99, 98, 86, 91]
[96]
[71, 68, 80, 60, 74]
[78]
[75, 96, 99, 90, 93]
[93]
[82, 98, 90, 90, 96]
[80, 95]
[77, 75, 80, 80, 80]
[94]
[75, 98, 96, 91, 96]
[82, 80, 85]
[70, 80, 86, 77, 80]
[92]
[78, 75, 72, 51, 63]
[96, 90, 97, 85, 80]
[84, 96, 91, 90, 95]
[66, 90]
[71, 94, 98, 87, 82]
[85, 88, 93, 94, 74]
[85, 95, 90, 72, 78]
[84]
[85, 90, 83, 83, 74]
[90, 97]
[86, 66, 73, 80, 64]
[90, 88]
[87, 78, 87, 90, 84]
[79]
[83, 71, 70, 65, 64]
[96,

[87, 92, 86, 80, 88]
[96, 92, 95]
[94, 99, 93, 96, 91]
[90, 71, 90, 80, 77, 80, 64]
[83, 87, 87, 65, 87]
[79, 83, 92]
[78, 85, 94, 85, 88]
[92]
[94, 97, 98, 96, 96]
[72]
[70, 67, 82, 75, 70]
[92, 80, 80, 95, 91]
[97, 86, 68, 73]
[78]
[85, 66, 90, 81, 77]
[80, 90]
[72, 70, 78, 90, 70]
[91, 91, 96]
[89, 92, 95, 88, 84]
[91, 77, 85]
[66, 80, 88, 81, 84]
[86, 78, 82]
[88, 82, 86, 78, 86]
[90]
[87, 80, 82, 77, 81]
[87, 86]
[92, 86, 87, 78, 78]
[87]
[78, 77, 85, 81, 78]
[76]
[84, 88, 95, 75, 90]
[78, 97, 80, 93]
[82, 82, 80, 76, 85]
[95, 92, 90, 88, 95, 90]
[92, 92, 96, 88, 84]
[93, 91, 97]
[86, 75, 87, 83, 86]
[90]
[78, 92, 92, 73, 82]
[97]
[79, 95, 93, 86, 91]
[93, 85]
[95, 95, 91, 71, 92]
[88, 88]
[75, 80, 88, 85, 88]
[92]
[74, 93, 88, 91, 93]
[99]
[80, 100, 93, 85, 90]
[93, 92]
[86, 90, 93, 70, 84]
[78]
[85, 91, 92, 85, 93]
[80, 94]
[78, 91, 94, 92, 92]
[87, 85, 90]
[81, 89, 90, 92, 85]
[65, 37, 87]
[86, 85, 91, 80, 85]
[96, 84, 83]
[80, 87, 88, 88]
[91]
[83, 94, 96, 99, 94]
[88, 84]
[90

[85, 94]
[80, 90, 90, 81, 90]
[90, 78]
[80, 77, 85, 78, 77]
[99]
[75, 97, 97, 85, 87]
[92, 93]
[85, 96, 90, 84, 96]
[77, 88, 91]
[84, 80, 80, 85, 87]
[75]
[70, 78, 87, 65, 83]
[61]
[70, 80, 81, 84, 73]
[45, 57]
[60, 38, 42]
[90]
[90, 93, 88, 79, 82]
[85, 88]
[75, 88, 93, 80, 87]
[94]
[77, 87, 91, 90, 82]
[67]
[75, 80, 81, 87, 71]
[88, 85]
[86, 90, 85, 88, 88]
[80]
[78, 87, 81, 77]
[95, 90]
[80, 84, 88, 92, 88]
[88, 84]
[94, 80, 60, 94]
[88, 90]
[93, 84, 90, 79, 92]
[92, 65]
[81, 93, 94, 91, 86]
[90, 87, 81]
[72, 83, 90, 80, 87]
[88, 87, 83]
[88, 90, 97, 81, 84]
[92, 92]
[91, 71, 75, 77, 88]
[81, 68]
[92, 82, 81, 76, 80]
[94, 93, 98, 90, 96, 95, 91]
[97, 95, 93, 94, 97]
[60, 90, 75]
[68, 80, 76, 75]
[71, 84, 93, 74]
[94, 91, 92, 91, 90]
[98, 88, 87]
[96, 90, 86, 65]
[96, 91, 90]
[80, 85, 75, 90, 90]
[95, 95, 95, 88]
[93, 90, 92, 89, 93]
[90, 81]
[80, 74, 80, 77]
[83, 85]
[80, 88, 85, 77, 85]
[91]
[80, 82, 78, 77, 92]
[80]
[83, 90, 93, 88, 88]
[91]
[71]
[75, 85]
[75, 50, 70, 95, 73]
[88]

### Some min/max/average output for both student types:

In [46]:
# Students that received and accepted a McMaster offer:
acceptedMcMaster = copy[mcmasterVector]
# print(acceptedMcMaster)
# print(acceptedMcMaster[(acceptedMcMaster["WAVERG2"] > 0) & (acceptedMcMaster["CONFPR"] == "SIA")].min())
print("Students who accepted an offer from McMaster:")
print("Mean values for each column:")
print(acceptedMcMaster[(acceptedMcMaster["AVG"] > 0) & (acceptedMcMaster["CONFPR"] == "SIA")].mean())
print("\nMax values for each column:")
print(acceptedMcMaster[(acceptedMcMaster["AVG"] > 0) & (acceptedMcMaster["CONFPR"] == "SIA")].max())
print("\nLowest 5 weighted average:")
print(acceptedMcMaster[(acceptedMcMaster["AVG"] > 0) & (acceptedMcMaster["CONFPR"] == "SI")].nsmallest(20,"WAVERG2"))
print("\nShape of acceptedMcMaster df (number of students that accepted):")
print(acceptedMcMaster.shape)

# Students that didn't accept a mcmaster offer:
print('\n\n\n\n')
notMcMaster = copy[notmcmasterVector]
# print(notMcMaster)
print("Min average of students who didn't accept mac offer")
print(notMcMaster[notMcMaster["AVG"] > 0].min())
print("\nMax average of students who didn't accept mac offer")
print(notMcMaster[notMcMaster["AVG"] > 0].max())
print("\nAverage average of students who didn't accept mac offer")
print(notMcMaster[notMcMaster["AVG"] > 0].mean())


Students who accepted an offer from McMaster:
Mean values for each column:
GEND           119.784044
SCHOOL       10701.856946
CONFUNI        196.000000
CONFCHOIC        2.486933
WAVERG2        907.513067
AVG             88.983494
PREF             2.486933
ACCEPTED         1.000000
dtype: float64

Max values for each column:
GEND             120
SCHOOL         11245
CONFUNI          196
CONFPR           SIA
CONFCHOIC         10
WAVERG2          990
AVG          98.3333
PREF              10
ACCEPTED           1
dtype: object

Lowest 5 weighted average:
      GEND  SCHOOL ZIP3  CONFUNI CONFPR  CONFCHOIC  WAVERG2        AVG  PREF  \
2785   120   10735  M2U    196.0     SI        2.0      850  85.000000     2   
2797   120   10781  M7N    196.0     SI        2.0      852  85.166667     2   
3878   120   11073  N3I    196.0     SI        1.0      858  85.833333     1   
1858   120   10799  L2T    196.0     SI        5.0      867  86.666667     5   
3526   120   11093  N0D    196.0     SI   

In [47]:
# print(acceptedMcMaster.shape)
# print(notMcMaster.shape)
print('McMaster students sample:')
print(acceptedMcMaster.head(10))
print('\n\n')
print('All Students sample:')
print(allStudents.head(20))
# print(notMcMaster.head(50))

McMaster students sample:
    GEND  SCHOOL ZIP3  CONFUNI CONFPR  CONFCHOIC  WAVERG2        AVG  PREF  \
4    120   11219  O2M    196.0    SIA        3.0      913  91.333333     3   
7    120   10329  M7Q    196.0    SIA        2.0      890  88.333333     2   
8    119   10329  M7Q    196.0     SN        3.0      845  83.666667     1   
18   119   10593  M1S    196.0    SIA        1.0      963  95.500000     1   
19   120   10587  N2X    196.0    SIA        5.0      927  91.666667     5   
38   120   11021  M0U    196.0    SIA        3.0      928  90.500000     3   
48   120   10327  N3O    196.0    SIA        3.0      932  93.166667     3   
51   120   10252  M6S    196.0    SIA        1.0      945  91.333333     1   
67   120   10779  M8B    196.0    SIA        3.0      942  91.333333     3   
69   119   10974  L3M    196.0    SIA        3.0      908  90.166667     3   

    ACCEPTED  
4          1  
7          1  
8          1  
18         1  
19         1  
38         1  
48        

## Exporting the dataframes to CSV

In [48]:
allStudents.to_csv('cleaned_data/allStudents_'+str(year)+'.csv',index=False)
acceptedMcMaster.to_csv('cleaned_data/acceptedOurUni_'+str(year)+'.csv',index=False)
notMcMaster.to_csv('cleaned_data/didntAccept_'+str(year)+'.csv',index=False)
print(year)

17
