# PISA2012 DATA ANALYSIS
## by Steve

## Preliminary Wrangling

> .

In [1]:
# import all packages and set plots to be embedded inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

In [30]:
# read PISA data in chunks
lst = []

for chunk in pd.read_csv('data/pisa2012.csv', encoding='ISO-8859-1', index_col=0, low_memory=False, chunksize=50000):
    lst.append(chunk)
    
df = pd.concat(lst, axis=0)

# clearing memory occupied by lst
del lst

# check
df.head()

Unnamed: 0,CNT,SUBNATIO,STRATUM,OECD,NC,SCHOOLID,STIDSTD,ST01Q01,ST02Q01,ST03Q01,...,W_FSTR75,W_FSTR76,W_FSTR77,W_FSTR78,W_FSTR79,W_FSTR80,WVARSTRR,VAR_UNIT,SENWGT_STU,VER_STU
1,Albania,80000,ALB0006,Non-OECD,Albania,1,1,10,1.0,2,...,13.7954,13.9235,13.1249,13.1249,4.3389,13.0829,19,1,0.2098,22NOV13
2,Albania,80000,ALB0006,Non-OECD,Albania,1,2,10,1.0,2,...,13.7954,13.9235,13.1249,13.1249,4.3389,13.0829,19,1,0.2098,22NOV13
3,Albania,80000,ALB0006,Non-OECD,Albania,1,3,9,1.0,9,...,12.7307,12.7307,12.7307,12.7307,4.2436,12.7307,19,1,0.1999,22NOV13
4,Albania,80000,ALB0006,Non-OECD,Albania,1,4,9,1.0,8,...,12.7307,12.7307,12.7307,12.7307,4.2436,12.7307,19,1,0.1999,22NOV13
5,Albania,80000,ALB0006,Non-OECD,Albania,1,5,9,1.0,10,...,12.7307,12.7307,12.7307,12.7307,4.2436,12.7307,19,1,0.1999,22NOV13


In [32]:
df.shape

(485490, 635)

### Testing

In [23]:
# This is for test. Loading only 10 thousand rows from whole data set
df =  pd.read_csv('data/pisa2012.csv', encoding='ISO-8859-1', index_col=0, nrows=10000)

#check
df.head()

Unnamed: 0,CNT,SUBNATIO,STRATUM,OECD,NC,SCHOOLID,STIDSTD,ST01Q01,ST02Q01,ST03Q01,...,W_FSTR75,W_FSTR76,W_FSTR77,W_FSTR78,W_FSTR79,W_FSTR80,WVARSTRR,VAR_UNIT,SENWGT_STU,VER_STU
1,Albania,80000,ALB0006,Non-OECD,Albania,1,1,10,1,2,...,13.7954,13.9235,13.1249,13.1249,4.3389,13.0829,19,1,0.2098,22NOV13
2,Albania,80000,ALB0006,Non-OECD,Albania,1,2,10,1,2,...,13.7954,13.9235,13.1249,13.1249,4.3389,13.0829,19,1,0.2098,22NOV13
3,Albania,80000,ALB0006,Non-OECD,Albania,1,3,9,1,9,...,12.7307,12.7307,12.7307,12.7307,4.2436,12.7307,19,1,0.1999,22NOV13
4,Albania,80000,ALB0006,Non-OECD,Albania,1,4,9,1,8,...,12.7307,12.7307,12.7307,12.7307,4.2436,12.7307,19,1,0.1999,22NOV13
5,Albania,80000,ALB0006,Non-OECD,Albania,1,5,9,1,10,...,12.7307,12.7307,12.7307,12.7307,4.2436,12.7307,19,1,0.1999,22NOV13


In [20]:
# read pisa - dictionaries (column descriptions)
df_colname = pd.read_csv('data/pisadict2012.csv', encoding='ISO-8859-1')

In [21]:
for i in df_colname.index:
    print('{}-{} : {}'.format(i,df_colname['Unnamed: 0'].iloc[i], df_colname['x'].iloc[i]))

0-CNT : Country code 3-character
1-SUBNATIO : Adjudicated sub-region code 7-digit code (3-digit country code + region ID + stratum ID)
2-STRATUM : Stratum ID 7-character (cnt + region ID + original stratum ID)
3-OECD : OECD country
4-NC : National Centre 6-digit Code
5-SCHOOLID : School ID 7-digit (region ID + stratum ID + 3-digit school ID)
6-STIDSTD : Student ID
7-ST01Q01 : International Grade
8-ST02Q01 : National Study Programme
9-ST03Q01 : Birth - Month
10-ST03Q02 : Birth -Year
11-ST04Q01 : Gender
12-ST05Q01 : Attend <ISCED 0>
13-ST06Q01 : Age at <ISCED 1>
14-ST07Q01 : Repeat - <ISCED 1>
15-ST07Q02 : Repeat - <ISCED 2>
16-ST07Q03 : Repeat - <ISCED 3>
17-ST08Q01 : Truancy - Late for School
18-ST09Q01 : Truancy - Skip whole school day
19-ST115Q01 : Truancy - Skip classes within school day
20-ST11Q01 : At Home - Mother
21-ST11Q02 : At Home - Father
22-ST11Q03 : At Home - Brothers
23-ST11Q04 : At Home - Sisters
24-ST11Q05 : At Home - Grandparents
25-ST11Q06 : At Home - Others
26-ST13Q0

In [27]:
# 
df.iloc[:5,0:13]

Unnamed: 0,CNT,SUBNATIO,STRATUM,OECD,NC,SCHOOLID,STIDSTD,ST01Q01,ST02Q01,ST03Q01,ST03Q02,ST04Q01,ST05Q01
1,Albania,80000,ALB0006,Non-OECD,Albania,1,1,10,1,2,1996,Female,No
2,Albania,80000,ALB0006,Non-OECD,Albania,1,2,10,1,2,1996,Female,"Yes, for more than one year"
3,Albania,80000,ALB0006,Non-OECD,Albania,1,3,9,1,9,1996,Female,"Yes, for more than one year"
4,Albania,80000,ALB0006,Non-OECD,Albania,1,4,9,1,8,1996,Female,"Yes, for more than one year"
5,Albania,80000,ALB0006,Non-OECD,Albania,1,5,9,1,10,1996,Female,"Yes, for more than one year"


In [25]:
df.iloc[:,1].value_counts()

80000      4743
7840200    2450
7840100    1572
7840000    1235
Name: SUBNATIO, dtype: int64

In [None]:
# columns to drop 
# 8 - National Study Program [1 to 4]
col_drop = [4, 12, 13, 14, 15, 16, ]

In [27]:
# This is for test. Reading selective columns
df =  pd.read_csv('data/pisa2012.csv', encoding='ISO-8859-1', index_col=0, low_memory=False, usecols=[0, 1, 4, 8, 11, 12], nrows=10_000)

#check
df.head()

Unnamed: 0,CNT,OECD,ST01Q01,ST03Q02,ST04Q01
1,Albania,Non-OECD,10,1996,Female
2,Albania,Non-OECD,10,1996,Female
3,Albania,Non-OECD,9,1996,Female
4,Albania,Non-OECD,9,1996,Female
5,Albania,Non-OECD,9,1996,Female


In [20]:
del df

In [18]:
# number of total students participated
df.shape

(485490, 7)

In [10]:
# number of students participated in each country
df.groupby('CNT').count()

Unnamed: 0_level_0,SUBNATIO,STRATUM
CNT,Unnamed: 1_level_1,Unnamed: 2_level_1
Albania,4743,4743
Argentina,5908,5908
Australia,14481,14481
Austria,4755,4755
Belgium,8597,8597
Brazil,19204,19204
Bulgaria,5282,5282
Canada,21544,21544
Chile,6856,6856
China-Shanghai,5177,5177


In [3]:
# ggplot2 example
'''
install.packages('ggplot2')
library(ggplot2)

df = read.csv(file.choose()) #select your dataset 
df2 = head(df, 30)


qplot(df2$Math.SAT, df2$Verbal.SAT, xlab = 'Math SAT Score', 
      ylab = 'Verbal SAT Score', main = 'Average SAT Scores By College')

qplot(df2$Math.SAT, df2$Verbal.SAT, xlab = 'Math SAT Score', 
      ylab = 'Verbal SAT Score', main = 'Average SAT Scores By College', 
      color = as.factor(df2$Public..1...Private..2.))

qplot(df2$Math.SAT, df2$Verbal.SAT, xlab = 'Math SAT Score',
      ylab = 'Verbal SAT Score', main = 'Average SAT Scores By College',
      shape = as.factor(df2$Public..1...Private..2.), color = df2$stud..fac..ratio)

ggplot(df2, aes(x=Math.SAT, y=Verbal.SAT, group=stud..fac..ratio)) +
  geom_point(aes(shape=stud..fac..ratio, color=as.factor(df2$Public..1...Private..2.))
'''

"\ninstall.packages('ggplot2')\nlibrary(ggplot2)\n\ndf = read.csv(file.choose()) #select your dataset \ndf2 = head(df, 30)\n\n\nqplot(df2$Math.SAT, df2$Verbal.SAT, xlab = 'Math SAT Score', \n      ylab = 'Verbal SAT Score', main = 'Average SAT Scores By College')\n\nqplot(df2$Math.SAT, df2$Verbal.SAT, xlab = 'Math SAT Score', \n      ylab = 'Verbal SAT Score', main = 'Average SAT Scores By College', \n      color = as.factor(df2$Public..1...Private..2.))\n\nqplot(df2$Math.SAT, df2$Verbal.SAT, xlab = 'Math SAT Score',\n      ylab = 'Verbal SAT Score', main = 'Average SAT Scores By College',\n      shape = as.factor(df2$Public..1...Private..2.), color = df2$stud..fac..ratio)\n\nggplot(df2, aes(x=Math.SAT, y=Verbal.SAT, group=stud..fac..ratio)) +\n  geom_point(aes(shape=stud..fac..ratio, color=as.factor(df2$Public..1...Private..2.))\n"

### Things to Consider
1. Color-blind friendly: Use blue-orange palette over red-green
2. Features for additional infor: different encodings
 - color & shape for categorical variables
 - marker size for quantitative variables

### Sample Questions
- How does the choice of school play into academic performance?
- Are there differences in achievement based on gender, location, or student attitudes?
- Are there differences in achievement based on teacher practices and attitudes?
- Does there exist inequality in academic achievement?

### Reference

1. PISA Contest: http://mi2.mini.pw.edu.pl:8080/SmarterPoland/PISAcontest/
2. PISA Survey Design: http://www.oecd.org/pisa/data/pisa2012technicalreport.htm