In [75]:
import pandas as pd
import pyreadstat as pystat
import matplotlib.pyplot as plt
import seaborn as sns
import utility_fns as utils

# Importing the dataset (student context data for 8th-grade students in Ireland)


In [76]:
# Load the SPSS file into a DataFrame, then take an independent deep copy
# (deep=True is pandas' default for .copy()). The cells in view select from
# the copy, leaving the freshly loaded frame as-is.
org_data = pd.read_spss(
    "./data/orig/SPSS/bsgirlm8.sav"
)
org_data_copy = org_data.copy(deep=True)

# Construction of Dataset

## Setting up a new, focused dataset

### Adding basic contextual variables (gender, language, books at home, computers at home, shared computers, background of parents, etc.) - categorical



In [77]:

# Source variable code -> readable column name for the contextual columns.
# Insertion order of this mapping is also the column order of the selection.
rename_dict = dict(
    BSBG01="Gender_Student",
    BSBG03="Lang_Spoken_Home",
    BSBG04="Books_Home",
    BSBG05A="Own_Computer",
    BSBG05B="Shared_Computer",
    BSBG05C="Smartphone",
    BSBG05D="Internet_Access",
    BSBG05E="Study_Desk",
    BSBG05F="Own_Room",
    BSBG06A="ParentA_Edu_Level",
    BSBG06B="ParentB_Edu_Level",
    BSBG07="Education_Expectation",
    BSBG08A="ParentA_Born_Country",
    BSBG08B="ParentB_Born_Country",
)

# The codes to pull from the raw copy are exactly the mapping's keys.
columns = list(rename_dict)

# Select the contextual columns and give them their readable names in one step.
df = org_data_copy[columns].rename(columns=rename_dict)
df.head()

Unnamed: 0,Gender_Student,Lang_Spoken_Home,Books_Home,Own_Computer,Shared_Computer,Smartphone,Internet_Access,Study_Desk,Own_Room,ParentA_Edu_Level,ParentB_Edu_Level,Education_Expectation,ParentA_Born_Country,ParentB_Born_Country
0,Girl,Sometimes,Enough to fill one bookcase (26–100 books),Yes,Yes,Yes,Yes,Yes,Yes,I don’t know,I don’t know,Finish <Postgraduate degree: Master’s—ISCED Le...,No,No
1,Girl,Sometimes,Enough to fill one bookcase (26–100 books),Yes,Yes,Yes,Yes,Yes,Yes,I don’t know,I don’t know,Finish <Postgraduate degree: Master’s—ISCED Le...,No,No
2,Girl,Almost always,Enough to fill one bookcase (26–100 books),Yes,No,Yes,Yes,Yes,Yes,I don’t know,I don’t know,Finish <Short-cycle tertiary education—ISCED L...,No,No
3,Girl,Always,None or very few (0–10 books),Yes,Yes,Yes,Yes,Yes,Yes,<Lower secondary education—ISCED Level 2>,<Lower secondary education—ISCED Level 2>,Finish <Postgraduate degree: Master’s—ISCED Le...,Yes,Yes
4,<Other>,Almost always,Enough to fill one bookcase (26–100 books),Yes,Yes,Yes,Yes,Yes,Yes,<Upper secondary education—ISCED Level 3>,<Short-cycle tertiary education—ISCED Level 5>,Finish <Postgraduate degree: Master’s—ISCED Le...,No,No


### Adding Scale variables of Plausible Values for each subject

In [78]:
# Plausible-value (PV) score columns: five PVs for each of five subject scales.
# Source codes follow the pattern BSS<subject code><two-digit PV number>.
# Subject labels are kept exactly as the notebook already spells them
# (including the space in "EARTH SCIENCE") so existing column names do not move.
pv_subjects = {
    "SCI": "SCIENCE",
    "BIO": "BIOLOGY",
    "CHE": "CHEMISTRY",
    "PHY": "PHYSICS",
    "EAR": "EARTH SCIENCE",
}
pv_ordinals = ["1ST", "2ND", "3RD", "4TH", "5TH"]

# Generate the rename mapping from the pattern instead of typing it by hand so
# every label has the same "<ordinal>_PV_<subject>" form. The hand-written
# mapping labelled BSSPHY02 as "2ND_PV PHYSICS" (space instead of underscore),
# the only PV column that broke the pattern; it is now "2ND_PV_PHYSICS".
scale_var_dict = {
    f"BSS{code}{number:02d}": f"{ordinal}_PV_{subject}"
    for code, subject in pv_subjects.items()
    for number, ordinal in enumerate(pv_ordinals, start=1)
}

# Column codes to select, in the same order as the mapping
# (all SCIENCE PVs, then BIOLOGY, CHEMISTRY, PHYSICS, EARTH SCIENCE).
scale_vars = list(scale_var_dict)

# Pull the PV columns from the raw copy, rename them, and append them
# column-wise to the focused frame.
scale_variables = org_data_copy[scale_vars].rename(columns=scale_var_dict)
df = pd.concat([df, scale_variables], axis=1)


### Adding remaining categorical variables (behavioural attributes, including disorderly behaviour, bullying, liking for subjects, confidence, etc.)

In [79]:
# Source variable code -> readable name for the remaining attribute columns
# (home resources, school belonging, bullying, and per-subject liking,
# instructional clarity, confidence, value and disorderly-behaviour entries).
behav_attr = dict(
    BSBGHER="Home_Educational_Resources",
    BSBGSSB="Students_Sense_of_School_Belonging",
    BSBGSB="Student_Bullying",
    BSBGSLM="Students_Like_Learning_Mathematics",
    BSBGICM="Instructional_Clarity_in_Mathematics_Lessons",
    BSBGDML="Disorderly_Behavior_during_Math_Lessons",
    BSBGSCM="Students_Confident_in_Mathematics",
    BSBGSVM="Students_Value_Mathematics",
    BSBGSLS="Students_Like_Learning_Science",
    BSBGICS="Instructional_Clarity_in_Science_Lessons",
    BSBGSCS="Students_Confident_in_Science",
    BSBGSVS="Students_Value_Science",
    BSBGSLB="Students_Like_Learning_Biology",
    BSBGICB="Instructional_Clarity_in_Biology_Lessons",
    BSBGSCB="Students_Confident_in_Biology",
    BSBGSLE="Students_Like_Learning_Earth_Science",
    BSBGICE="Instructional_Clarity_in_Earth_Science_Lessons",
    BSBGSCE="Students_Confident_in_Earth_Science",
    BSBGSLC="Students_Like_Learning_Chemistry",
    BSBGICC="Instructional_Clarity_in_Chemistry_Lessons",
    BSBGSCC="Students_Confident_in_Chemistry",
    BSBGSLP="Students_Like_Learning_Physics",
    BSBGICP="Instructional_Clarity_in_Physics_Lessons",
    BSBGSCP="Students_Confident_in_Physics",
    BSBGSEC="Digital_Self_Efficacy",
    BSBGDSL="Disorderly_Behavior_during_Science_Lessons",
    BSBGDBL="Disorderly_Behavior_during_Biology_Lessons",
    BSBGDEL="Disorderly_Behavior_during_Earth_Science_Lessons",
    BSBGDCL="Disorderly_Behavior_during_Chemistry_Lessons",
    BSBGDPL="Disorderly_Behavior_during_Physics_Lessons",
    BSBGVEP="Students_Value_Environmental_Preservation",
)

# The codes to select are the mapping's keys, in insertion order.
columns = list(behav_attr)

# Select once from the raw copy, rename, and append column-wise to df.
behavioral_attributes = org_data_copy[columns].rename(columns=behav_attr)
df = pd.concat([df, behavioral_attributes], axis="columns")


## Removing Empty Attributes
Some empty attributes are present in the dataframe; these need to be dropped.



In [84]:
# Discard every column whose values are all missing, then print a summary
# of the remaining columns.
df = df.dropna(axis="columns", how="all")
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5090 entries, 0 to 5089
Data columns (total 54 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   Gender_Student                                5043 non-null   category
 1   Lang_Spoken_Home                              5033 non-null   category
 2   Books_Home                                    5034 non-null   category
 3   Own_Computer                                  5011 non-null   category
 4   Shared_Computer                               4960 non-null   category
 5   Smartphone                                    5025 non-null   category
 6   Internet_Access                               5025 non-null   category
 7   Study_Desk                                    5027 non-null   category
 8   Own_Room                                      5017 non-null   category
 9   ParentA_Edu_Level                             4836 n

### Managing null rows
Since the scale attributes have no null rows, only the null values in the categorical variables need to be taken care of. In this analysis, rows containing null values are dropped for an efficient EDA.

In [88]:
# Keep only the rows that have no missing value in any column. The index is
# not reset, so the surviving rows keep their original labels.
df = df.dropna(axis="index", how="any")
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3875 entries, 0 to 5089
Data columns (total 54 columns):
 #   Column                                        Non-Null Count  Dtype   
---  ------                                        --------------  -----   
 0   Gender_Student                                3875 non-null   category
 1   Lang_Spoken_Home                              3875 non-null   category
 2   Books_Home                                    3875 non-null   category
 3   Own_Computer                                  3875 non-null   category
 4   Shared_Computer                               3875 non-null   category
 5   Smartphone                                    3875 non-null   category
 6   Internet_Access                               3875 non-null   category
 7   Study_Desk                                    3875 non-null   category
 8   Own_Room                                      3875 non-null   category
 9   ParentA_Edu_Level                             3875 non-nu