# Linear Regression

The linear regression model is given by 

$$y_i = \alpha + \beta x_i + \epsilon_i$$

which describes a straight line through the data with intercept $\alpha$ and slope $\beta$, plus normally distributed noise $\epsilon_i$.

In [None]:
# Packages used throughout this notebook.
library(ggplot2)
library(ggfortify)
library(tidyr)
# Spell out FALSE: the shorthand F is an ordinary variable that can be reassigned.
library(dplyr, warn.conflicts = FALSE)

## Import Data

In [None]:
# Read the semicolon-delimited student data set and convert it to a tibble.
# as_tibble() is used instead of tbl_df(), which is deprecated in dplyr.
df <- read.csv('datasets/student-mat.csv', sep = ';') %>%
    as_tibble()

# Display the loaded table.
df

## Rename Columns

In [None]:
# Give every column a descriptive CamelCase name (new name = old name).
# rename() keeps each column in its original position, so the entries are
# grouped by topic here purely for readability.
df <- rename(
    df,
    # Student
    Sex = sex,
    Age = age,
    HealthStatus = health,
    RelationshipStatus = romantic,
    # Family and home
    HomeType = address,
    FamilySize = famsize,
    FamilyRelationship = famrel,
    ParentStatus = Pstatus,
    Guardian = guardian,
    EducationMother = Medu,
    EducationFather = Fedu,
    JobMother = Mjob,
    JobFather = Fjob,
    InternetAccess = internet,
    # School
    School = school,
    SchoolChoiceReason = reason,
    TravelTime = traveltime,
    StudyTime = studytime,
    ClassFailed = failures,
    SchoolAbsences = absences,
    AttendedNurserySchool = nursery,
    TargetsHigherEducation = higher,
    # Support and extra activities
    EducationalSchoolSupport = schoolsup,
    EducationalFamilySupport = famsup,
    ExtraPaidClass = paid,
    ExtraCurricularActivities = activities,
    # Free time
    LeisureTime = freetime,
    SocialInteractionIntensity = goout,
    AlcoholConsumptionWorkday = Dalc,
    AlcoholConsumptionWeekend = Walc,
    # Grades
    FirstPeriodGrade = G1,
    SecondPeriodGrade = G2,
    FinalGrade = G3
)

# Display the renamed table.
df

## Recode Values

In [None]:
# Lookup helpers that translate raw codes into readable labels.
# Each takes a vector and returns it with the listed values replaced.

# Education level codes 0-4 -> labels.
RecodeEducation <- function(x) {
    recode(x,
           `0` = 'None',
           `1` = 'Primary',
           `2` = 'PrimaryExtended',
           `3` = 'SecondaryExtended',
           `4` = 'Higher')
}

# Lower-case job categories -> capitalised labels.
RecodeJob <- function(x) {
    recode(x,
           teacher = 'Education',
           services = 'Services',
           at_home = 'Home',
           other = 'Other',
           health = 'Health')
}

# yes/no -> Yes/No.
RecodeBinary <- function(x) recode(x, yes = 'Yes', no = 'No')

# Five-point scale codes 1-5 -> VeryLow ... VeryHigh.
RecodeLikert <- function(x) {
    recode(x,
           `1` = 'VeryLow',
           `2` = 'Low',
           `3` = 'Medium',
           `4` = 'High',
           `5` = 'VeryHigh')
}

df <- df %>%
    # Columns that each have their own one-off code table.
    mutate(Sex = recode(Sex, F = 'Female', M = 'Male'),
           School = recode(School, GP = 'GabrielPereira', MS = 'MousinhoDaSilveira'),
           HomeType = recode(HomeType, U = 'Urban', R = 'Rural'),
           ParentStatus = recode(ParentStatus, T = 'Together', A = 'Apart'),
           Guardian = recode(Guardian, mother = 'Mother', father = 'Father', other = 'Other'),
           FamilySize = recode(FamilySize, GT3 = 'Large', LE3 = 'Small'),
           FamilyRelationship = recode(FamilyRelationship, `1` = 'VeryBad', `2` = 'Bad', `3` = 'Ok', `4` = 'Good', `5` = 'VeryGood'),
           SchoolChoiceReason = recode(SchoolChoiceReason, course = 'CoursePreference', other = 'Other', home = 'HomeProximity', reputation = 'Reputation'),
           TravelTime = recode(TravelTime, `1` = 'x < 15', `2` = '15 <= x < 30', `3` = '30 <= x < 60', `4` = 'x >= 60'),
           StudyTime = recode(StudyTime, `1` = 'x < 120', `2` = '120 <= x < 300', `3` = '300 <= x < 600', `4` = 'x >= 600')) %>%
    # Columns that share a code table go through the matching helper.
    # The education and job columns are recoded only here, exactly once.
    mutate_at(vars(EducationMother, EducationFather), .funs = RecodeEducation) %>%
    mutate_at(vars(JobMother, JobFather), .funs = RecodeJob) %>%
    mutate_at(vars(EducationalFamilySupport,
                   EducationalSchoolSupport,
                   ExtraCurricularActivities,
                   ExtraPaidClass,
                   InternetAccess,
                   AttendedNurserySchool,
                   TargetsHigherEducation,
                   RelationshipStatus),
              .funs = RecodeBinary) %>%
    mutate_at(vars(LeisureTime,
                   SocialInteractionIntensity,
                   AlcoholConsumptionWeekend,
                   AlcoholConsumptionWorkday,
                   HealthStatus),
              .funs = RecodeLikert)

# Display the recoded table.
df

In [None]:
# Print a per-column summary of the recoded data frame.
summary(df)

In [None]:
# Reshape to long format: the three grade columns are stacked into a single
# Grade column, with GradeName recording which column each value came from.
vf <- gather(
    df,
    key = GradeName,
    value = Grade,
    FirstPeriodGrade,
    SecondPeriodGrade,
    FinalGrade
)


In [None]:
# Stacked histogram of all grades, filled by grading period.
ggplot(vf, aes(x = Grade, fill = GradeName)) +
    geom_histogram(
        position = 'stack',
        bins = 10,
        alpha = .5,
        color = 'black'
    )


In [None]:
# Bar chart of grade counts with the grading periods drawn side by side.
ggplot(vf, aes(x = Grade, fill = GradeName)) +
    geom_bar(
        position = 'dodge',
        width = .8,
        alpha = .5,
        color = 'black'
    )

In [None]:
# Pitfall: inside aes(), 'green' is treated as a data value to be mapped
# through the fill scale, not as a literal colour, so this does not work
# as intended.
ggplot(vf, aes(x = Grade, fill = 'green')) +
    geom_histogram(
        position = 'stack',
        bins = 10,
        alpha = .5,
        color = 'black'
    )

# Fix: set the fill as a constant aesthetic on the geom instead of mapping it.
ggplot(vf, aes(x = Grade)) +
    geom_histogram(
        fill = 'green',
        position = 'stack',
        bins = 10,
        alpha = .5,
        color = 'black'
    )


In [None]:
# First attempt: overlay a normal curve on the histogram.
# The histogram's y axis shows counts while dnorm returns densities, so the
# curve ends up on a different scale than the bars.
ggplot(vf, aes(x = Grade)) +
    geom_histogram(
        position = 'stack',
        bins = 10,
        alpha = .5,
        color = 'black',
        fill = 'green'
    ) +
    stat_function(
        fun = dnorm,
        args = list(mean = mean(vf$Grade), sd = sd(vf$Grade))
    )

# ggplot exposes variables computed by the stat, written as ..name.., which
# can be mapped like ordinary data frame columns (see the documentation of
# each stat for the variables it provides).
# Mapping y to ..density.. puts the histogram on the same scale as the curve.
ggplot(vf) +
    geom_histogram(
        aes(x = Grade, y = ..density..),
        position = 'stack',
        bins = 10,
        alpha = .5,
        color = 'black',
        fill = 'green'
    ) +
    stat_function(
        fun = dnorm,
        args = list(mean = mean(vf$Grade), sd = sd(vf$Grade)),
        color = 'red'
    )


In [None]:
# Scatter plot of grade against school absences, coloured by grading period.
ggplot(vf, aes(x = SchoolAbsences, y = Grade, color = GradeName)) +
    geom_point(shape = 18)
