# Exam Prediction Using Linear Regression
![title](test.jpg)

In [48]:
from sklearn.linear_model import LinearRegression
import statsmodels.api as sm
from scipy import stats

#Sklearn Processing Packages:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, accuracy_score, log_loss
from sklearn import datasets, linear_model, preprocessing
from sklearn.preprocessing import LabelEncoder

#Data Loading and Processing Packages:
import pandas as pd
import numpy as np

#Data Visualization Packages:
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import cm as cm

## Importing and Preparing Data

In [40]:
# Read the semicolon-delimited Math-course student records, then preview the
# first three rows.
INIT_students = pd.read_csv("student-mat.csv", sep=";")
INIT_students.head(3)

Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,...,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,GP,F,18,U,GT3,A,4,4,at_home,teacher,...,4,3,4,1,1,3,6,5,6,6
1,GP,F,17,U,GT3,T,1,1,at_home,other,...,5,3,3,1,1,3,4,5,5,6
2,GP,F,15,U,LE3,T,1,1,at_home,other,...,4,3,2,2,3,3,10,7,8,10


### Attributes for student-mat.csv (Math course)
- 1 school - student's school (binary: 'GP' - Gabriel Pereira or 'MS' - Mousinho da Silveira) 
- 2 sex - student's sex (binary: 'F' - female or 'M' - male) 
- 3 age - student's age (numeric: from 15 to 22) 
- 4 address - student's home address type (binary: 'U' - urban or 'R' - rural) 
- 5 famsize - family size (binary: 'LE3' - less or equal to 3 or 'GT3' - greater than 3) 
- 6 Pstatus - parent's cohabitation status (binary: 'T' - living together or 'A' - apart) 
- 7 Medu - mother's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) 
- 8 Fedu - father's education (numeric: 0 - none, 1 - primary education (4th grade), 2 – 5th to 9th grade, 3 – secondary education or 4 – higher education) 
- 9 Mjob - mother's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') 
- 10 Fjob - father's job (nominal: 'teacher', 'health' care related, civil 'services' (e.g. administrative or police), 'at_home' or 'other') 
- 11 reason - reason to choose this school (nominal: close to 'home', school 'reputation', 'course' preference or 'other') 
- 12 guardian - student's guardian (nominal: 'mother', 'father' or 'other') 
- 13 traveltime - home to school travel time (numeric: 1 - <15 min., 2 - 15 to 30 min., 3 - 30 min. to 1 hour, or 4 - >1 hour) 
- 14 studytime - weekly study time (numeric: 1 - <2 hours, 2 - 2 to 5 hours, 3 - 5 to 10 hours, or 4 - >10 hours) 
- 15 failures - number of past class failures (numeric: n if 1<=n<3, else 4) 
- 16 schoolsup - extra educational support (binary: yes or no) 
- 17 famsup - family educational support (binary: yes or no) 
- 18 paid - extra paid classes within the course subject (Math or Portuguese) (binary: yes or no) 
- 19 activities - extra-curricular activities (binary: yes or no) 
- 20 nursery - attended nursery school (binary: yes or no) 
- 21 higher - wants to take higher education (binary: yes or no) 
- 22 internet - Internet access at home (binary: yes or no) 
- 23 romantic - with a romantic relationship (binary: yes or no) 
- 24 famrel - quality of family relationships (numeric: from 1 - very bad to 5 - excellent) 
- 25 freetime - free time after school (numeric: from 1 - very low to 5 - very high) 
- 26 goout - going out with friends (numeric: from 1 - very low to 5 - very high) 
- 27 Dalc - workday alcohol consumption (numeric: from 1 - very low to 5 - very high) 
- 28 Walc - weekend alcohol consumption (numeric: from 1 - very low to 5 - very high) 
- 29 health - current health status (numeric: from 1 - very bad to 5 - very good) 
- 30 absences - number of school absences (numeric: from 0 to 93) 

### Converting Categorical Data Using Dummy Variables

In [41]:
# Attributes left out of the model (including the G1 and G2 grade columns).
dropped_attributes = [
    'G2', 'G1', 'Fedu', 'school', 'famsize', 'Medu', 'Walc', 'Dalc', 'goout',
    'paid', 'activities', 'nursery', 'higher', 'internet', 'romantic',
]
df = INIT_students.drop(columns=dropped_attributes)

# One-hot encode the remaining categorical columns. dtype=int pins the
# indicators to 0/1 integers: pandas >= 2.0 returns bool dummies by default,
# and a frame mixing int and bool columns is converted to an object array,
# which statsmodels' OLS refuses to fit.
df = pd.get_dummies(df, dtype=int)

# Remove collinearity: drop one reference level per encoded categorical so the
# indicator columns of a variable are not perfectly linearly dependent.
reference_levels = [
    'sex_F', 'address_R', 'Mjob_other', 'Pstatus_T', 'Fjob_other',
    'reason_other', 'guardian_other', 'schoolsup_no', 'famsup_no',
]
df = df.drop(columns=reference_levels)
df.head()

Unnamed: 0,age,traveltime,studytime,failures,famrel,freetime,health,absences,G3,sex_M,...,Fjob_health,Fjob_services,Fjob_teacher,reason_course,reason_home,reason_reputation,guardian_father,guardian_mother,schoolsup_yes,famsup_yes
0,18,2,2,0,4,3,3,6,6,0,...,0,0,1,1,0,0,0,1,1,0
1,17,1,2,0,5,3,3,4,6,0,...,0,0,0,1,0,0,1,0,0,1
2,15,1,2,3,4,3,3,10,10,0,...,0,0,0,0,0,0,0,1,1,0
3,15,1,3,0,3,2,5,2,15,0,...,0,1,0,0,1,0,0,1,0,1
4,16,1,2,0,4,3,5,4,10,0,...,0,0,0,0,1,0,1,0,0,1


## Test and Train Data

In [46]:
# STEP 1: separate the G3 target from the predictors, then hold out 10% of the
# rows as a test set (fixed random_state so the split is reproducible).
y = df['G3']
X = df.drop(columns=['G3'])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.1, random_state=4
)

# Report the shape of each partition: training pair first, then test pair.
for partition in (X_train, y_train, X_test, y_test):
    print(partition.shape)

(355, 26)
(355,)
(40, 26)
(40,)


In [47]:
# Fit an ordinary least squares model of G3 on the training predictors and
# print the regression summary. `sm` (statsmodels.api) comes from the import
# cell at the top of the notebook.
#
# NOTE(review): no constant column is added to X_train, so the model is fitted
# without an intercept. statsmodels then reports the *uncentered* R-squared,
# which is not comparable to the usual centered R-squared. Confirm that a
# regression through the origin is intended; if not, fit on
# sm.add_constant(X_train) and add the same constant to X_test before
# predicting.
est = sm.OLS(y_train, X_train)
est2 = est.fit()
print(est2.summary())

                            OLS Regression Results                            
Dep. Variable:                     G3   R-squared:                       0.868
Model:                            OLS   Adj. R-squared:                  0.857
Method:                 Least Squares   F-statistic:                     82.99
Date:                Sat, 05 Jan 2019   Prob (F-statistic):          7.49e-128
Time:                        16:43:21   Log-Likelihood:                -1010.8
No. Observations:                 355   AIC:                             2074.
Df Residuals:                     329   BIC:                             2174.
Df Model:                          26                                         
Covariance Type:            nonrobust                                         
                        coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------------
age                   0.2583      0.10

___________________________

___________________________