## Reading the datasets and creating the sets

In [9]:
import pandas as pd
from sklearn.model_selection import train_test_split
exam_data = pd.read_csv("StudentsPerformance.csv")




750 250
     gender race/ethnicity      ...      reading score writing score
894  female        group E      ...                 62            69
941  female        group D      ...                 91            96
285    male        group B      ...                 82            82
462  female        group E      ...                 70            76
370    male        group E      ...                 77            71

[5 rows x 8 columns]
     gender race/ethnicity      ...      reading score writing score
131    male        group C      ...                 37            40
203  female        group B      ...                 69            68
50     male        group E      ...                 55            48
585  female        group C      ...                 76            76
138  female        group C      ...                 66            67

[5 rows x 8 columns]


## Cleaning the datasets 

In [19]:
print(exam_data["parental level of education"].unique())

# Function that would map the string value of parental education to interger
def refine_ParentalEdu(value):
    count =0
    if(value == "master's degree"):
        count = 6
    elif (value == "bachelor's degree"):
        count = 5
    elif(value == "associate's degree"):
        count =4
    elif(value == "some college"):
        count =3
    elif(value == "high school"):
        count =2
    else:
        count =1
    return count


def gender(val):
    if(val=="female"):
        return 0
    else:
        return 1

#Mapping the integer values to string values of Parent education for the linear Regression
parental_int = exam_data["parental level of education"]
values = [refine_ParentalEdu(x) for x in parental_int]
exam_data["parent_level"] =values

#Mapping the integer values to string values of Gender for the linear Regression
gender_int = exam_data["gender"]
value = [gender(x) for x in gender_int]
exam_data["gender_int"] =value

["bachelor's degree" 'some college' "master's degree" "associate's degree"
 'high school' 'some high school']


## Splitting the datasets to train and test model

In [20]:
train_set, test_set = train_test_split(exam_data, test_size=0.25, random_state=123)
print(len(train_set), len(test_set))
print(train_set.head())
print(test_set.head())
working_set1 = train_set.copy()

750 250
     gender race/ethnicity    ...     parent_level gender_int
894  female        group E    ...                4          0
941  female        group D    ...                6          0
285    male        group B    ...                4          1
462  female        group E    ...                3          0
370    male        group E    ...                3          1

[5 rows x 10 columns]
     gender race/ethnicity    ...     parent_level gender_int
131    male        group C    ...                1          1
203  female        group B    ...                4          0
50     male        group E    ...                3          1
585  female        group C    ...                4          0
138  female        group C    ...                4          0

[5 rows x 10 columns]


## Feature for X and Y
Looking at the datasets, I would like to see how does Parental Education and Math Score is related. Well, I think there is something realted between the Parental Education level and student performance in Math exam or maybe not. Parents education could impact the children score. 

In [21]:
#Performing Linear Regression
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
X = working_set1[["parent_level"]]
Y = working_set1["math score"]
print(reg.fit(X, Y))
print(reg.score(X,Y))

from sklearn.metrics import mean_squared_error
y_pred = reg.predict(X)
mse = mean_squared_error(Y, y_pred)
print("mean squared error is ",  mse)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
0.01570145862054706
mean squared error is  222.4838848338551


### Linear Regression Analysis:
This looks like only 1.5% of the parent education level could predict the correct Math score. Looking at the mean square error, it is 222.5ish which doesn't look good. Let's see if we can do better. 

In [31]:
X1=working_set1[["parent_level","gender_int","writing score"]]
Y1 = working_set1["math score"]
print(reg.fit(X1, Y1))
print(reg.score(X1,Y1))
from sklearn.metrics import mean_squared_error
y_pred1 = reg.predict(X1)
mse1 = mean_squared_error(Y1, y_pred1)
print("mean squared error is ",  mse1)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
0.8261985760520337
mean squared error is  39.284845363488756


### Analysis:
It looks like gender, writing score, and parental level of education plays a great role in determining the math score. The R^2 value is 82% from 1.5% which is amazing. Also the mean squared error is 39.28 which it less.

## Let's see the test sets

In [36]:
working_set2 = test_set.copy()
X2 = working_set2[["parent_level","gender_int","writing score"]]
Y2 = working_set2["math score"]
print(reg.fit(X2, Y2))
print(reg.score(X2,Y2))
from sklearn.metrics import mean_squared_error
y_pred2 = reg.predict(X2)
mse2 = mean_squared_error(Y2, y_pred2)
print("mean squared error is ",  mse2)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None,
         normalize=False)
0.8463670333428962
mean squared error is  36.9079997670408


## Analysis for Test Set:
The test set looks good with 84.7% which is 2.5% more than the training set. Also the mean square error is 36.9 which is 3 unit less than the training set. 

## Things that I have learned:
1. Computing the linear regression should need both numeric value and all the string values should be mapped to numeric values.
2. I thought that parental level of education could impact the child's math score. The results shows that it won't affect a lot. Looks like only 1.5% were affected. But when the gender and writing score is taken in consideration, it looks like the math score is affected by 82%, which is amazing and the mean square error is decreased to 39 from 222.
3. The test set shows good result. It has 84.7% which is 2.5% more than the training set. Also the mean square error is 36.9 which is 3 unit less than the training set.