In [None]:
# Importing Libraries
import pandas as pd
import numpy as np
import sklearn
from sklearn import linear_model
from sklearn.preprocessing import MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns
# Standard ML Models for comparison
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
# Train/test splitting
from sklearn.model_selection import train_test_split
# Metrics
from sklearn.metrics import mean_squared_error, mean_absolute_error

In [None]:
# Read the semicolon-delimited student-mat.csv file into a DataFrame.
# NOTE(review): this is an absolute, machine-specific path; point it at your
# own copy of the file before running the notebook elsewhere.
students = pd.read_csv(
    'C:/Users/HP1/Downloads/student/student-mat.csv',
    sep=';',
)

In [None]:
# Structural summary of the frame: column names and dtypes,
# non-null counts per column, and the memory footprint.
students.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max)
# for the numeric columns of the dataset
students.describe()

In [None]:
# Per-column flag: True when the column contains at least one missing value
students.isna().any()

In [None]:
# Exploratory Data Analysis
# Number of students attending each school (absolute counts, not percentages)
students['school'].value_counts()

In [None]:
# Head-count of students by sex, using the 'F' / 'M' codes in the 'sex' column
female_students = int((students['sex'] == 'F').sum())
print(" No of female students",female_students)
male_students = int((students['sex'] == 'M').sum())
print(" No of male students",male_students)

In [None]:
# Head-count of students by home address type: 'U' (urban) vs 'R' (rural)
urban_stud = int(students['address'].eq('U').sum())
print('Number of Urban students:',urban_stud)
rural_stud = int(students['address'].eq('R').sum())
print('Number of Rural students:',rural_stud)

In [None]:
# Age of students: kernel density estimate (KDE) of the 'age' column
plot = sns.kdeplot(students['age'])
plot.axes.set_title('Ages of students')
plot.set_xlabel('Age')
# A KDE curve shows estimated density, not a number of students,
# so the y-axis is labelled accordingly.
plot.set_ylabel('Density')
plt.show()
# NOTE(review): the observation originally recorded here ("the median grades of
# the three age groups are similar"; age groups 15, 16, 17) is about grades,
# which this age-only plot does not show. Re-derive the observation from the
# rendered curve (e.g. which ages are most common).

In [None]:
# Do urban students perform better than rural students?
# Grade distribution by address: one filled KDE curve of G3 per address type
urban_grades = students.loc[students['address'] == 'U', 'G3']
rural_grades = students.loc[students['address'] == 'R', 'G3']
# fill= is the current name of the deprecated shade= argument
sns.kdeplot(urban_grades, label='Urban', fill=True)
sns.kdeplot(rural_grades, label='Rural', fill=True)
plt.title('Do urban students score higher than rural students?')
plt.xlabel('Grade')
plt.ylabel('Density')
# In current seaborn, label= only names the curves; draw the legend explicitly
# so the reader can tell the two distributions apart.
plt.legend()
plt.show()
# Observation: the graph shows
# there is not much difference between the grades based on location.

In [None]:
# Correlation of every numeric column with the final grade G3, ascending.
# The frame also holds text columns (e.g. 'sex', 'address'); restrict to numeric
# dtypes explicitly, because pandas >= 2.0 raises instead of silently skipping
# non-numeric columns in corr().
students.select_dtypes(include='number').corr()['G3'].sort_values()

In [None]:
# Remove the school identifier and the two period grades from the frame
students = students.drop(columns=['school', 'G1', 'G2'])
# Observation: G1 and G2 are the period grades of a student and are highly
# correlated with the final grade G3, yet we drop them on purpose.
# Predicting G3 without G1 and G2 is harder,
# but such a prediction is much more useful,
# because the goal is to find which other factors affect the grade.

In [None]:
# Rank the numeric columns by the absolute value of their correlation with G3.
# Text columns are excluded explicitly: pandas >= 2.0 raises on them in corr()
# instead of skipping them silently.
most_correlated = (
    students.select_dtypes(include='number')
    .corr()
    .abs()['G3']
    .sort_values(ascending=False)
)
# Keep 6 entries: G3 itself (|corr| = 1 with itself) plus the 5 features
# most strongly correlated with it
most_correlated = most_correlated.iloc[:6]
most_correlated

In [None]:
# Narrow the frame to the columns listed in most_correlated
# (its index holds the selected column names), then preview the result
students = students[list(most_correlated.index)]
students.head()

In [None]:
# Previous failures against the final grade, one point per student
plot = sns.stripplot(data=students, x='failures', y='G3', palette='autumn')
plot.set_title('Previous Failures vs Final Grade(G3)')
# Observation: students with fewer previous failures usually score higher

In [None]:
# Does age affect the final grade? One point per student, grouped by age
plot = sns.stripplot(data=students, x='age', y='G3', palette='autumn')
plot.set_title('Age vs Final Grade(G3)')
# Observation:
# Age group 20 seems to score the highest grades among all.

In [None]:
# Family education attribute: father's education (Fedu) plus mother's (Medu),
# plotted against the final grade
family_education = students['Fedu'].add(students['Medu'])
plot = sns.stripplot(x=family_education, y=students['G3'], palette='autumn')
plot.set_title('Family Education vs Final Grade(G3)')
# Observation: more educated families tend to produce the highest grades

In [None]:
# How often a student goes out ('goout') against the final grade
plot = sns.stripplot(data=students, x='goout', y='G3', palette='autumn')
plot.set_title('Go Out vs Final Grade(G3)')
# Observation: students who go out a lot score less

In [None]:
# Splitting the dataset into a training set (80%) and a test set (20%).
# train_test_split is imported by name: a bare `import sklearn` does not load
# the model_selection submodule, so `sklearn.model_selection...` only resolves
# when some other import happens to pull it in.
# Note: the first argument is the full `students` frame, so X_train and X_test
# still carry the 'G3' column; it has to be dropped before fitting a model.
X_train, X_test, y_train, y_test = train_test_split(
    students, students['G3'], test_size=0.2, random_state=20
)

In [None]:
# Preview the first five rows of the training frame
X_train.head(5)

In [None]:
# Evaluate several ml models by training on training set and testing on testing set
def evaluate(X_train, X_test, y_train, y_test, target='G3', random_state=20):
    """Fit three regressors and report their test-set MAE and RMSE.

    Parameters
    ----------
    X_train, X_test : pandas.DataFrame
        Feature frames. If they still contain the ``target`` column it is
        removed before fitting, so the label is never used as a feature.
        Frames that do not contain it are accepted unchanged.
    y_train, y_test : array-like
        Target values aligned with ``X_train`` / ``X_test``.
    target : str, default 'G3'
        Name of the label column to strip from the feature frames.
    random_state : int or None, default 20
        Seed passed to the random forest so repeated runs give the same
        scores. Pass ``None`` for an unseeded forest.

    Returns
    -------
    pandas.DataFrame
        One row per model ('Linear Regression', 'Random Forest', 'SVM'),
        with numeric columns 'mae' and 'rmse'.
    """
    X_train = X_train.drop(columns=target, errors='ignore')
    X_test = X_test.drop(columns=target, errors='ignore')
    # Model name -> freshly instantiated estimator (dict order = row order)
    models = {
        'Linear Regression': LinearRegression(),
        'Random Forest': RandomForestRegressor(n_estimators=100, random_state=random_state),
        'SVM': SVR(kernel='rbf', degree=3, C=1.0, gamma='auto'),
    }
    rows = []
    for model in models.values():
        model.fit(X_train, y_train)
        predictions = model.predict(X_test)
        # Use the metric helpers the notebook already imports; RMSE is the
        # square root of the mean squared error.
        mae = mean_absolute_error(y_test, predictions)
        rmse = np.sqrt(mean_squared_error(y_test, predictions))
        rows.append([mae, rmse])
    # Build the table once from the collected rows so the columns are numeric
    # rather than object dtype.
    return pd.DataFrame(rows, index=list(models), columns=['mae', 'rmse'])

In [None]:
# Score the models on the held-out split and display the comparison table
results = evaluate(X_train=X_train, X_test=X_test, y_train=y_train, y_test=y_test)
results

In [None]:
# Two side-by-side bar charts, one per error metric, each sorted so the
# model with the lowest error comes first
fig, axes = plt.subplots(1, 2, figsize=(12, 7))
panels = [
    # (metric column, bar colour, panel title, y-axis label)
    ('mae', 'violet', 'Model Mean Absolute Error', 'MAE'),
    ('rmse', 'pink', 'Model Root Mean Squared Error', 'RMSE'),
]
for ax, (metric, colour, title, ylabel) in zip(axes, panels):
    ordered = results.sort_values(metric, ascending=True)
    ordered.plot.bar(y=metric, color=colour, ax=ax)
    ax.set_title(title)
    ax.set_ylabel(ylabel)
plt.show()

# Conclusion:
# Looking at both the mean absolute error and the root mean squared error,
# linear regression performs best in both cases

In [None]:
# Create Linear Regression object
regressor = linear_model.LinearRegression()
# Fitting the linear regression model to the training set
regressor.fit(X_train, y_train)

In [None]:
r_squared = regressor.score(X_test,y_test)
print(r_squared)

In [None]:
# Making Predictions and Displaying
predictions = regressor.predict(X_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
df

In [None]:
# How Linear regression model Predicts For different Features
# Earlier cells narrowed `students` (G1/G2 were dropped), so reload the raw CSV.
students = pd.read_csv('C:/Users/HP1/Downloads/student/student-mat.csv',sep=';')
# Changing Features: this time the period grades G1 and G2 are kept as inputs
students = students[["G1", "G2", "G3", "age", "studytime", "failures", "absences"]]
predict = "G3"
# Features are every selected column except the target; use `predict` for both
# so the target name is defined in one place only.
X = np.array(students.drop(columns=predict))
y = np.array(students[predict])
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=20
)
regressor = linear_model.LinearRegression()
regressor.fit(x_train, y_train)
r_squared = regressor.score(x_test,y_test)
print(r_squared)
predictions = regressor.predict(x_test)
df = pd.DataFrame({'Actual': y_test, 'Predicted': predictions})
df
# Conclusion:
# score() returns R^2, the share of the variance in G3 that the model explains.
# The figure of 86% recorded for this run therefore means the model explains
# about 86% of the variance in the final grade, not that 86% of the rows
# "fit" the model.