In [29]:
import pandas as pd

In [30]:
df = pd.read_csv("/content/Student_Performance.csv")
df

Unnamed: 0,Hours Studied,Previous Scores,Extracurricular Activities,Sleep Hours,Sample Question Papers Practiced,Performance Index
0,7,99,Yes,9,1,91.0
1,4,82,No,4,2,65.0
2,8,51,Yes,7,2,45.0
3,5,52,Yes,5,2,36.0
4,7,75,No,8,5,66.0
...,...,...,...,...,...,...
9995,1,49,Yes,4,2,23.0
9996,7,64,Yes,8,5,58.0
9997,6,83,Yes,8,5,74.0
9998,9,97,Yes,7,0,95.0


In [31]:
X = df.drop(labels=['Performance Index'],axis=1)
Y = df[['Performance Index']]

In [32]:
#seperate Categorical and continous features
#Separates the columns of a DataFrame Beacause 
#This separation of columns based on their data 
#types can be useful in machine learning workflows, 
#especially when handling different types of features 
#for preprocessing or modeling purposes.
def catconsep(df):
    cat = list(df.columns[df.dtypes=='object'])
    con = list(df.columns[df.dtypes!='object'])
    return cat,con
cat,con = catconsep(df)

In [33]:
# seperate categorical and continous features from X
cat1,con1 = catconsep(X)
cat1

['Extracurricular Activities']

In [34]:
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [35]:
#This code segment constructs preprocessing pipelines using 
#scikit-learn's Pipeline and ColumnTransformer classes for 
#handling numerical and categorical features separately. 

num_pipe = Pipeline(steps=[('SimpleImputer',SimpleImputer(strategy='mean')),
                           ('Scaler',StandardScaler())])
cat_pipe = Pipeline(steps = [('SimpleImputer', SimpleImputer(strategy='most_frequent')),
                             ('OHE',OneHotEncoder(handle_unknown='ignore'))])
pre = ColumnTransformer([('num',num_pipe,con1),
                        ('cat',cat_pipe,cat1)])

In [36]:
X_pre = pre.fit_transform(X)
X_pre

array([[ 0.77518771,  1.70417565,  1.45620461, -1.24975394,  0.        ,
         1.        ],
       [-0.38348058,  0.72391268, -1.49229423, -0.90098215,  1.        ,
         0.        ],
       [ 1.16141048, -1.06362569,  0.27680507, -0.90098215,  0.        ,
         1.        ],
       ...,
       [ 0.38896495,  0.7815752 ,  0.86650484,  0.1453332 ,  0.        ,
         1.        ],
       [ 1.54763324,  1.5888506 ,  0.27680507, -1.59852572,  0.        ,
         1.        ],
       [ 0.77518771,  0.26261245,  0.86650484, -1.24975394,  1.        ,
         0.        ]])

In [37]:
#The cols variable contains a list of column names that are 
#likely used as a result of the column transformation or encoding performed on the dataset

cols = ['num__Hours Studied', 'num__Previous Scores', 'num__Sleep Hours',
       'num__Sample Question Papers Practiced',
       'cat__Extracurricular Activities_No',
       'cat__Extracurricular Activities_Yes']
cols

['num__Hours Studied',
 'num__Previous Scores',
 'num__Sleep Hours',
 'num__Sample Question Papers Practiced',
 'cat__Extracurricular Activities_No',
 'cat__Extracurricular Activities_Yes']

In [38]:
# compose a dataframe
X_pre = pd.DataFrame(X_pre,columns=cols)
X_pre.head()

Unnamed: 0,num__Hours Studied,num__Previous Scores,num__Sleep Hours,num__Sample Question Papers Practiced,cat__Extracurricular Activities_No,cat__Extracurricular Activities_Yes
0,0.775188,1.704176,1.456205,-1.249754,0.0,1.0
1,-0.383481,0.723913,-1.492294,-0.900982,1.0,0.0
2,1.16141,-1.063626,0.276805,-0.900982,0.0,1.0
3,0.002742,-1.005963,-0.902594,-0.900982,0.0,1.0
4,0.775188,0.320275,0.866505,0.145333,1.0,0.0


In [39]:
#This code snippet utilizes the train_test_split function from scikit-learn 
#to split the preprocessed feature dataset X_pre and the target variable 
#dataset Y into training and testing sets.

from sklearn.model_selection import train_test_split
xtrain,xtest,ytrain,ytest = train_test_split(X_pre,Y,test_size=0.2,random_state=21)

Linear Regression Model

In [40]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(xtrain,ytrain)

In [41]:
model.score(xtrain,ytrain)

0.988716967469377

In [42]:
#computes the coefficient of determination (R-squared) 
#of the linear regression model on the provided testing data
model.score(xtest,ytest)

0.9888763071047979

In [43]:
# In this Code Lines evaluates a given model's performance metrics using scikit-learn's evaluation metrics: mean absolute 
#error (MAE), mean squared error (MSE), root mean squared error (RMSE), 
#and R-squared score (R2 score) on both training and testing datasets.
from sklearn.metrics import mean_absolute_error,mean_squared_error,r2_score
def model_evaluation(xtrain,ytrain,xtest,ytest,model):

    ypred_tr = model.predict(xtrain)
    ypred_ts = model.predict(xtest)

    tr_mae = mean_absolute_error(ytrain,ypred_tr)
    tr_mse = mean_squared_error(ytrain,ypred_tr)
    tr_rmse = tr_mse**(1/2)
    tr_r2 = r2_score(ytrain,ypred_tr)

    ts_mae = mean_absolute_error(ytest,ypred_ts)
    ts_mse = mean_squared_error(ytest,ypred_ts)
    ts_rmse = ts_mse**(1/2)
    ts_r2 = r2_score(ytest,ypred_ts)

    print('Training Model Evaluation\n')
    print(f"Mean Absolute Error: {tr_mae:.2f}")
    print(f"Mean Squared Error: {tr_mse:.2f}")
    print(f"Root Mean Squared Error: {tr_rmse:.2f}")
    print(f"R2 Score: {tr_r2:.4f}")
    print ('----------------------------------')
    print('\nTesting Model Evaluation\n')
    print(f"Mean Absolute Error: {ts_mae:.2f}")
    print(f"Mean Squared Error: {ts_mse:.2f}")
    print(f"Root Mean Squared Error: {ts_rmse:.2f}")
    print(f"R2 Score: {ts_r2:.4f}")

model_evaluation(xtrain,ytrain,xtest,ytest,model)

Training Model Evaluation

Mean Absolute Error: 1.61
Mean Squared Error: 4.14
Root Mean Squared Error: 2.04
R2 Score: 0.9887
----------------------------------

Testing Model Evaluation

Mean Absolute Error: 1.63
Mean Squared Error: 4.19
Root Mean Squared Error: 2.05
R2 Score: 0.9889


In [44]:
#when we Execute this code line we can get trained linear regression model
import pickle
pickle.dump(model,open('finalModel.pkl','wb'))

In [45]:
model = pickle.load(open('finalModel.pkl','rb'))
# load the model and it can be used for future prediction works

In [46]:
pickle.dump(pre,open('finalEncoder.pkl','wb'))
# enc = pickle.load(open('finalEncoder.pkl','rb'))

In [47]:
enc = pickle.load(open('finalEncoder.pkl','rb'))

In [48]:
#Here We can we create variable for test our model and Get some prediction
import numpy as np
inPut = np.array([[1,64,'Yes',5,4]])

In [49]:
data_columns = ['hours_studied', 'previous_scores', 'extracurricular_activities', 'sleep_hours', 'sample_papers_practiced']
inPut_df = pd.DataFrame(inPut, columns=data_columns)
inPut_df

Unnamed: 0,hours_studied,previous_scores,extracurricular_activities,sleep_hours,sample_papers_practiced
0,1,64,Yes,5,4


In [50]:
X_pre

Unnamed: 0,num__Hours Studied,num__Previous Scores,num__Sleep Hours,num__Sample Question Papers Practiced,cat__Extracurricular Activities_No,cat__Extracurricular Activities_Yes
0,0.775188,1.704176,1.456205,-1.249754,0.0,1.0
1,-0.383481,0.723913,-1.492294,-0.900982,1.0,0.0
2,1.161410,-1.063626,0.276805,-0.900982,0.0,1.0
3,0.002742,-1.005963,-0.902594,-0.900982,0.0,1.0
4,0.775188,0.320275,0.866505,0.145333,1.0,0.0
...,...,...,...,...,...,...
9995,-1.542149,-1.178951,-1.492294,-0.900982,0.0,1.0
9996,0.775188,-0.314013,0.866505,0.145333,0.0,1.0
9997,0.388965,0.781575,0.866505,0.145333,0.0,1.0
9998,1.547633,1.588851,0.276805,-1.598526,0.0,1.0


In [51]:
df = pd.read_csv("/content/Student_Performance.csv") #Loading a dataset (Student_Performance.csv) into a DataFrame (df).
X = df.drop(labels=['Performance Index'],axis=1) #Separating features (X) and the target variable (Y) from the dataset.
Y = df[['Performance Index']]
#seperate Categorical and continous features
def catconsep(df):
    cat = list(df.columns[df.dtypes=='object'])
    con = list(df.columns[df.dtypes!='object'])
    return cat,con
cat,con = catconsep(df)
# seperate categorical and continous features from X
cat1,con1 = catconsep(X)
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
num_pipe = Pipeline(steps=[('SimpleImputer',SimpleImputer(strategy='mean')),
                           ('Scaler',StandardScaler())])
cat_pipe = Pipeline(steps = [('SimpleImputer', SimpleImputer(strategy='most_frequent')),
                             ('OHE',OneHotEncoder(handle_unknown='ignore'))])
pre = ColumnTransformer([('num',num_pipe,con1),
                        ('cat',cat_pipe,cat1)]) #handles both numerical and categorical columns separately with appropriate preprocessing 
cols = ['num__Hours Studied', 'num__Previous Scores', 'num__Sleep Hours',
       'num__Sample Question Papers Practiced',
       'cat__Extracurricular Activities_No',
       'cat__Extracurricular Activities_Yes']
X_pre1 = pre.fit_transform(X)

inPut_df = pd.DataFrame(inPut, columns=['Hours Studied', 'Previous Scores', 'Extracurricular Activities',
       'Sleep Hours', 'Sample Question Papers Practiced'])

X_pre = pre.transform(inPut_df) # make prediction using the preprocessed input data (X_pre) to predict the performance index using the trained model.
X_pre = pd.DataFrame(X_pre,columns=cols)

prediction = model.predict(X_pre)
print('prediction is', prediction)

prediction is [[37.75459673]]
