## 1. Problem Statement

- Predict the Performance Index (marks) of students based on their Previous Scores and Hours Studied.

## 2. Importing Libraries and gathering datasets 

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go

In [2]:
# Load the raw student-performance CSV.
# Raw string: the Windows path contains backslashes (\C, \M, \d, \S) that are not
# valid escape sequences; a plain literal only works by accident and emits a
# SyntaxWarning on recent Python versions. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider a path
# relative to the notebook so others can re-run it.
data = pd.read_csv(r'E:\College_Exp_Codes\ML\dataset\Student_Performance.csv')

In [3]:
# Discard the columns that are not used as predictors in this notebook,
# leaving the two features and the target.
data = data.drop(
    columns=[
        'Extracurricular Activities',
        'Sleep Hours',
        'Sample Question Papers Practiced',
    ]
)

In [4]:
# Work with only the first 100 rows; every statistic, plot and model below
# is based on this subset.
data = data.iloc[:100]

In [22]:
# Preview the first five rows.
# NOTE(review): the saved output shows underscore column names even though the
# rename cell sits below this one — this cell was executed after the rename
# (execution count 22 vs 5). On a fresh Run All the headers shown here would
# still be the original space-separated names.
data.head()

Unnamed: 0,Hours_Studied,Previous_Scores,Performance_Index
0,7,99,91.0
1,4,82,65.0
2,8,51,45.0
3,5,52,36.0
4,7,75,66.0


In [5]:
# Replace the spaces in the remaining column names with underscores so the
# columns can be referenced by a single consistent identifier below.
data = data.rename(
    columns={
        'Hours Studied': 'Hours_Studied',
        'Previous Scores': 'Previous_Scores',
        'Performance Index': 'Performance_Index',
    }
)

In [23]:
# (rows, columns) — 100 rows after the head(100) truncation, 3 columns after the drop.
data.shape

(100, 3)

In [24]:
# Summary statistics (count, mean, std, quartiles, min/max) for each column.
data.describe()

Unnamed: 0,Hours_Studied,Previous_Scores,Performance_Index
count,100.0,100.0,100.0
mean,5.32,70.82,57.19
std,2.639559,16.589141,19.253595
min,1.0,40.0,15.0
25%,3.0,59.0,41.75
50%,5.0,72.0,59.0
75%,8.0,83.0,71.0
max,9.0,99.0,98.0


In [25]:
# Column dtypes, non-null counts and memory footprint of the working frame.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 3 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Hours_Studied      100 non-null    int64  
 1   Previous_Scores    100 non-null    int64  
 2   Performance_Index  100 non-null    float64
dtypes: float64(1), int64(2)
memory usage: 2.5 KB


In [26]:
# Number of missing values per column (all zero in the saved output).
data.isna().sum()

Hours_Studied        0
Previous_Scores      0
Performance_Index    0
dtype: int64

In [27]:
# Exploratory 3-D scatter: the two predictors on x/y, the target on z.
px.scatter_3d(data,x='Hours_Studied',y='Previous_Scores',z='Performance_Index')

In [7]:
# Feature matrix: every column except the target.
x = data.drop(columns='Performance_Index')

In [8]:
# Display the feature frame (Hours_Studied, Previous_Scores).
# NOTE(review): this renders the whole frame; x.head() would be enough for a sanity check.
x

Unnamed: 0,Hours_Studied,Previous_Scores
0,7,99
1,4,82
2,8,51
3,5,52
4,7,75
...,...,...
95,3,48
96,8,64
97,6,95
98,3,55


In [9]:
# Target as a single-column DataFrame (not a Series). Keeping it 2-D matters:
# the plane computation further down indexes model.coef_ with two indices,
# which relies on the model having been fitted against a 2-D target.
y = data.loc[:, ['Performance_Index']]

In [10]:
# Display the target frame (Performance_Index).
# NOTE(review): this renders the whole frame; y.head() would be enough for a sanity check.
y

Unnamed: 0,Performance_Index
0,91.0
1,65.0
2,45.0
3,36.0
4,66.0
...,...
95,30.0
96,57.0
97,81.0
98,36.0


In [11]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.3,
    random_state=42,
)

In [12]:
# Sanity check on the split: 70% of the 100 rows, two feature columns.
x_train.shape

(70, 2)

In [13]:
from sklearn.linear_model import LinearRegression
# Ordinary least-squares linear model with default settings; fitted in the next cell.
model = LinearRegression()

In [14]:
# Fit the regression on the training split only.
model.fit(X=x_train, y=y_train)

In [15]:
# Predict the performance index for the held-out rows.
y_pred = model.predict(X=x_test)

In [16]:
from sklearn.metrics import r2_score

In [17]:
# Coefficient of determination (R^2) on the held-out test split.
r2_score(y_true=y_test, y_pred=y_pred)

0.982921131041535

In [18]:
# Base figure: 3-D scatter of all 100 observations (both train and test rows).
# The figure is kept in `fig` so the fitted regression plane can be added as a
# second trace in a later cell.
fig = px.scatter_3d(
    x=data['Hours_Studied'], 
    y=data['Previous_Scores'], 
    z=data['Performance_Index'], 
    title="3D Scatter Plot with Best Fit Plane"
)

In [19]:
# Build a regular 100 x 100 grid covering the observed range of both features;
# the fitted plane is evaluated on this grid for plotting.
# Series.min()/.max() are used instead of the Python builtins: they are
# vectorized and skip NaN, whereas builtin min()/max() iterate element by
# element and give order-dependent results if a NaN is present.
x_range = np.linspace(data['Hours_Studied'].min(), data['Hours_Studied'].max(), 100)
y_range = np.linspace(data['Previous_Scores'].min(), data['Previous_Scores'].max(), 100)
xx, yy = np.meshgrid(x_range, y_range)

In [21]:
# The model was fitted against a single-column 2-D target, so coef_ has one
# row (one per target) and intercept_ has one element; pull out the scalars.
hours_coef, prev_score_coef = model.coef_[0]
plane_offset = model.intercept_[0]

# Evaluate the fitted plane z = b0 + b1 * hours + b2 * previous_score on the grid.
zz = plane_offset + hours_coef * xx + prev_score_coef * yy

# Overlay the regression plane on the existing scatter figure.
best_fit_plane = go.Surface(x=xx, y=yy, z=zz, opacity=1.0, colorscale='Viridis')
fig.add_trace(best_fit_plane)

# Human-readable axis titles for the 3-D scene.
fig.update_layout(
    scene={
        'xaxis_title': 'Hours Studied',
        'yaxis_title': 'Previous Scores',
        'zaxis_title': 'Performance Index',
    }
)

# Render the combined scatter + plane figure.
fig.show()