# Student Score Prediction


---



`Requirements`


- Perform data cleaning and basic visualization to understand the dataset
- Predict students' exam scores based on their study hours and other factors
- Train a **Linear or Multiple Linear Regression Model** and a **Polynomial Regression Model** to estimate the final score
- Dataset : [Student Performance Factors](https://www.kaggle.com/datasets/lainguyn123/student-performance-factors)

## Importing Libraries and Dataset

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import OneHotEncoder, LabelEncoder, RobustScaler
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Location of the Student Performance Factors CSV in the Kaggle input directory.
DATASET_PATH = '/kaggle/input/student-performance-factors/StudentPerformanceFactors.csv'
df = pd.read_csv(DATASET_PATH)

## EDA and Pre-Processing

In [None]:
# Print each column's dtype and non-null count to spot missing data and type issues.
df.info()

In [None]:
# Preview the first rows of the raw dataset.
df.head()

In [None]:
# Count rows that are exact duplicates of an earlier row, then report the total.
duplicate_count = df.duplicated().sum()
print(f'There are {duplicate_count} duplicated rows')

In [None]:
# Scatter of study hours against exam score with a fitted regression line drawn in red.
sns.regplot(
    data=df,
    x='Hours_Studied',
    y='Exam_Score',
    line_kws={'color': 'red'},
)

In [None]:
# Convert every object-typed column to the pandas "category" dtype in one cast.
text_columns = df.select_dtypes(include="object").columns
df = df.astype({name: "category" for name in text_columns})

In [None]:
# Re-check dtypes: the former object columns should now be reported as category.
df.info()

In [None]:
# Count missing values per column.
df.isna().sum()

### Missing Values

In [None]:
# Fill gaps in these columns with each column's most frequent value (mode).
for column in ('Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home'):
    most_common = df[column].mode()[0]
    df[column] = df[column].fillna(most_common)

In [None]:
# Re-count missing values per column after the mode imputation.
df.isna().sum()

In [None]:
# Numeric features whose distributions are inspected below.
numerical_cols = [
    'Hours_Studied',
    'Attendance',
    'Sleep_Hours',
    'Previous_Scores',
    'Tutoring_Sessions',
    'Physical_Activity',
]

# One histogram with a KDE overlay per feature, placed on a 3x3 subplot grid.
plt.figure(figsize=(15, 10))
for position, feature in enumerate(numerical_cols, start=1):
    ax = plt.subplot(3, 3, position)
    sns.histplot(
        df[feature],
        kde=True,
        bins=30,
        color="red",
        line_kws={"linewidth": 3},
        ax=ax,
    )
    ax.set_title(feature)

plt.tight_layout()
plt.show()

In [None]:
# Numeric features plus the target, used for the pairwise correlation matrix.
cols = [
    'Hours_Studied',
    'Attendance',
    'Sleep_Hours',
    'Previous_Scores',
    'Tutoring_Sessions',
    'Physical_Activity',
    'Exam_Score',
]

# Pairwise correlation of the selected columns (DataFrame.corr default method).
corr = df.loc[:, cols].corr()

# Annotated heatmap in red shades, values shown with two decimals.
fig, ax = plt.subplots(figsize=(7, 5))
sns.heatmap(corr, annot=True, cmap="Reds", fmt=".2f", ax=ax)
ax.set_title("Correlation between Numeric Features")
plt.show()



In [None]:
# One boxplot per numeric feature on a 3x3 subplot grid, to eyeball outliers.
plt.figure(figsize=(15, 15))
for position, feature in enumerate(numerical_cols, start=1):
    ax = plt.subplot(3, 3, position)
    sns.boxplot(df[feature], color="#ff4252", ax=ax)
    ax.set_title(feature)

plt.tight_layout()
plt.show()

### Handling Outliers

In [None]:
# Remove rows that contain extreme numeric values, using a per-column z-score rule.
# NOTE(review): columns are selected by dtype, so a numeric target column
# (e.g. Exam_Score) is filtered too — confirm this is intended.
numeric_cols = df.select_dtypes(include=np.number).columns
data_numeric = df[numeric_cols]

# Absolute z-score of every numeric cell: |x - column mean| / column std.
col_std = data_numeric.std()
z = np.abs((data_numeric - data_numeric.mean()) / col_std)
threshold = 3

# A zero-variance column produces NaN z-scores (0 / 0), and NaN < threshold is
# False, which would silently discard every row. Such columns carry no outlier
# information, so they are excluded from the row filter.
informative_cols = col_std[col_std > 0].index

# Keep only rows where all informative numeric values are within threshold.
# .copy() makes the result an independent frame, so later column assignments
# on df do not act on a slice of the original frame.
rows_within_threshold = (z[informative_cols] < threshold).all(axis=1)
df_clean = df[rows_within_threshold].copy()

# Check how many rows were removed
print(f"Original rows: {df.shape[0]}")
print(f"Rows after outlier removal: {df_clean.shape[0]}")
print(f"Rows removed: {df.shape[0] - df_clean.shape[0]}")

# Update df
df = df_clean

### Data Distribution

In [None]:
# Categorical features whose class frequencies are plotted below.
categorical_cols = [
    'Parental_Involvement',
    'Access_to_Resources',
    'Extracurricular_Activities',
    'Motivation_Level',
    'Internet_Access',
    'Family_Income',
    'Teacher_Quality',
    'School_Type',
    'Peer_Influence',
    'Learning_Disabilities',
    'Parental_Education_Level',
    'Distance_from_Home',
    'Gender',
]

# One count plot per categorical feature on a 5x3 subplot grid.
plt.figure(figsize=(15, 20))
for position, feature in enumerate(categorical_cols, start=1):
    ax = plt.subplot(5, 3, position)
    sns.countplot(data=df, x=feature, palette='Reds', ax=ax)
    ax.set_title(f'Distribution of {feature}')
    # Rotate the tick labels of the subplot just drawn (still the current axes).
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()