## 1. Import Libraries

## 2. Load Data

## 3. Data Cleaning

## 4. Feature Engineering (Optional)

## 5. Split Data

In [None]:
What We Do:

Split data into training (80%) and testing (20%) sets.

## 6. Data Transformation

## 7. Instantiate Model

In [None]:
Choose a model (e.g., LinearRegression, RandomForest).

## 8. Train Model

In [None]:
Fit the model on the scaled training features (X_train_scaled) and y_train.

## 9. Make Predictions

In [None]:
Generate predictions using model.predict(X_test_scaled), i.e. on the scaled test features.

## 10. Evaluate Model

## 11. Deploy Model (Optional)

In [None]:
Save the trained model and the fitted scaler as .pkl files so both can be reloaded for prediction.

In [2]:
# Step 1: Import Libraries

import pickle

import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [3]:
# Step 2: Load Data
#
# No external file is read: a synthetic table of 100 students is generated and
# the target column is derived from the three features plus Gaussian noise.

# Seed NumPy's global RNG so the generated data (and every number derived from
# it in later cells) is identical on each Restart & Run All.
np.random.seed(42)

# Generate synthetic data for 100 students.
# np.random.randint excludes the upper bound, so each range is half-open.
data = {
    'Study_Hours': np.random.randint(1, 10, 100),        # Integers 1-9
    'Attendance': np.random.randint(60, 100, 100),       # Integers 60-99 (%)
    'Previous_Score': np.random.randint(40, 90, 100)     # Integers 40-89
}

# Calculate Final_Percentage as a linear combination of the features (with noise)
data['Final_Percentage'] = (
    3 * data['Study_Hours'] +
    0.5 * data['Attendance'] +
    0.2 * data['Previous_Score'] +
    np.random.normal(0, 3, 100)  # Add Gaussian noise (mean=0, std=3)
)

# Create DataFrame (one column per key of `data`)
df = pd.DataFrame(data)

# Verify the loaded data
print("First 5 rows of the DataFrame:")
print(df.head())
print("\nShape of the DataFrame (rows, columns):", df.shape)

First 5 rows of the DataFrame:
   Study_Hours  Attendance  Previous_Score  Final_Percentage
0            8          83              52         73.867908
1            9          99              64         88.487144
2            6          66              86         67.409281
3            1          66              67         54.681634
4            6          82              63         71.951993

Shape of the DataFrame (rows, columns): (100, 4)


In [4]:
# Display the DataFrame built in the previous cell (bare last expression -> notebook display).
df

Unnamed: 0,Study_Hours,Attendance,Previous_Score,Final_Percentage
0,8,83,52,73.867908
1,9,99,64,88.487144
2,6,66,86,67.409281
3,1,66,67,54.681634
4,6,82,63,71.951993
...,...,...,...,...
95,7,74,73,76.922856
96,9,87,75,87.813489
97,4,63,69,52.916893
98,6,91,87,78.468441


In [5]:
# Step 3: Data Cleaning

# Report how many null entries each column holds before any cleaning is done.
print("Missing Values Before Cleaning:", df.isna().sum(), sep="\n")

# Nothing to impute: the synthetic columns are generated without gaps.
# If gaps did exist, a mean imputation would look like this (illustration only):
# df['Attendance'].fillna(df['Attendance'].mean(), inplace=True)



Missing Values Before Cleaning:
Study_Hours         0
Attendance          0
Previous_Score      0
Final_Percentage    0
dtype: int64


In [6]:
# Count rows that are exact repeats of an earlier row, report the count,
# then drop those repeats from df in place.
duplicate_mask = df.duplicated()
duplicates = duplicate_mask.sum()
print(f"\nNumber of Duplicate Rows: {duplicates}")
df.drop_duplicates(inplace=True)




Number of Duplicate Rows: 0


In [8]:
# Check for invalid/outlier values.
# The summary statistics are shown once, through the cell's last expression,
# rather than being printed as text and then displayed a second time.
print("\nData Summary Before Handling Outliers:")
df.describe()


Data Summary Before Handling Outliers:
       Study_Hours  Attendance  Previous_Score  Final_Percentage
count   100.000000   100.00000      100.000000        100.000000
mean      5.270000    79.05000       65.240000         68.383913
std       2.781087    10.84731       15.636089         11.122117
min       1.000000    60.00000       40.000000         44.064583
25%       3.000000    69.00000       51.000000         60.654676
50%       5.000000    79.00000       68.000000         67.780028
75%       8.000000    88.25000       78.000000         75.655091
max       9.000000    99.00000       89.000000         90.319626


Unnamed: 0,Study_Hours,Attendance,Previous_Score,Final_Percentage
count,100.0,100.0,100.0,100.0
mean,5.27,79.05,65.24,68.383913
std,2.781087,10.84731,15.636089,11.122117
min,1.0,60.0,40.0,44.064583
25%,3.0,69.0,51.0,60.654676
50%,5.0,79.0,68.0,67.780028
75%,8.0,88.25,78.0,75.655091
max,9.0,99.0,89.0,90.319626


In [9]:
# Outlier handling (demonstration): clamp Study_Hours into the closed range [1, 15].
# The synthetic generator only produces values inside this range, so the call
# serves as an example of the technique rather than changing any row.
df['Study_Hours'] = df['Study_Hours'].clip(1, 15)  # Cap study hours



In [10]:
# Post-cleaning sanity check: per-column null counts, then the frame's shape.
print("\nMissing Values After Cleaning:", df.isna().sum(), sep="\n")
print(f"\nShape After Cleaning: {df.shape}")


Missing Values After Cleaning:
Study_Hours         0
Attendance          0
Previous_Score      0
Final_Percentage    0
dtype: int64

Shape After Cleaning: (100, 4)


In [11]:
# Step 4: Feature Engineering (Optional but Powerful!)

# Feature 1 - interaction term: element-wise product Study_Hours x Attendance.
df['Study_Attendance_Interaction'] = df['Study_Hours'].mul(df['Attendance'])



In [12]:
# Feature 2 - performance consistency: Previous_Score as a percentage of Attendance
# (ratio of the two columns, multiplied by 100).
df['Consistency_Score'] = df['Previous_Score'].div(df['Attendance']).mul(100)



In [13]:
# Feature 3 - bucket Study_Hours into three labelled categories.
# With pd.cut's default right-closed intervals the edges [0, 3, 6, 10] give
# (0, 3] -> Low, (3, 6] -> Medium, (6, 10] -> High.
df['Study_Effort'] = pd.cut(df['Study_Hours'], [0, 3, 6, 10], labels=['Low', 'Medium', 'High'])



In [14]:
# Feature 4 - polynomial term: Previous_Score raised to the power 2, giving a
# column that can express a non-linear relationship.
df['Previous_Score_Squared'] = df['Previous_Score'].pow(2)



In [15]:
# Preview the first rows of the original inputs next to the engineered columns.
preview_columns = [
    'Study_Hours',
    'Attendance',
    'Previous_Score',
    'Study_Attendance_Interaction',
    'Consistency_Score',
    'Study_Effort',
    'Previous_Score_Squared',
]
print("\nDataFrame After Feature Engineering:")
print(df.head()[preview_columns])


DataFrame After Feature Engineering:
   Study_Hours  Attendance  Previous_Score  Study_Attendance_Interaction  \
0            8          77              51                           616   
1            5          94              70                           470   
2            3          71              40                           213   
3            6          71              67                           426   
4            2          61              56                           122   

   Consistency_Score Study_Effort  Previous_Score_Squared  
0          66.233766         High                    2601  
1          74.468085       Medium                    4900  
2          56.338028          Low                    1600  
3          94.366197       Medium                    4489  
4          91.803279          Low                    3136  


In [16]:
# Display the DataFrame again, now including the engineered feature columns added above.
df

Unnamed: 0,Study_Hours,Attendance,Previous_Score,Final_Percentage,Study_Attendance_Interaction,Consistency_Score,Study_Effort,Previous_Score_Squared
0,8,77,51,72.619896,616,66.233766,High,2601
1,5,94,70,76.522514,470,74.468085,Medium,4900
2,3,71,40,52.739293,213,56.338028,Low,1600
3,6,71,67,64.709234,426,94.366197,Medium,4489
4,2,61,56,50.066792,122,91.803279,Low,3136
...,...,...,...,...,...,...,...,...
95,9,91,83,89.162740,819,91.208791,High,6889
96,5,69,45,53.578393,345,65.217391,Medium,2025
97,3,81,62,64.594010,243,76.543210,Low,3844
98,8,90,64,83.194494,720,71.111111,High,4096


In [None]:
# NOTE(review): unexecuted scratch cell. This is a bare tuple expression that
# assigns nothing; the values match the example student used in the prediction
# cell further down. Presumably leftover notes - confirm and consider removing.
8, 90, 85

In [5]:
# NOTE(review): repeat of the earlier bare `df` display. Its execution count is
# lower than the cells above it and its saved output shows only the four
# original columns, so it appears to have been run in a different kernel
# session than the feature-engineering cells - confirm and consider removing.
df

Unnamed: 0,Study_Hours,Attendance,Previous_Score,Final_Percentage
0,8,83,52,73.867908
1,9,99,64,88.487144
2,6,66,86,67.409281
3,1,66,67,54.681634
4,6,82,63,71.951993
...,...,...,...,...
95,7,74,73,76.922856
96,9,87,75,87.813489
97,4,63,69,52.916893
98,6,91,87,78.468441


In [7]:
# Step 5: Split Data into Training & Testing Sets

# Inputs are the three raw columns only (the engineered columns are not
# selected here); the target is Final_Percentage.
feature_columns = ['Study_Hours', 'Attendance', 'Previous_Score']
X = df[feature_columns]
y = df['Final_Percentage']

# Hold out 20% of the rows for testing. A fixed random_state makes the
# row assignment repeatable from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Confirm the sizes of the two partitions
print(f"Training Data Shape: {X_train.shape}")
print(f"Testing Data Shape: {X_test.shape}")

Training Data Shape: (80, 3)
Testing Data Shape: (20, 3)


In [8]:
# Step 6: Data Transformation (Scaling)
# StandardScaler is imported in the notebook's import cell.

# The scaler is fitted on the training split only and then reused, unchanged,
# on the test split, so nothing about the test rows influences preprocessing
# (avoids data leakage).
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)  # Fit + Transform
X_test_scaled = scaler.transform(X_test)        # Only Transform



In [10]:
# Side-by-side look at the first three training rows before and after scaling,
# followed by the per-column mean and standard deviation of the scaled array.
scaled_column_means = np.round(X_train_scaled.mean(axis=0), 2)
scaled_column_stds = np.round(X_train_scaled.std(axis=0), 2)

print("\nFirst 3 rows of Original Training Data:")
print(X_train.iloc[:3])
print("*" * 40)
print("\nFirst 3 rows of Scaled Training Data:")
print(X_train_scaled[:3])
print("\nMean and Std Dev of Scaled Features (should be ~0 and 1):")
print(f"Mean: {scaled_column_means}")
print(f"Std Dev: {scaled_column_stds}")


First 3 rows of Original Training Data:
    Study_Hours  Attendance  Previous_Score
55            1          61              43
88            1          69              53
26            8          88              89
****************************************

First 3 rows of Scaled Training Data:
[[-1.50782711 -1.69135643 -1.3244943 ]
 [-1.50782711 -0.99027605 -0.6525885 ]
 [ 1.19852924  0.67478987  1.76627236]]

Mean and Std Dev of Scaled Features (should be ~0 and 1):
Mean: [-0.  0.  0.]
Std Dev: [1. 1. 1.]


In [12]:
# Step 7: Instantiate the Linear Regression Model
# Creates an unfitted estimator with its default settings; training happens in the next step.
model = LinearRegression()

In [13]:
# Step 8: Train the Model on Scaled Training Data
# Uses the standardised training features together with the unscaled training targets.
model.fit(X_train_scaled, y_train)

In [14]:
# Step 9: Make Predictions on Scaled Test Data
# The test features were transformed with the scaler fitted on the training split,
# so they are on the same scale the model was trained on.
y_pred = model.predict(X_test_scaled)  # Predict using scaled test features



In [15]:
# Step 10: Evaluate Model Performance
# The metric functions are imported in the notebook's import cell.
#
# Compare the held-out targets (y_test) with the predictions (y_pred):
#   mse  - mean squared error (in squared target units)
#   rmse - square root of mse, back in the target's own units
#   mae  - mean absolute error
#   r2   - coefficient of determination
mse = mean_squared_error(y_test, y_pred)
rmse = np.sqrt(mse)
mae = mean_absolute_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)



In [16]:
# Print results.
# RMSE and MAE carry a "%" suffix because the target itself is a percentage.
# R-squared is a goodness-of-fit measure for regression, not a classification
# accuracy, so it is labelled accordingly.
print("\nModel Evaluation Metrics:")
print(f"- Mean Squared Error (MSE)      : {mse:.2f}")
print(f"- Root Mean Squared Error (RMSE): {rmse:.2f}%")
print(f"- Mean Absolute Error (MAE)     : {mae:.2f}%")
print(f"- R-squared (Goodness of fit)   : {r2:.2f}")




Model Evaluation Metrics:
- Mean Squared Error (MSE)      : 6.70
- Root Mean Squared Error (RMSE): 2.59%
- Mean Absolute Error (MAE)     : 2.05%
- R-squared (Accuracy)          : 0.91


In [17]:
# Example: Predict for a new student's data (Study_Hours=8, Attendance=90, Previous_Score=85)
#
# The scaler was fitted on a DataFrame, so the new row is wrapped in a
# DataFrame with the same column names and order. Passing a bare nested list
# makes scikit-learn warn that the input has no valid feature names.
new_student_data = pd.DataFrame(
    [[8, 90, 85]],
    columns=['Study_Hours', 'Attendance', 'Previous_Score'],
)

# Scale the new data using SAME scaler used for training
new_student_scaled = scaler.transform(new_student_data)

# Predict using trained model
predicted_percentage = model.predict(new_student_scaled)[0]  # [0] to get single value

# Print result
print(f"\nPredicted Final Percentage: {predicted_percentage:.2f}%")


Predicted Final Percentage: 86.49%




In [18]:
# Step 11: Deploy Model (Save to Files)
# pickle is imported in the notebook's import cell, so it is not re-imported here.
# NOTE: pickle files can execute arbitrary code when loaded - only unpickle
# files that come from a trusted source.

# Save the model
with open('student_percentage_predictor.pkl', 'wb') as model_file:
    pickle.dump(model, model_file)

# Save the scaler (critical for new data preprocessing): new inputs must be
# transformed with this same fitted scaler before being passed to the model.
with open('standard_scaler.pkl', 'wb') as scaler_file:
    pickle.dump(scaler, scaler_file)

print("Model and scaler saved successfully!")

Model and scaler saved successfully!
