In [None]:
import pandas as pd
import numpy as np
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder

In [None]:
# Read the merged dataset into a DataFrame.
# NOTE(review): the location is an absolute, hard-coded path — confirm it
# exists in the environment where this notebook is executed.
dataset_path = '/content/merged_dataset.csv'
df = pd.read_csv(dataset_path)

In [None]:
# Summarise the frame: row count, column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14003 entries, 0 to 14002
Data columns (total 16 columns):
 #   Column                Non-Null Count  Dtype
---  ------                --------------  -----
 0   StudyHours            14003 non-null  int64
 1   Attendance            14003 non-null  int64
 2   Resources             14003 non-null  int64
 3   Extracurricular       14003 non-null  int64
 4   Motivation            14003 non-null  int64
 5   Internet              14003 non-null  int64
 6   Gender                14003 non-null  int64
 7   Age                   14003 non-null  int64
 8   LearningStyle         14003 non-null  int64
 9   OnlineCourses         14003 non-null  int64
 10  Discussions           14003 non-null  int64
 11  AssignmentCompletion  14003 non-null  int64
 12  ExamScore             14003 non-null  int64
 13  EduTech               14003 non-null  int64
 14  StressLevel           14003 non-null  int64
 15  FinalGrade            14003 non-null  int64
dtypes: i

In [None]:
# Preview the first rows to sanity-check the loaded values.
df.head()

Unnamed: 0,StudyHours,Attendance,Resources,Extracurricular,Motivation,Internet,Gender,Age,LearningStyle,OnlineCourses,Discussions,AssignmentCompletion,ExamScore,EduTech,StressLevel,FinalGrade
0,19,64,1,0,0,1,0,19,2,8,1,59,40,0,1,3
1,19,64,1,0,0,1,0,23,3,16,0,90,66,0,1,2
2,19,64,1,0,0,1,0,28,1,19,0,67,99,1,1,0
3,19,64,1,1,0,1,0,19,2,8,1,59,40,0,1,3
4,19,64,1,1,0,1,0,23,3,16,0,90,66,0,1,2


In [None]:
# Count rows that are exact duplicates of an earlier row.
# NOTE(review): the recorded output reports 1534 duplicated rows and no visible
# cell removes them — confirm whether keeping them is intentional.
df.duplicated().sum()

np.int64(1534)

In [None]:
# Count missing values per column.
df.isna().sum()

Unnamed: 0,0
StudyHours,0
Attendance,0
Resources,0
Extracurricular,0
Motivation,0
Internet,0
Gender,0
Age,0
LearningStyle,0
OnlineCourses,0


**There are no null values**

In [None]:
# Separate the prediction target (FinalGrade) from the predictor columns.
target_column = 'FinalGrade'
y = df[target_column]
X = df.drop(columns=[target_column])

In [None]:
# List the dtype of every column.
df.dtypes

Unnamed: 0,0
StudyHours,int64
Attendance,int64
Resources,int64
Extracurricular,int64
Motivation,int64
Internet,int64
Gender,int64
Age,int64
LearningStyle,int64
OnlineCourses,int64


**Convert numeric values stored with dtype `object` to a numeric dtype**

In [None]:
# Try to convert every object-dtype predictor column to a numeric dtype.
# Columns that cannot be parsed are left unchanged and reported, instead of
# being skipped silently.
for col in X.columns:
    if X[col].dtype == 'object':
        try:
            X[col] = pd.to_numeric(X[col])
        except (ValueError, TypeError):
            # Only conversion failures are expected here; a bare `except`
            # would also hide unrelated errors and KeyboardInterrupt.
            print(f"{col} kept as object (could not be parsed as numeric)")
        else:
            print(f"{col} converted to numeric")

In [None]:
# Build the initial feature lists from column dtypes: int64/float64 columns
# are numerical, object columns are categorical.
numeric_dtypes = ['int64', 'float64']
numerical_features = list(X.select_dtypes(include=numeric_dtypes).columns)
categorical_features = list(X.select_dtypes(include=['object']).columns)

**Treat numerical columns with <= 10 unique values as categorical**

In [None]:
# Numeric columns with at most 10 distinct values are treated as categorical:
# collect them first, then move them from one list to the other in place.
low_cardinality = [col for col in numerical_features if X[col].nunique() <= 10]
numerical_features[:] = [
    col for col in numerical_features if col not in low_cardinality
]
categorical_features.extend(low_cardinality)


In [None]:
# Show how many columns ended up in each group, and which ones.
print(f"Initial numerical features ({len(numerical_features)}): {numerical_features}")
print(f"Initial categorical features ({len(categorical_features)}): {categorical_features}")

Initial numerical features (6): ['StudyHours', 'Attendance', 'Age', 'OnlineCourses', 'AssignmentCompletion', 'ExamScore']
Initial categorical features (9): ['Resources', 'Extracurricular', 'Motivation', 'Internet', 'Gender', 'LearningStyle', 'Discussions', 'EduTech', 'StressLevel']


**To fill missing values**

In [None]:
# Replace missing numerical values with each column's median.
num_imputer = SimpleImputer(strategy='median')
numerical_filled = num_imputer.fit_transform(X[numerical_features])
X[numerical_features] = numerical_filled

In [None]:
# Replace missing categorical values with each column's most frequent value.
cat_imputer = SimpleImputer(strategy='most_frequent')
categorical_filled = cat_imputer.fit_transform(X[categorical_features])
X[categorical_features] = categorical_filled

**Outlier Detection**

In [None]:
# Collect the index labels of rows that fall outside the 1.5 * IQR fences
# in at least one numerical column.
outliers_index = set()
for col in numerical_features:
    values = X[col]
    Q1, Q3 = values.quantile(0.25), values.quantile(0.75)
    IQR = Q3 - Q1
    lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR
    # A row is an outlier for this column when it lies strictly outside the fences.
    is_outlier = values.lt(lower_bound) | values.gt(upper_bound)
    outliers = values[is_outlier].index
    outliers_index.update(outliers)
print(f"Total outliers detected: {len(outliers_index)}")

Total outliers detected: 45


**Drop outliers**

In [None]:
# Remove the flagged rows from both the features and the target so the two
# stay aligned row for row.
rows_to_remove = list(outliers_index)
X = X.drop(index=rows_to_remove)
y = y.drop(index=rows_to_remove)

**Scale the numerical features so that columns with larger ranges do not bias the model**

In [None]:
# Standardise the numerical columns with StandardScaler.
# The resulting frame is built without an explicit index, so it gets a fresh
# 0..n-1 index rather than the labels of X.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(X[numerical_features])
X_scaled = pd.DataFrame(scaled_values, columns=numerical_features)

**OneHot encoding for Categorical**

In [None]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encode the categorical columns, dropping the first level of each.
# sparse_output=False is used instead of the `sparse` argument, so the encoder
# returns a dense array that can be wrapped in a DataFrame directly.
encoder = OneHotEncoder(drop='first', sparse_output=False)
encoded_values = encoder.fit_transform(X[categorical_features])
encoded_columns = encoder.get_feature_names_out(categorical_features)
X_encoded = pd.DataFrame(encoded_values, columns=encoded_columns)


In [None]:
# Place the scaled numerical block and the one-hot block side by side.
processed_parts = [X_scaled, X_encoded]
X_processed = pd.concat(processed_parts, axis='columns')

print(f"\nProcessed dataset shape: {X_processed.shape}")
print(X_processed.head())


Processed dataset shape: (13958, 20)
   StudyHours  Attendance       Age  OnlineCourses  AssignmentCompletion  \
0   -0.159817   -1.410246 -1.289887      -0.309126             -1.059344   
1   -0.159817   -1.410246 -0.151585       0.999623              1.059540   
2   -0.159817   -1.410246  1.271293       1.490405             -0.512535   
3   -0.159817   -1.410246 -1.289887      -0.309126             -1.059344   
4   -0.159817   -1.410246 -0.151585       0.999623              1.059540   

   ExamScore  Resources_1  Resources_2  Extracurricular_1  Motivation_1  \
0  -1.715925          1.0          0.0                0.0           0.0   
1  -0.245699          1.0          0.0                0.0           0.0   
2   1.620357          1.0          0.0                0.0           0.0   
3  -1.715925          1.0          0.0                1.0           0.0   
4  -0.245699          1.0          0.0                1.0           0.0   

   Motivation_2  Internet_1  Gender_1  LearningStyle_1

In [None]:

# Pairwise correlations between processed features; the sign is discarded
# because only the strength of the relationship matters here.
signed_correlations = X_processed.corr()
correlation_matrix = signed_correlations.abs()


In [None]:
# Flag a feature when its absolute correlation with any earlier column
# exceeds 0.8 (only the lower triangle of the matrix is inspected).
high_corr = set()

feature_names = correlation_matrix.columns
for position, feature in enumerate(feature_names):
    earlier_correlations = correlation_matrix.iloc[position, :position]
    if (earlier_correlations > 0.8).any():
        high_corr.add(feature)

print(f"Highly correlated features (>0.8) to consider removing: {high_corr}")


Highly correlated features (>0.8) to consider removing: set()


**Great: no pair of columns has an absolute correlation above 0.8, so there is nothing to drop. I ran this check because multicollinearity affects linear regression and logistic regression.**

In [None]:
# Drop the flagged columns (none when high_corr is empty) and report the
# shape before and after.
columns_to_drop = list(high_corr)
X_selected = X_processed.drop(columns=columns_to_drop)
print(f"Shape before: {X_processed.shape}, after removing correlated: {X_selected.shape}")


Shape before: (13958, 20), after removing correlated: (13958, 20)


**Our data is ready to be fed into the model, but first we split it: 80% for training and 20% for testing**

In [None]:
from sklearn.model_selection import train_test_split

# Split the preprocessed feature matrix (scaled numerics + one-hot encoded
# categoricals, after the correlation filter), not the raw X. Splitting X
# here would discard all of the scaling and encoding done above.
# X_selected has a fresh 0..n-1 index while y keeps its original labels;
# train_test_split pairs rows by position, so they only need equal length.
assert len(X_selected) == len(y), "features and target must have the same number of rows"

X_train, X_test, y_train, y_test = train_test_split(
    X_selected, y, test_size=0.2, random_state=42
)


**Linear Regression Model**

**Build and Train Linear Regression Model**

In [None]:
from sklearn.linear_model import LinearRegression

# Create the linear regression model and fit it on the training split.
lr_model = LinearRegression()
lr_model.fit(X=X_train, y=y_train)


In [None]:
# Predict the target for the held-out test rows (continuous outputs).
y_pred = lr_model.predict(X=X_test)


**Model Evaluation**

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Score the test-set predictions with each regression metric in turn.
mse, mae, r2 = (
    metric(y_test, y_pred)
    for metric in (mean_squared_error, mean_absolute_error, r2_score)
)

print(f"Linear Regression Performance:")
print(f"  Mean Squared Error (MSE): {mse:.3f}")
print(f"  Mean Absolute Error (MAE): {mae:.3f}")
print(f"  R^2 Score: {r2:.3f}")


Linear Regression Performance:
  Mean Squared Error (MSE): 0.078
  Mean Absolute Error (MAE): 0.238
  R^2 Score: 0.938


**Comparison between Training Performance and Testing Performance**

In [None]:
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

# Training performance: score the model on the rows it was fitted on, for
# comparison with the test metrics.
y_train_pred = lr_model.predict(X_train)
train_metrics = (
    ("Train MSE:", mean_squared_error),
    ("Train MAE:", mean_absolute_error),
    ("Train R²:", r2_score),
)
for label, metric in train_metrics:
    print(label, metric(y_train, y_train_pred))

Train MSE: 0.077821531547016
Train MAE: 0.2388647991628794
Train R²: 0.9380701138219366


**Test performance**

In [None]:
# Test performance: the same three metrics on the held-out rows.
y_test_pred = lr_model.predict(X_test)
test_metrics = (
    ("Test MSE:", mean_squared_error),
    ("Test MAE:", mean_absolute_error),
    ("Test R²:", r2_score),
)
for label, metric in test_metrics:
    print(label, metric(y_test, y_test_pred))

Test MSE: 0.0783123348154355
Test MAE: 0.23814309180693244
Test R²: 0.9378350960782145


In [None]:
from sklearn.metrics import classification_report

# y_pred holds continuous regression outputs while y_test holds discrete
# class labels, so classification_report rejects the pair with a ValueError.
# The error is the point of this cell: catch and display it so the notebook
# still runs from top to bottom instead of stopping here.
try:
    report = classification_report(y_test, y_pred)
    print(report)
except ValueError as err:
    print(f"ValueError: {err}")


ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

**We cannot build a classification report directly from linear regression output because the predictions are continuous, so we first convert them into class labels using a 0.5 cut-off**

In [None]:

from sklearn.metrics import classification_report

# The regression output is continuous, while the target takes the integer
# classes 0-3. A single `>= 0.5` cut-off can only ever produce labels 0 and 1,
# so classes 2 and 3 would never be predicted. Round each prediction to the
# nearest integer and clip it to the range of labels present in y, so every
# class can be produced.
y_pred_class = np.clip(np.rint(y_pred), y.min(), y.max()).astype(int)
print(classification_report(y_test, y_pred_class))


              precision    recall  f1-score   support

           0       1.00      0.92      0.96       753
           1       0.30      1.00      0.46       636
           2       0.00      0.00      0.00       745
           3       0.00      0.00      0.00       658

    accuracy                           0.47      2792
   macro avg       0.33      0.48      0.36      2792
weighted avg       0.34      0.47      0.36      2792



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
