In [None]:
import pandas as pd


# Load the semicolon-delimited source file into a DataFrame.
data = pd.read_csv("/content/student-por.csv", delimiter=";")

# Show every column instead of letting pandas elide the middle of a wide frame.
pd.set_option('display.max_columns', None)

# Re-save the data as a regular comma-separated file without the index column.
data.to_csv("/content/student-por_output.csv", index=False)

# Preview the first 10 rows. This has to be the last expression of the cell:
# a bare expression followed by another statement is evaluated and discarded,
# so nothing would be displayed.
data.head(10)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Columns that hold string categories and must be turned into integer codes
categorical_cols = [
    'school', 'sex', 'address', 'famsize', 'Pstatus', 'Mjob', 'Fjob', 'reason',
    'guardian', 'schoolsup', 'famsup', 'paid', 'activities', 'nursery', 'higher',
    'internet', 'romantic'
]

# One fitted encoder per column. Refitting a single shared encoder would
# overwrite its learned classes on every iteration, leaving no way to map the
# integer codes of earlier columns back to their original labels.
label_encoders = {}

# Encode categorical columns
for col in categorical_cols:
    # `le` is still bound to the last column's encoder after the loop, exactly
    # as it was when a single shared instance was refit for each column.
    le = LabelEncoder()
    data[col] = le.fit_transform(data[col])
    label_encoders[col] = le

# NOTE(review): integer codes impose an arbitrary order on columns whose
# categories may have no natural ranking, which affects any correlation
# computed on them later — confirm this is acceptable for the analysis.

# Preview a handful of rows only; rendering hundreds of rows bloats the notebook.
data.head()


Unnamed: 0,school,sex,age,address,famsize,Pstatus,Medu,Fedu,Mjob,Fjob,reason,guardian,traveltime,studytime,failures,schoolsup,famsup,paid,activities,nursery,higher,internet,romantic,famrel,freetime,goout,Dalc,Walc,health,absences,G1,G2,G3
0,0,0,18,1,0,0,4,4,0,4,0,1,2,2,0,1,0,0,0,1,1,0,0,4,3,4,1,1,3,4,0,11,11
1,0,0,17,1,0,1,1,1,0,2,0,0,1,2,0,0,1,0,0,0,1,1,0,5,3,3,1,1,3,2,9,11,11
2,0,0,15,1,1,1,1,1,0,2,2,1,1,2,0,1,0,0,0,1,1,1,0,4,3,2,2,3,3,6,12,13,12
3,0,0,15,1,0,1,4,2,1,3,1,1,1,3,0,0,1,0,1,1,1,1,1,3,2,2,1,1,5,0,14,14,14
4,0,0,16,1,0,1,3,3,2,2,1,0,1,2,0,0,1,0,0,1,1,0,0,4,3,2,1,2,5,0,11,13,13
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
495,1,0,16,1,0,1,3,3,2,2,3,1,1,1,0,0,0,0,1,1,0,1,1,4,5,4,1,1,4,0,14,13,13
496,1,0,16,0,1,1,1,1,3,3,1,1,1,1,0,0,1,0,1,1,1,1,1,4,4,4,2,2,4,2,14,14,14
497,1,1,17,1,0,1,3,3,3,0,0,1,2,4,1,0,1,1,1,1,1,0,0,5,4,5,3,4,5,0,10,11,10
498,1,0,16,1,0,1,2,1,2,3,0,1,1,2,0,0,1,0,1,1,1,1,0,5,3,3,1,1,1,0,14,13,14


In [None]:
# Flag every row that repeats an earlier, fully identical row ...
is_repeated_row = data.duplicated()

# ... and keep only the rows that were not flagged (first occurrences survive).
data = data[~is_repeated_row]


In [None]:
# Pairwise correlation between every pair of columns in the dataset
correlation_matrix = data.corr()

# Pull out the G3 column of the matrix: one coefficient per dataset column
corr_with_g3 = correlation_matrix['G3']
print(corr_with_g3)

school       -0.284294
sex          -0.129077
age          -0.106505
address       0.167637
famsize       0.045016
Pstatus      -0.000754
Medu          0.240151
Fedu          0.211800
Mjob          0.148252
Fjob          0.052953
reason        0.124969
guardian     -0.079609
traveltime   -0.127173
studytime     0.249789
failures     -0.393316
schoolsup    -0.066405
famsup        0.059206
paid         -0.054898
activities    0.059791
nursery       0.028752
higher        0.332172
internet      0.150025
romantic     -0.090583
famrel        0.063361
freetime     -0.122705
goout        -0.087641
Dalc         -0.204719
Walc         -0.176619
health       -0.098851
absences     -0.091379
G1            0.826387
G2            0.918548
G3            1.000000
Name: G3, dtype: float64


In [None]:
# Minimum absolute correlation with G3 a column must exceed to be kept as a
# feature. Named here so the cut-off can be tuned in one place.
CORRELATION_THRESHOLD = 0.5

# Correlation of every column with G3, excluding G3 itself (it is the target,
# and its self-correlation of 1.0 would otherwise always pass the filter).
feature_correlations = correlation_matrix['G3'].drop('G3')

# Vectorised filter on the absolute coefficient; the original column order is
# preserved because boolean indexing keeps the index order.
selected_features = feature_correlations[
    feature_correlations.abs() > CORRELATION_THRESHOLD
].index.tolist()

print("Selected features based on correlation with G3:")
print(selected_features)

# Fail here with a clear message rather than later inside model fitting,
# where an empty feature matrix produces a much less obvious error.
assert selected_features, (
    "No column exceeds CORRELATION_THRESHOLD; lower the threshold to select features"
)

# NOTE(review): the correlations were computed on the full dataset, i.e. before
# any train/test split, so rows later used for testing influence which features
# are chosen — confirm this is acceptable.

# Now, use only the selected features for training
X = data[selected_features]
y = data['G3']

Selected features based on correlation with G3:
['G1', 'G2']


In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle, and therefore the split, repeatable between runs.
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report how many rows ended up in each partition
print(f"Training dataset size: {X_train.shape[0]} samples")
print(f"Test dataset size: {X_test.shape[0]} samples")

Training dataset size: 519 samples
Test dataset size: 130 samples


In [None]:
from sklearn.linear_model import LinearRegression

# Linear regression estimator created with its default settings
model = LinearRegression()

# Fit on the training partition only; left as the cell's final expression so
# the notebook still displays the fitted estimator.
model.fit(X=X_train, y=y_train)




In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import numpy as np

def regression_metrics(y_true, y_pred):
    """Compute the three error metrics reported for a set of predictions.

    Parameters:
        y_true: observed target values.
        y_pred: values predicted by the model for the same rows.

    Returns:
        A ``(mae, rmse, r2)`` tuple: mean absolute error, root mean squared
        error (square root of the mean squared error) and the R² score.
    """
    mae = mean_absolute_error(y_true, y_pred)
    rmse = np.sqrt(mean_squared_error(y_true, y_pred))
    r2 = r2_score(y_true, y_pred)
    return mae, rmse, r2


# Predictions on the rows the model was fitted on and on the held-out rows
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Same metrics for both partitions, computed by one helper so the two
# evaluations cannot drift apart.
train_mae, train_rmse, train_r2 = regression_metrics(y_train, y_train_pred)
test_mae, test_rmse, test_r2 = regression_metrics(y_test, y_test_pred)

# R² is a fraction; it is multiplied by 100 to be shown as a percentage.
print(f"Linear Regression - Training MAE: {train_mae:.2f}, Training RMSE: {train_rmse:.2f}, Training R²: {train_r2 * 100:.2f}%")
print(f"Linear Regression - Testing MAE: {test_mae:.2f}, Testing RMSE: {test_rmse:.2f}, Testing R²: {test_r2 * 100:.2f}%")


Linear Regression - Training MAE: 0.81, Training RMSE: 1.28, Training R²: 84.34%
Linear Regression - Testing MAE: 0.73, Testing RMSE: 1.17, Testing R²: 85.99%


In [None]:
import joblib


# Single definition of the artifact name so the saved file and the downloaded
# file can never get out of sync.
MODEL_FILENAME = "linear_regression_model.pkl"

# Save the trained model
joblib.dump(model, MODEL_FILENAME)

# Offer the file as a browser download when running on Google Colab. The
# import is guarded because the google.colab package is not importable in
# other environments, and the model is already safely written to disk.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; model saved locally as {MODEL_FILENAME}")
else:
    files.download(MODEL_FILENAME)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>