In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Step 1: Load the dataset
# The CSV is read directly from a remote GitHub URL, so this cell needs
# network access on every fresh run.
url = "https://github.com/dsrscientist/dataset4/raw/main/Grades.csv"
grades_data = pd.read_csv(filepath_or_buffer=url)

In [None]:
# Step 2: Data analysis and preprocessing
print("Dataset Information:")
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() would append a stray "None" line.
grades_data.info()

In [None]:
# Step 3: Feature engineering and preprocessing
# Features: every column except the 'Seat No' and 'CGPA' columns.
# NOTE(review): confirm 'Seat No' matches the CSV header exactly (spacing,
# punctuation); drop() raises KeyError on a label that is not present.
X = grades_data.drop(columns=['Seat No', 'CGPA'])
# Target: the 'CGPA' column the model is trained to predict.
y = grades_data['CGPA']

In [None]:
# Extract year from course codes: capture the first run of two consecutive
# digits found in each 'AB-XXX' value.
# NOTE(review): 'AB-XXX' looks like a placeholder column name -- confirm it
# exists in the loaded CSV before relying on this cell.
# expand=False makes str.extract return a Series (one capture group) instead
# of a one-column DataFrame, which is the natural shape for a single column.
year_digits = X['AB-XXX'].str.extract(r'(\d{2})', expand=False)
# The captured digits are strings; convert them so the feature that is later
# passed through to the regressor is numeric. Values with no two-digit run
# stay missing (NaN).
X['Year'] = pd.to_numeric(year_digits)

In [None]:
# Step 4: Split the data into training and testing sets
# 20% of the rows are held out for testing; the fixed random_state keeps the
# partition identical across re-runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Step 5: Perform feature encoding for categorical variables
# Column(s) that get one-hot encoded; everything else is passed through as-is.
categorical_cols = ['AB-XXX']
# drop='first' omits the first level of each encoded column.
# NOTE(review): unknown-category handling is left at the encoder's default --
# confirm every category in the test split also appears in the training split.
one_hot_encoder = OneHotEncoder(drop='first')
column_transformer = ColumnTransformer(
    transformers=[('encoder', one_hot_encoder, categorical_cols)],
    remainder='passthrough',
)

In [None]:
# Fit the transformer on the training split only, then reuse the fitted
# transformer on the test split.
X_train_encoded = column_transformer.fit_transform(X=X_train)
X_test_encoded = column_transformer.transform(X=X_test)

In [None]:
# Step 6: Train a Linear Regression model
# Fit a linear model on the encoded training features against the CGPA target.
linear_regression = LinearRegression()
linear_regression.fit(X=X_train_encoded, y=y_train)

In [None]:
# Step 7: Make predictions on the test set
# Predictions come from the encoded held-out features.
y_pred = linear_regression.predict(X=X_test_encoded)

In [None]:
# Step 8: Evaluate the model's performance
# Compare held-out targets with the predictions using two regression metrics.
mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r2 = r2_score(y_true=y_test, y_pred=y_pred)

In [None]:
# Step 9: Print the results
# Report the mean squared error computed on the test split.
mse_label = "Mean Squared Error (MSE):"
print(mse_label, mse)

In [None]:
# Report the R-squared score computed on the test split.
r2_label = "R-squared (R2) Score:"
print(r2_label, r2)