In [1]:
import pandas as pd 
import numpy as np 
import boto3
from sklearn.model_selection import train_test_split
import sagemaker
from sagemaker import session
import io
import sagemaker.amazon.common as smac
import os
from sagemaker.amazon.amazon_estimator import get_image_uri

sagemaker.config INFO - Not applying SDK defaults from location: /etc/xdg/sagemaker/config.yaml
sagemaker.config INFO - Not applying SDK defaults from location: /home/sagemaker-user/.config/sagemaker/config.yaml


In [2]:
df = pd.read_csv("student_performance.csv")   
df.head(5)

Unnamed: 0,student_id,weekly_self_study_hours,attendance_percentage,class_participation,total_score,grade
0,1,18.5,95.6,3.8,97.9,A
1,2,14.0,80.0,2.5,83.9,B
2,3,19.5,86.3,5.3,100.0,A
3,4,25.7,70.2,7.0,100.0,A
4,5,13.4,81.9,6.9,92.0,A


In [3]:
df.shape

(1000000, 6)

In [4]:
#assigning x and y values for the prediction
x = df[["weekly_self_study_hours"]]
y = df[["total_score"]]

In [5]:
x.dtypes

weekly_self_study_hours    float64
dtype: object

In [6]:
y.dtypes

total_score    float64
dtype: object

In [7]:
x=x.astype("float32")
y=y.astype("float32") 

In [8]:
x.dtypes

weekly_self_study_hours    float32
dtype: object

In [9]:
y.dtypes

total_score    float32
dtype: object

In [10]:
#split the dataset into training and testing
X_train,X_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [11]:
#ressetting index
X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True) 
X_test= X_test.reset_index(drop=True) 
y_test=y_test.reset_index(drop=True)

In [12]:
X_train

Unnamed: 0,weekly_self_study_hours
0,16.000000
1,17.299999
2,11.400000
3,10.500000
4,24.200001
...,...
799995,14.000000
799996,16.900000
799997,27.000000
799998,18.700001


In [13]:
y_train 

Unnamed: 0,total_score
0,100.000000
1,96.199997
2,89.199997
3,77.199997
4,100.000000
...,...
799995,82.199997
799996,100.000000
799997,100.000000
799998,100.000000


In [14]:
#vectorization of the label column
y_train=y_train.iloc[:,0]

In [15]:
y_train

0         100.000000
1          96.199997
2          89.199997
3          77.199997
4         100.000000
             ...    
799995     82.199997
799996    100.000000
799997    100.000000
799998    100.000000
799999     80.500000
Name: total_score, Length: 800000, dtype: float32

In [16]:
y_test=y_test.iloc[:,0]

In [17]:
y_test

0          91.800003
1          97.599998
2         100.000000
3         100.000000
4          67.099998
             ...    
199995     64.500000
199996     53.599998
199997     76.099998
199998     36.200001
199999    100.000000
Name: total_score, Length: 200000, dtype: float32

In [18]:
#creating sagemaker session
sagemaker_session=sagemaker.Session() 
bucket_name="studentperformancedemo"
prefix="linear-reg"
role=sagemaker.get_execution_role() 


In [19]:
X_train=np.array(X_train)

In [20]:
#creating the buffer
buf=io.BytesIO() 
smac.write_numpy_to_dense_tensor(buf,X_train,y_train)
buf.seek(0)

0

In [21]:
key="student-data"
boto3.resource('s3').Bucket(bucket_name).Object(os.path.join(prefix,'train',key)).upload_fileobj(buf)
s3_train_data=f"s3://{bucket_name}/{prefix}/train/{key}" 
print("Data uploaded to s3",s3_train_data)

Data uploaded to s3 s3://studentperformancedemo/linear-reg/train/student-data


In [22]:
#converting X train into numpy aray
X_test=np.array(X_test) #create the buffer
buf=io.BytesIO()
smac.write_numpy_to_dense_tensor(buf,X_test,y_test)
buf.seek(0)
#defining the name of the file
keys="student-data-test"

#code to upload in 13
boto3.resource('s3').Bucket(bucket_name).Object(os.path.join(prefix, 'test',key)).upload_fileobj(buf)

#path of our dato
s3_train_data=f"s3://studentperformancedemo/linear-reg/test/student-data"
print("Data uploaded",s3_train_data)

Data uploaded s3://studentperformancedemo/linear-reg/test/student-data


In [23]:
#getting the output location
output_location = f"s3://studentperformancedemo/linear-reg/output"
output_location

's3://studentperformancedemo/linear-reg/output'

In [24]:
container=sagemaker.image_uris.retrieve("linear-learner",boto3.Session().region_name)

In [25]:
linear = sagemaker.estimator.Estimator(container,role,instance_count=1,instance_type="ml.m5.2xlarge",output_path=output_location,sagemaker_session=sagemaker_session)

In [26]:
#setting the hyperparameter
linear.set_hyperparameters(feature_dim=1,predictor_type="regressor",mini_batch_size=4,epochs=8,num_models=32,loss="absolute_loss")

In [27]:
#fit the model
linear.fit({"train":s3_train_data})

INFO:sagemaker:Creating training-job with name: linear-learner-2025-11-17-05-51-30-509


2025-11-17 05:51:30 Starting - Starting the training job...
2025-11-17 05:51:54 Starting - Preparing the instances for training...
2025-11-17 05:52:28 Downloading - Downloading the training image.........
2025-11-17 05:53:42 Training - Training image download completed. Training in progress..[34mDocker entrypoint called with argument(s): train[0m
[34mRunning default environment configuration script[0m
[34m[11/17/2025 05:54:04 INFO 140563665524544] Reading default configuration from /opt/amazon/lib/python3.8/site-packages/algorithm/resources/default-input.json: {'mini_batch_size': '1000', 'epochs': '15', 'feature_dim': 'auto', 'use_bias': 'true', 'binary_classifier_model_selection_criteria': 'accuracy', 'f_beta': '1.0', 'target_recall': '0.8', 'target_precision': '0.8', 'num_models': 'auto', 'num_calibration_samples': '10000000', 'init_method': 'uniform', 'init_scale': '0.07', 'init_sigma': '0.01', 'init_bias': '0.0', 'optimizer': 'auto', 'loss': 'auto', 'margin': '1.0', 'quantile'

In [33]:
#deploying the model
linear_regresor=linear.deploy(initial_instance_count=1,instance_type="ml.m5.large")

INFO:sagemaker:Creating model with name: linear-learner-2025-11-17-07-05-35-520
INFO:sagemaker:Creating endpoint-config with name linear-learner-2025-11-17-07-05-35-520
INFO:sagemaker:Creating endpoint with name linear-learner-2025-11-17-07-05-35-520


-------!

In [36]:
linear_regresor.serializer=sagemaker.serializers.CSVSerializer()
linear_regresor.deserializer=sagemaker.deserializers.JSONDeserializer()

In [38]:
#predictions from the model
results=linear_regresor.predict(X_test)

In [39]:
results

{'predictions': [{'score': 86.03411102294922},
  {'score': 94.67731475830078},
  {'score': 86.39424133300781},
  {'score': 108.00225830078125},
  {'score': 65.3264389038086},
  {'score': 88.19491577148438},
  {'score': 76.49057006835938},
  {'score': 62.62543487548828},
  {'score': 78.65137481689453},
  {'score': 86.57431030273438},
  {'score': 73.429443359375},
  {'score': 81.35237884521484},
  {'score': 68.92777252197266},
  {'score': 84.41351318359375},
  {'score': 89.45538330078125},
  {'score': 108.36238861083984},
  {'score': 105.8414535522461},
  {'score': 87.11450958251953},
  {'score': 82.43277740478516},
  {'score': 83.15304565429688},
  {'score': 84.77364349365234},
  {'score': 89.27531433105469},
  {'score': 111.06338500976562},
  {'score': 101.15971374511719},
  {'score': 70.00817108154297},
  {'score': 89.27531433105469},
  {'score': 82.2527084350586},
  {'score': 82.97297668457031},
  {'score': 84.23344421386719},
  {'score': 87.65470886230469},
  {'score': 70.9085083007

In [40]:
predictions=np.array([i["score"] for i in results['predictions']])

In [41]:
predictions

array([ 86.03411102,  94.67731476,  86.39424133, ...,  76.13043976,
        57.76363373, 103.32051849])