In [1]:
import numpy as np
import pandas as pd
import torch
from torch.utils.data import Dataset, DataLoader, TensorDataset
import warnings
warnings.filterwarnings("ignore")

#### Data loading

In [2]:
# Load the student-performance CSV into a DataFrame.
# The path is written as a raw string so the backslashes of the Windows path
# are taken literally; in a normal string literal "\d" and "\S" are invalid
# escape sequences. The resulting path is unchanged.
# NOTE(review): this is an absolute, machine-specific path — consider a
# configurable data directory.
df = pd.read_csv(r"D:\data\Student_performance_data _.csv")

# Show five random rows as a quick sanity check of the loaded data.
df.sample(5)

Unnamed: 0,StudentID,Age,Gender,Ethnicity,ParentalEducation,StudyTimeWeekly,Absences,Tutoring,ParentalSupport,Extracurricular,Sports,Music,Volunteering,GPA,GradeClass
1075,2076,15,1,1,2,19.570119,13,0,1,0,0,0,0,2.096645,3.0
106,1107,16,0,3,1,2.80998,18,0,0,1,0,0,1,0.842296,4.0
1793,2794,17,1,1,1,1.335005,14,0,3,0,0,0,0,1.563592,4.0
515,1516,17,0,0,0,17.622103,8,1,2,0,0,0,0,2.647196,2.0
1611,2612,16,0,0,1,14.644758,26,0,3,0,1,0,0,0.958776,4.0


### About this file

#### Student ID: 
A unique identifier assigned to each student (1001 to 3392).
**Demographic Details**
#### Age: 
The age of the students ranges from 15 to 18 years.
#### Gender: 
Gender of the students, where 0 represents Male and 1 represents Female.
#### Ethnicity: 
The ethnicity of the students, coded as follows:

- 0: Caucasian
- 1: African American
- 2: Asian
- 3: Other

In [3]:
# Print a structural summary of the DataFrame: index range, column names,
# non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2392 entries, 0 to 2391
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   StudentID          2392 non-null   int64  
 1   Age                2392 non-null   int64  
 2   Gender             2392 non-null   int64  
 3   Ethnicity          2392 non-null   int64  
 4   ParentalEducation  2392 non-null   int64  
 5   StudyTimeWeekly    2392 non-null   float64
 6   Absences           2392 non-null   int64  
 7   Tutoring           2392 non-null   int64  
 8   ParentalSupport    2392 non-null   int64  
 9   Extracurricular    2392 non-null   int64  
 10  Sports             2392 non-null   int64  
 11  Music              2392 non-null   int64  
 12  Volunteering       2392 non-null   int64  
 13  GPA                2392 non-null   float64
 14  GradeClass         2392 non-null   float64
dtypes: float64(3), int64(12)
memory usage: 280.4 KB


In [4]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for each
# column; transposed so that every column of df is shown as one row.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
StudentID,2392.0,2196.5,690.655244,1001.0,1598.75,2196.5,2794.25,3392.0
Age,2392.0,16.468645,1.123798,15.0,15.0,16.0,17.0,18.0
Gender,2392.0,0.51087,0.499986,0.0,0.0,1.0,1.0,1.0
Ethnicity,2392.0,0.877508,1.028476,0.0,0.0,0.0,2.0,3.0
ParentalEducation,2392.0,1.746237,1.000411,0.0,1.0,2.0,2.0,4.0
StudyTimeWeekly,2392.0,9.771992,5.652774,0.001057,5.043079,9.705363,14.40841,19.978094
Absences,2392.0,14.541388,8.467417,0.0,7.0,15.0,22.0,29.0
Tutoring,2392.0,0.301421,0.458971,0.0,0.0,0.0,1.0,1.0
ParentalSupport,2392.0,2.122074,1.122813,0.0,1.0,2.0,3.0,4.0
Extracurricular,2392.0,0.383361,0.486307,0.0,0.0,0.0,1.0,1.0


In [5]:
# Separate the target from the features and export both as NumPy arrays.
# 'Ethnicity' is used as the label; 'StudentID' is excluded from the features.
y = df['Ethnicity'].to_numpy()
X = df.drop(columns=['Ethnicity', 'StudentID']).to_numpy()

# Pair features and labels (both as float32 tensors) in a TensorDataset so
# that indexing the dataset yields one (features, label) tuple per row.
dataset = TensorDataset(
    torch.tensor(X, dtype=torch.float32),
    torch.tensor(y, dtype=torch.float32),
)

In [6]:
# Wrap the dataset in a DataLoader that yields shuffled mini-batches of 16.
# A failure is reported rather than raised; in that case `data_loader` is
# left undefined and the cells that use it cannot run.
try:
    data_loader = DataLoader(dataset, batch_size=16, shuffle=True)
except Exception as error:
    # Print the exception type and message. `error.with_traceback` is a bound
    # method, so printing it showed the method object instead of the error.
    print(f"{type(error).__name__}: {error}")
else:
    print('Dataloader worked.')

Dataloader worked.


In [7]:
# Inspect the first batch produced by the DataLoader.

print("The batch size is : {}".format(data_loader.batch_size))
print("-" * 100)

# A DataLoader is an iterable, so batches are obtained by looping over it.
for batch in data_loader:
    X_batch, y_batch = batch

    # The batch elements are already torch tensors (the dataset is a
    # TensorDataset), so cast them with .to() rather than re-wrapping them in
    # torch.tensor(), which copy-constructs from a tensor and emits a
    # UserWarning. The float16 cast is kept from the original code.
    X_batch = X_batch.to(torch.float16)
    y_batch = y_batch.to(torch.float16)

    print(X_batch, '\n' * 3,"-" * 100)
    print(y_batch, '\n' * 3,"-" * 100)
    print(X_batch.size(), y_batch.size())
    break  # only the first batch is shown

The batch size is : 16
----------------------------------------------------------------------------------------------------
tensor([[17.0000,  1.0000,  1.0000,  8.8281, 10.0000,  1.0000,  1.0000,  0.0000,
          1.0000,  1.0000,  0.0000,  2.5957,  2.0000],
        [15.0000,  0.0000,  3.0000,  8.0703, 29.0000,  1.0000,  3.0000,  1.0000,
          1.0000,  0.0000,  0.0000,  1.1650,  4.0000],
        [18.0000,  1.0000,  2.0000, 16.0938, 24.0000,  1.0000,  3.0000,  0.0000,
          0.0000,  1.0000,  0.0000,  1.1055,  4.0000],
        [18.0000,  1.0000,  2.0000,  9.7578,  6.0000,  1.0000,  4.0000,  0.0000,
          1.0000,  0.0000,  0.0000,  3.2070,  1.0000],
        [16.0000,  0.0000,  1.0000, 14.1875, 10.0000,  0.0000,  4.0000,  1.0000,
          1.0000,  1.0000,  0.0000,  3.1387,  1.0000],
        [18.0000,  0.0000,  4.0000, 13.1875, 24.0000,  1.0000,  2.0000,  0.0000,
          1.0000,  1.0000,  0.0000,  1.5586,  4.0000],
        [18.0000,  0.0000,  1.0000,  6.1836, 12.0000,  0.000

##### DataLoader:
A `DataLoader` splits the dataset into batches of a fixed size (and can shuffle them).
During training, the data is passed to the model one batch at a time, and the weights and biases are updated after each batch.

In [8]:
# Pull a single batch from the loader and report the shapes of its
# feature tensor and label tensor.
batch_iterator = iter(data_loader)
data, target = next(batch_iterator)
print(data.shape, target.shape)

torch.Size([16, 13]) torch.Size([16])


In [9]:
# Display one batch as a [features, labels] pair.
# next() and iter() are built-in functions that call the __next__() and
# __iter__() methods. Each iter(data_loader) call starts a new pass over the
# data, and the loader was created with shuffle=True, so this batch is not
# necessarily the same one shown in earlier cells.
next(iter(data_loader))


[tensor([[15.0000,  0.0000,  3.0000,  0.3776, 23.0000,  0.0000,  2.0000,  0.0000,
           0.0000,  0.0000,  1.0000,  0.3699,  4.0000],
         [18.0000,  1.0000,  0.0000, 12.4510, 15.0000,  0.0000,  1.0000,  0.0000,
           1.0000,  0.0000,  0.0000,  1.4642,  4.0000],
         [15.0000,  1.0000,  3.0000, 15.3954, 11.0000,  0.0000,  1.0000,  1.0000,
           0.0000,  0.0000,  0.0000,  2.1627,  3.0000],
         [15.0000,  1.0000,  3.0000, 12.0876, 24.0000,  0.0000,  3.0000,  0.0000,
           1.0000,  0.0000,  0.0000,  1.1697,  4.0000],
         [17.0000,  0.0000,  1.0000, 18.7822, 19.0000,  0.0000,  2.0000,  0.0000,
           0.0000,  1.0000,  0.0000,  1.8233,  1.0000],
         [15.0000,  0.0000,  1.0000,  6.1926,  7.0000,  0.0000,  2.0000,  1.0000,
           0.0000,  0.0000,  0.0000,  2.1274,  3.0000],
         [18.0000,  1.0000,  1.0000,  7.5559, 22.0000,  1.0000,  0.0000,  0.0000,
           0.0000,  0.0000,  0.0000,  0.9690,  4.0000],
         [18.0000,  1.0000,  2.000