# High school student data related to the decision to go to college
The data set is from Kaggle: https://www.kaggle.com/datasets/saddamazyazy/go-to-college-dataset
### imports
-here we import torch for gradients and networks<br>
-pandas for data processing<br>
-scikit-learn for its useful functions<br>
-shuffle to shuffle the data 


In [41]:
import numpy as np
import torch
import pandas as pd
from sklearn.preprocessing import StandardScaler as ss
from sklearn.utils import shuffle

### Reading the csv file
-next we drop the feature that is not relevant to the output (`type_school`); the label itself is popped out in a later step <br> -next we convert the categorical fields into 1s and 0s (one-hot encoding) <br>-next we shuffle the data using the shuffle function<br> -finally we print the first 5 rows 

In [42]:
# Load the raw table; `data` keeps the untouched copy.
data = pd.read_csv("./data.csv")

# Remove the unused `type_school` column, one-hot encode the categorical
# columns, then randomise the row order. No random_state is passed to
# shuffle, so the order differs from run to run.
features = data.drop(columns=["type_school"])
features = shuffle(pd.get_dummies(features))

# Last expression of the cell: displays the first five rows.
features.head()

Unnamed: 0,parent_age,parent_salary,house_area,average_grades,parent_was_in_college,will_go_to_college,school_accreditation_A,school_accreditation_B,gender_Female,gender_Male,interest_Interested,interest_Less Interested,interest_Not Interested,interest_Uncertain,interest_Very Interested,residence_Rural,residence_Urban
112,52,8160000,75.9,93.55,True,True,0,1,0,1,0,1,0,0,0,1,0
505,56,4160000,59.6,85.83,False,False,0,1,0,1,1,0,0,0,0,0,1
96,48,5740000,65.2,83.28,False,False,1,0,0,1,0,0,0,1,0,1,0
186,55,3660000,77.5,78.59,False,True,1,0,0,1,0,0,1,0,0,0,1
22,51,3900000,57.9,83.18,False,False,1,0,1,0,0,0,0,0,1,0,1


#### finally popping out the label

In [43]:
# Split off the target: read the column, then delete it from `features`
# in place so that only the input columns remain.
labels = features["will_go_to_college"]
del features["will_go_to_college"]


### conversions
-converting the dataframes into numpy arrays

In [44]:
# Convert the pandas objects to NumPy arrays. The labels get a trailing
# axis so they form a column of shape (n_samples, 1).
features = np.array(features)
labels = np.array(labels)[:, np.newaxis]



* in the next line we normalize (standardize) the features so the training process goes faster, and print the first row of the features 

In [45]:
# Standardise each feature column with scikit-learn's StandardScaler
# (imported as `ss`): fit on the full feature matrix, then transform it.
features = ss().fit(features).transform(features)

# Last expression of the cell: displays the first standardised row.
features[0]

array([-0.05945106,  1.98907259,  0.09058795,  2.20689746,  0.96076892,
       -0.96269532,  0.96269532, -0.97043679,  0.97043679, -0.33333333,
        1.83488752, -0.30674404, -0.59428947, -0.69230769,  1.08129432,
       -1.08129432])

* converting the arrays into tensors so we can perform ml operations on them<br>
* finally we are printing the shapes of features and labels

In [46]:
# Convert both arrays to float32 tensors in one pass, then report their
# shapes as a quick sanity check.
features, labels = (
    torch.tensor(array, dtype=torch.float32) for array in (features, labels)
)
print(features.shape, labels.shape)

torch.Size([1000, 16]) torch.Size([1000, 1])


#### -creating a reasonably layered model with a first input size of 16, ReLU activations between the hidden layers, and a final sigmoid function 
* the sigmoid function's output is always between 0 and 1

In [47]:
# Fully connected binary classifier: 16 -> 10 -> 7 -> 3 -> 1.
# Hidden layers use ReLU; the final Sigmoid squashes the single output
# into the (0, 1) range so it can be read as a probability.
layers = [
    torch.nn.Linear(16, 10),
    torch.nn.ReLU(),
    torch.nn.Linear(10, 7),
    torch.nn.ReLU(),
    torch.nn.Linear(7, 3),
    torch.nn.ReLU(),
    torch.nn.Linear(3, 1),
    torch.nn.Sigmoid(),
]
model = torch.nn.Sequential(*layers)

#####  the loss we will use is Binary cross entropy
##### the optimizer we will use is SGD-(stochastic gradient descent) to update network weights during training

In [48]:
# Binary cross-entropy; it expects probabilities, which the model's final
# Sigmoid provides. "mean" is the default reduction, spelled out here.
loss_fn = torch.nn.BCELoss(reduction="mean")

# Plain SGD over all model parameters with a learning rate of 0.01.
opt = torch.optim.SGD(params=model.parameters(), lr=0.01)

##### here is the training loop for 10000 epochs
##### we simply make predictions and calculate the loss
##### in the next step we find the gradients and optimize the model to reduce the loss
### For finding the Accuracy 
##### if a prediction is greater than or equal to 0.5 then we set it to 1
##### if it is less than 0.5 then we set it to 0
##### now, if the prediction and the label are equal then we increment the count 
##### now, accuracy is equal to the total count divided by the number of predictions (measured on the training data itself)

In [49]:
epochs = 10000

# Report metrics ten times over the run. The max() guard keeps the modulo
# below from dividing by zero when epochs < 10.
report_every = max(1, epochs // 10)

for epoch in range(epochs):
    # Clear gradients first so nothing left over from an earlier
    # (possibly interrupted) run leaks into this step.
    opt.zero_grad()

    # Forward pass over the whole data set, then one optimizer step.
    preds = model(features)
    loss = loss_fn(preds, labels)
    loss.backward()
    opt.step()

    if (epoch + 1) % report_every == 0:
        with torch.no_grad():
            # Threshold the probabilities at 0.5 and count matches with the
            # labels in one vectorised comparison; `preds` is left untouched.
            predicted = (preds >= 0.5).to(labels.dtype)
            right = int((predicted == labels).sum())

        # NOTE: accuracy is measured on the same data the model is trained
        # on, using the predictions made before this epoch's weight update.
        print(f"Loss : {loss.item()}")
        print(f"Accuracy : {round(right * 100 / len(preds), 2)}%")


Loss : 0.573555052280426
Accuracy : 82.1%
Loss : 0.2883685231208801
Accuracy : 87.2%
Loss : 0.25057247281074524
Accuracy : 89.2%
Loss : 0.22803829610347748
Accuracy : 90.5%
Loss : 0.20742779970169067
Accuracy : 91.2%
Loss : 0.19018058478832245
Accuracy : 91.7%
Loss : 0.17769964039325714
Accuracy : 92.1%
Loss : 0.16729314625263214
Accuracy : 92.4%
Loss : 0.15853933990001678
Accuracy : 93.3%
Loss : 0.15072882175445557
Accuracy : 94.0%
