In [1]:
import torch
import pandas as pd
import numpy as np
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.utils.data import TensorDataset,DataLoader
import torchmetrics
import torch.optim as optim
import math
from sklearn.metrics import mean_squared_error,accuracy_score,f1_score

In [2]:
train=pd.read_csv('processed_train.csv')

In [39]:
test=pd.read_csv('processed_test.csv')

In [3]:
train.dtypes

fit                       object
user_id                    int64
bust size                 object
item_id                    int64
weight                    object
rating                     int64
rented for                object
body type                 object
category                  object
height                    object
size                       int64
age                        int64
review_polarity_score    float64
BMI                      float64
year                       int64
month                      int64
dtype: object

In [4]:
train['user_id']=train['user_id'].astype('object')
train['item_id']=train['item_id'].astype('object')

X_train,X_valid,y_train,y_valid=train_test_split(train.drop('rating',axis=1),train['rating'],test_size=0.12,random_state=68)

In [5]:
def prepare_inputs_for_nn(df):
    tar=df['rating']
    df.drop(['weight','height','body type','age','rented for','rating'],inplace=True,axis=1)
    df=pd.get_dummies(df,columns=['fit'],drop_first=True)
    le=LabelEncoder()
    df[['user_id','item_id','bust size','category']]=df[['user_id','item_id','bust size','category']].apply(le.fit_transform)
    return (df,tar)

In [6]:
train,target=prepare_inputs_for_nn(train)

embedding_user=nn.Embedding(train['user_id'].nunique(),10)
embedding_item=nn.Embedding(train['item_id'].nunique(),10)
embedding_bust_size=nn.Embedding(train['bust size'].nunique(),10)
embedding_category=nn.Embedding(train['category'].nunique(),10)


In [8]:
train

Unnamed: 0,user_id,bust size,item_id,category,size,review_polarity_score,BMI,year,month,fit_large,fit_small
0,24279,39,1633,15,8,0.0000,21.031478,2017,9,False,False
1,17998,54,5148,22,12,0.8074,22.148708,2017,1,False,False
2,22209,37,2835,15,8,0.9619,21.454890,2017,3,False,False
3,16087,29,4692,43,14,0.7793,23.010315,2017,11,False,False
4,86936,66,6,19,39,0.3400,28.886246,2015,1,False,False
...,...,...,...,...,...,...,...,...,...,...,...
153956,70290,37,515,15,4,0.9595,19.966864,2017,3,False,False
153957,11736,38,408,15,16,0.5267,22.148708,2017,12,False,True
153958,13107,37,64,15,4,0.5023,19.043618,2015,7,False,False
153959,48837,25,37,19,12,0.9263,22.140381,2014,5,False,False


embed_user=embedding_user(torch.from_numpy(train['user_id'].values))
embed_item=embedding_user(torch.from_numpy(train['item_id'].values))
embed_bust_size=embedding_user(torch.from_numpy(train['bust size'].values))
embed_category=embedding_user(torch.from_numpy(train['category'].values))

In [7]:
cat_cols=('bust size','item_id','user_id','category')
tensor=[]
for i in train.columns:
    if( i in cat_cols):
        embedding=nn.Embedding(train[i].nunique(),5)
        embed_values=embedding(torch.from_numpy(train[i].values))
        
        tensor.append(embed_values)
    else:
        tensor.append(torch.from_numpy(train[i].values).view(-1,1))
                            

In [8]:
features=torch.cat(tensor,dim=1)

In [9]:
target=torch.tensor(target).reshape(-1,1)

# Nueral Network

In [10]:
dataset=TensorDataset(features.float(),target.float())

train_dataset,valid_dataset=torch.utils.data.random_split(dataset, [math.ceil(features.shape[0]*0.88),int(features.shape[0]*0.12)])
# Let us have 100 batches

train_loader=DataLoader(train_dataset,batch_size=13548,shuffle=True)
valid_loader=DataLoader(valid_dataset,batch_size=1847,shuffle=True)

In [13]:
model=nn.Sequential(
    nn.Linear(features.shape[1],10),
    nn.ReLU(),
    nn.Linear(10,5),
    nn.ReLU(),
    nn.Linear(5,5),
    nn.ReLU(),
    nn.Linear(5,1)
)

In [14]:
criterion=nn.MSELoss()
optimizer=optim.Adam(model.parameters(),lr=0.001)

In [15]:
for epoch in range(10):
    train_loss=0.0
    
    for data in train_loader:
        optimizer.zero_grad()
        
        feat,tar=data
        #print(feat.shape)
        #print(tar.shape)
        predictions=model(feat)
        #print(predictions.shape)
        loss=criterion(predictions,tar)
        loss.backward(retain_graph=True)
        optimizer.step()
        train_loss+=loss.item()
    
    epoch_train_loss=train_loss/len(train_loader)
    
    validation_loss=0.0
    model.eval()
    with torch.no_grad():
        for val_data in valid_loader:
            feat_val,tar_val=val_data
            predictions_val=model(feat_val)
            loss_val=criterion(predictions_val,tar_val)
            validation_loss+=loss_val.item()
            
        epoch_validation_loss=validation_loss/len(valid_loader)
    model.train()
    print(f'Training loss for epoch {epoch} is {epoch_train_loss}',end=' ')
    print(f'Validation loss for epoch {epoch} is {epoch_validation_loss}\n')

Training loss for epoch 0 is 29.919632998379793 Validation loss for epoch 0 is 14.873541138388894

Training loss for epoch 1 is 6.796318065036427 Validation loss for epoch 1 is 1.948041395707564

Training loss for epoch 2 is 3.2204388054934414 Validation loss for epoch 2 is 3.020346566018733

Training loss for epoch 3 is 2.368102631785653 Validation loss for epoch 3 is 1.9677513187581843

Training loss for epoch 4 is 2.1869214448061856 Validation loss for epoch 4 is 2.17592867937955

Training loss for epoch 5 is 2.5071720860221167 Validation loss for epoch 5 is 1.9349805387583645

Training loss for epoch 6 is 1.9434295946901494 Validation loss for epoch 6 is 1.9414896368980408

Training loss for epoch 7 is 1.9356193379922346 Validation loss for epoch 7 is 1.94989383762533

Training loss for epoch 8 is 1.9484605355696245 Validation loss for epoch 8 is 1.94632089138031

Training loss for epoch 9 is 2.067740418694236 Validation loss for epoch 9 is 1.9378469803116538



In [40]:
test,tar_test=prepare_inputs_for_nn(test)
cat_cols=('bust size','item_id','user_id','category')
tensor_test=[]
for i in test.columns:
    if( i in cat_cols):
        embedding=nn.Embedding(test[i].nunique(),5)
        embed_values=embedding(torch.from_numpy(test[i].values))
        
        tensor_test.append(embed_values)
    else:
        tensor_test.append(torch.from_numpy(test[i].values).view(-1,1))
                            

features_test=torch.cat(tensor_test,dim=1)

In [40]:
pred_test=model(features_test.float())
RMSE=mean_squared_error(tar_test.values,pred_test.detach().numpy().flatten(),squared=False)
RMSE
#pred_test

1.4327558738064368

In [41]:
print(f'RMSE of the validation set is 1.392 and RMSE of the test set is {RMSE}')

RMSE of the validation set is 1.392 and RMSE of the test set is 1.4327558738064368


# Nueral network for Classification

In [18]:
model_classification=nn.Sequential(
    nn.Linear(features.shape[1],10),
    nn.ReLU(),
    nn.Linear(10,5),
    nn.ReLU(),
    nn.Linear(5,5),
    nn.ReLU(),
    nn.Linear(5,target.unique().shape[0])
)

In [19]:
criterion_classification=nn.CrossEntropyLoss()
optimizer_classification=optim.Adam(model_classification.parameters(),lr=0.001)

In [20]:
def get_accuracy(predictions, labels):
    classes = torch.argmax(predictions, dim=1)
    return torch.mean((classes == labels).float())

In [24]:
for epoch in range(10):
    train_loss=0.0
    running_accuracy = 0.0
    
    for data in train_loader:
        optimizer_classification.zero_grad()
        
        feat,tar=data
        tar=((tar/2)-torch.tensor(1)).flatten().long()
        #print(feat.shape)
        #print(tar.shape)
        outputs=model_classification(feat)
        #print(outputs)
        #print(torch.flatten(tar))
        loss=criterion_classification(outputs,tar)
        loss.backward(retain_graph=True)
        optimizer_classification.step()
        train_loss+=loss.item()
        running_accuracy += get_accuracy(outputs, tar)
        #print(outputs)
        #print(f'Actual labels are {tar}')
    
    epoch_train_loss=train_loss/len(train_loader)
    accuracy_train =running_accuracy/ len(train_loader)
    
    validation_loss=0.0
    running_accuracy_validation= 0
    model_classification.eval()
    with torch.no_grad():
        for val_data in valid_loader:
            feat_val,tar_val=val_data
            tar_val=((tar_val/2)-torch.tensor(1)).flatten().long()
            output_val=model_classification(feat_val)
            loss_val=criterion_classification(output_val,tar_val)
            validation_loss+=loss_val.item()
            running_accuracy_validation += get_accuracy(output_val, tar_val)
            #print('validation')
            #print(output_val)
            #print(f'Actual labels are {tar_val}')
            
        epoch_validation_loss=validation_loss/len(valid_loader)
        accuracy_validation = running_accuracy_validation / len(valid_loader)
    model_classification.train()
    print(f'Training loss for epoch {epoch} is {epoch_train_loss}  and accuracy is {accuracy_train}',end=' ')
    print(f'Validation loss for epoch {epoch} is {epoch_validation_loss} and accuracy is {accuracy_validation}\n')

Training loss for epoch 0 is 1.341190522367304  and accuracy is 0.6629224419593811 Validation loss for epoch 0 is 1.128218650817871 and accuracy is 0.6661220192909241

Training loss for epoch 1 is 1.0683216398412532  and accuracy is 0.6629223823547363 Validation loss for epoch 1 is 1.019225532358343 and accuracy is 0.6661219596862793

Training loss for epoch 2 is 0.9758476425300945  and accuracy is 0.6780672073364258 Validation loss for epoch 2 is 1.0030598694627935 and accuracy is 0.6479893326759338

Training loss for epoch 3 is 0.9992300922220404  and accuracy is 0.6477775573730469 Validation loss for epoch 3 is 0.9431242780251936 and accuracy is 0.6661219596862793

Training loss for epoch 4 is 0.9622874097390608  and accuracy is 0.6477775573730469 Validation loss for epoch 4 is 0.9011428925124082 and accuracy is 0.6842545866966248

Training loss for epoch 5 is 0.9618658206679604  and accuracy is 0.6326327919960022 Validation loss for epoch 5 is 0.9174319830807772 and accuracy is 0.6

In [42]:
pred_test_classification=model_classification(features_test.float())
tar_test=((torch.tensor(tar_test)/2)-torch.tensor(1)).flatten().long()
accuracy_test=get_accuracy(pred_test_classification,tar_test)
accuracy_test

tensor(0.6485)

In [46]:
print(f'Accuracy of the validation set is 0.6842 and Accuracy of the test set is {accuracy_test}')

Accuracy of the validation set is 0.6842 and Accuracy of the test set is 0.6485152244567871
