In [3]:
!pip install -r FedStar/requirements.txt

[0m

In [1]:
#generic
from pathlib import Path
import os, sys
import argparse
import random
import copy
from random import choices
import pickle

#torch
import torch
import torch.nn.functional as F
from torch_geometric.nn import GCNConv, GINConv, global_add_pool, SAGEConv
from torch_geometric.transforms import OneHotDegree
from torch_geometric.utils import to_networkx, degree, to_dense_adj, to_scipy_sparse_matrix
from sklearn.model_selection import train_test_split
from scipy import sparse as sp
import torch_geometric
from torch_geometric.data import Data, Dataset, Batch
from torch_geometric.utils import to_networkx, subgraph
import torch_geometric.utils as utils
from torch.nn.functional import one_hot


#utility
import networkx as nx
from dtaidistance import dtw
from tensorboardX import SummaryWriter
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import pymetis
from ogb.nodeproppred import PygNodePropPredDataset

num_clients = 3
device = "cuda" if torch.cuda.is_available() else "cpu"
alg = 'fedstar'
num_rounds = 20
local_epoch = 10
lr = 0.01
weight_decay = 5e-4
nlayer = 3 # number of GINConv layers
hidden = 64
dropout = 0.5
batch_size = 128  # not used
seed = 69
datapath = '.Data'
outbase = 'outputs'
data_group = 'arxiv'
n_rw = 16
n_dg = 16
n_ones = 16
type_init = 'rw_dg' #options are rw, dg and rw_dg
print(device)
seed_dataSplit = 123
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed(seed)

cuda


In [2]:
def get_numGraphLabels(g):
    s = set(g.y.flatten().tolist())
    return len(s)

def init_structure_encoding(  g, type_init = 'rw_dg'):

    if type_init == 'rw':
        # Geometric diffusion features with Random Walk
        A = to_scipy_sparse_matrix(g.edge_index, num_nodes=g.num_nodes)
        D = (degree(g.edge_index[0], num_nodes=g.num_nodes) ** -1.0).numpy()

        Dinv=sp.diags(D)
        RW=A*Dinv
        M=RW

        SE_rw=[torch.from_numpy(M.diagonal()).float()]
        M_power=M
        for _ in range(n_rw-1):
            M_power=M_power*M
            SE_rw.append(torch.from_numpy(M_power.diagonal()).float())
        SE_rw=torch.stack(SE_rw,dim=-1)

        g['stc_enc'] = SE_rw

    elif type_init == 'dg':
        # PE_degree
        g_dg = (degree(g.edge_index[0], num_nodes=g.num_nodes)).numpy().clip(1, n_dg)
        SE_dg = torch.zeros([g.num_QCnodes, n_dg])
        for i in range(len(g_dg)):
            SE_dg[i,int(g_dg[i]-1)] = 1

        g['stc_enc'] = SE_dg

    elif type_init == 'rw_dg':
        # SE_rw
        A = to_scipy_sparse_matrix(g.edge_index, num_nodes=g.num_nodes)
        D = (degree(g.edge_index[0], num_nodes=g.num_nodes) ** -1.0).numpy()

        Dinv=sp.diags(D)
        RW=A*Dinv
        M=RW

        SE=[torch.from_numpy(M.diagonal()).float()]
        M_power=M
        for _ in range(n_rw-1):
            M_power=M_power*M
            SE.append(torch.from_numpy(M_power.diagonal()).float())
        SE_rw=torch.stack(SE,dim=-1)

        # PE_degree
        g_dg = (degree(g.edge_index[0], num_nodes=g.num_nodes)).numpy().clip(1, n_dg)
        SE_dg = torch.zeros([g.num_nodes, n_dg])
        for i in range(len(g_dg)):
            SE_dg[i,int(g_dg[i]-1)] = 1

        g['stc_enc'] = torch.cat([SE_rw, SE_dg], dim=1)

    return g

def get_stats(df, ds, graph_train, graph_val=None, graph_test=None):
    from collections import Counter
    labels_train = graph_train.y.flatten().tolist()
    df.loc[ds, '#Nodes_train'] = graph_train.num_nodes
    df.loc[ds, '#Edges_train'] = graph_train.num_edges
    df.loc[ds, 'Avg_degree_train'] = graph_train.num_edges/graph_train.num_nodes
    df.loc[ds, '#Labels_train'] = len(set(labels_train))
    df.loc[ds, 'Class_dist_train'] = str(dict(Counter(labels_train)))
    
    if graph_test:
        labels_test = graph_test.y.flatten().tolist()
        df.loc[ds, '#Nodes_test'] = graph_test.num_nodes
        df.loc[ds, '#Edges_test'] = graph_test.num_edges
        df.loc[ds, 'Avg_degree_test'] = graph_test.num_edges/graph_test.num_nodes
        df.loc[ds, '#Labels_test'] = len(set(labels_test))
        df.loc[ds, 'Class_dist_test'] = str(dict(Counter(labels_test)))
        
    if graph_val:
        labels_val = graph_val.y.flatten().tolist()
        df.loc[ds, '#Nodes_val'] = graph_val.num_nodes
        df.loc[ds, '#Edges_val'] = graph_val.num_edges
        df.loc[ds, 'Avg_degree_val'] = graph_val.num_edges/graph_val.num_nodes
        df.loc[ds, '#Labels_val'] = len(set(labels_val))
        df.loc[ds, 'Class_dist_val'] = str(dict(Counter(labels_val)))
        
    return df

In [3]:
file = open('graph_struc_pickle', 'rb')
graph = pickle.load(file)
file.close()
graph

Data(num_nodes=169343, edge_index=[2, 1166243], x=[169343, 128], node_year=[169343, 1], y=[169343, 1], stc_enc=[169343, 32])

In [4]:
class GIN_dc(torch.nn.Module):
    def __init__(self, nfeat, n_se, nhid, nclass, nlayer, dropout):
        super(GIN_dc, self).__init__()
        self.num_layers = nlayer
        self.dropout = dropout

        self.pre = torch.nn.Sequential(torch.nn.Linear(nfeat, nhid))

        self.embedding_s = torch.nn.Linear(n_se, nhid)

        self.graph_convs = torch.nn.ModuleList()
        self.nn1 = torch.nn.Sequential(torch.nn.Linear(nhid + nhid, nhid), torch.nn.ReLU(), torch.nn.Linear(nhid, nhid))
        self.graph_convs.append(GINConv(self.nn1))
        self.graph_convs_s_gcn = torch.nn.ModuleList()
        self.graph_convs_s_gcn.append(GCNConv(nhid, nhid))

        for l in range(nlayer - 1):
            self.nnk = torch.nn.Sequential(torch.nn.Linear(nhid + nhid, nhid), torch.nn.ReLU(), torch.nn.Linear(nhid, nhid))
            self.graph_convs.append(GINConv(self.nnk))
            self.graph_convs_s_gcn.append(GCNConv(nhid, nhid))

        self.Whp = torch.nn.Linear(nhid + nhid, nhid)
        self.post = torch.nn.Sequential(torch.nn.Linear(nhid, nhid), torch.nn.ReLU())
        self.readout = torch.nn.Sequential(torch.nn.Linear(nhid, nclass))

    def forward(self, data):
        x, edge_index, s = data.x, data.edge_index, data.stc_enc
        x = self.pre(x)
        s = self.embedding_s(s)
        for i in range(len(self.graph_convs)):
            x = torch.cat((x, s), -1)
            x = self.graph_convs[i](x, edge_index)
            x = F.relu(x)
            x = F.dropout(x, self.dropout, training=self.training)
            s = self.graph_convs_s_gcn[i](s, edge_index)
            s = torch.tanh(s)
        x = self.Whp(torch.cat((x, s), -1))
        x = self.post(x)
        x = F.dropout(x, self.dropout, training=self.training)
        x = self.readout(x)
        # print(x)
        x = F.log_softmax(x, dim=1)
        # print(x)
        return x.float()
    def loss(self, pred, label):
        # print(pred, label)
        return F.cross_entropy(pred, label)
        # return F.nll_loss(pred, label)
data = copy.deepcopy(graph)
num_classes = get_numGraphLabels(data)
n_se = n_rw+n_dg
data.y = one_hot(data.y).squeeze(dim=1).float()
model = GIN_dc(nfeat=data.num_node_features, n_se=n_se, nhid=64, nclass=num_classes, nlayer=3, dropout=0.5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)


In [18]:
per = torch.randperm(data.num_nodes)

In [37]:
# optimizer = torch.optim.SGD(model.parameters(), lr=0.1, momentum=0.9)
model = GIN_dc(nfeat=data.num_node_features, n_se=n_se, nhid=64, nclass=num_classes, nlayer=3, dropout=0.5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
batch_size = 1024
data.to(device)
start = 0
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=10)
for epoch in range(20):
    acc_sum = 0
    total_loss = 0
    all = torch.randperm(data.num_nodes)
    for batch_num in range(data.num_nodes//batch_size):
    # batch = random.choices(all, k=len(all)//20)
        model.train()
        model.to(data.x.device)
        data.to(device)
        # print(f'epoch={epoch}')
        loss = torch.tensor([0.0], device='cuda:0')
    # loss.to(device)
    # print(loss.device)
        optimizer.zero_grad()
        out = model(data)
    # break
#     print(out.shape, data.y.shape)
#     loss = model.loss(out, data.y)
        out.to(device)
    # for i in range(out.shape[0]): 
        batch = per[batch_size*batch_num:batch_size*(batch_num+1)]
        for i in batch :
        # i_tensor = torch.tensor([i], dtype=torch.long, device=out.device)
            loss += model.loss(out[i], data.y[i])
            total_loss += loss.item()
        # loss += model.loss(out[i], data.y[i])
# Convert one-hot encoded labels to class indices
# Calculate number of correct predictions
        acc_sum = out.max(dim=1)[1].eq(data.y.max(dim=1)[1]).sum().item()
        # acc_sum += acc
        # if start == 0 : print(f'epoch={epoch}', total_loss, acc_sum, acc_sum*100/data.num_nodes)
        # start = 1
        # print(loss.item(), acc_sum, acc_sum*100/data.num_nodes)
    # break
        loss.backward()
        scheduler.step(loss)
        
    print(f'epoch={epoch}', total_loss, acc_sum, acc_sum*100/data.num_nodes)
    # optimizer.step()

epoch=0 326294302.67658997 4273 2.5232811512728603
epoch=1 326372319.4695916 4350 2.5687509964982316
epoch=2 326097088.06268835 4324 2.553397542266288
epoch=3 326572505.798996 4365 2.577608758555122
epoch=4 326675604.1150408 4244 2.506156144629539
epoch=5 326140412.3133693 4409 2.603591527255334
epoch=6 326823382.5893841 4263 2.5173759765682666
epoch=7 326278767.6821778 4327 2.555169094677666
epoch=8 326818438.9265218 4290 2.5333199482706696
epoch=9 326441913.9854653 4252 2.5108802843932136
epoch=10 326505077.26677394 4279 2.5268242560956167
epoch=11 326703572.72160435 4368 2.5793803109665
epoch=12 326655642.3296597 4354 2.571113066380069
epoch=13 326309872.6503229 4226 2.4955268301612703
epoch=14 326368298.9786763 4272 2.522690633802401
epoch=15 326572511.3947983 4354 2.571113066380069
epoch=16 326328794.6280587 4295 2.5362725356229663
epoch=17 326508996.8394308 4361 2.5752466886732845
epoch=18 326868802.4372189 4307 2.543358745268479
epoch=19 326404487.56867194 4289 2.532729430800210

In [42]:
model = GIN_dc(nfeat=data.num_node_features, n_se=n_se, nhid=64, nclass=num_classes, nlayer=3, dropout=0.5).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.1)

for epoch in range(20):
    model.train()
    loss = torch.tensor([0.0], device='cuda:0')
    optimizer.zero_grad()
    out = model(data)

    for i in range(out.shape[0]) : loss += model.loss(out[i], data.y[i])
    acc_sum = out.max(dim=1)[1].eq(data.y.max(dim=1)[1]).sum().item()
    print(f'epoch-{epoch}',loss.item(), acc_sum, acc_sum*100/data.num_nodes)    
    loss.backward()
    optimizer.step()
    # scheduler.step(loss)


epoch-0 641413.125 1972 1.1645004517458648
epoch-1 20512636928.0 19722 11.646185552399567
epoch-2 3391872256.0 27211 16.06857088866974
epoch-3 87146088.0 11449 6.760834519289253
epoch-4 18804420.0 16257 9.600042517257872
epoch-5 8231242.5 15337 9.056766444435258
epoch-6 23866962.0 22666 13.384668985431935
epoch-7 11963941.0 18058 10.663564481555186
epoch-8 2806580.0 6827 4.031462770826075
epoch-9 12034076.0 12444 7.3483994023963195
epoch-10 7597797.5 11175 6.599032732383388
epoch-11 5239005.5 15524 9.16719321141116
epoch-12 2112908.25 14393 8.49931795232162
epoch-13 2157015.0 7246 4.278889590948548
epoch-14 1955356.625 7441 4.394040497688124
epoch-15 701278.5 7146 4.219837843902612
epoch-16 660327.625 7869 4.646781975044732
epoch-17 655664.6875 7869 4.646781975044732
epoch-18 653521.3125 7869 4.646781975044732
epoch-19 650977.3125 7869 4.646781975044732


In [44]:
model = GIN_dc(nfeat=data.num_node_features, n_se=n_se, nhid=64, nclass=num_classes, nlayer=2, dropout=0.2).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(20):
    model.train()
    loss = torch.tensor([0.0], device='cuda:0')
    optimizer.zero_grad()
    out = model(data)

    for i in range(out.shape[0]) : loss += model.loss(out[i], data.y[i])
    acc_sum = out.max(dim=1)[1].eq(data.y.max(dim=1)[1]).sum().item()
    print(f'epoch-{epoch}',loss.item(), acc_sum, acc_sum*100/data.num_nodes)    
    loss.backward()
    optimizer.step()
    # scheduler.step(loss)


epoch-0 630005.375 5898 3.482872040769326
epoch-1 634949.9375 18939 11.183810373029885
epoch-2 583449.6875 12938 7.640115032803245
epoch-3 541708.5625 20046 11.837513212828402
epoch-4 531122.25 24439 14.431656460556386
epoch-5 512873.0 27899 16.474846908345782
epoch-6 511581.90625 29366 17.34113603750967
epoch-7 507860.375 28442 16.79549789480522
epoch-8 500878.90625 27748 16.38567877030642
epoch-9 498798.375 29346 17.329325688100482
epoch-10 493691.71875 31882 18.826877993185427
epoch-11 492156.75 31336 18.504455454314616
epoch-12 490911.5625 30795 18.1849855027961
epoch-13 489868.4375 31107 18.36922695357942
epoch-14 487555.0625 31981 18.885339222760905
epoch-15 486268.84375 33047 19.514830846270588
epoch-16 485384.6875 33480 19.770524910979493
epoch-17 484773.125 32817 19.379011828064932
epoch-18 483932.1875 32917 19.43806357511087
epoch-19 483129.1875 33163 19.583330872843874


In [None]:
# import torch
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# print('Using device:', device)
# print()

# #Additional Info when using cuda
# if device.type == 'cuda':
#     print(torch.cuda.get_device_name(0))
#     print('Memory Usage:')
#     print('Allocated:', round(torch.cuda.memory_allocated(0)/1024**3,1), 'GB')
#     print('Cached:   ', round(torch.cuda.memory_reserved(0)/1024**3,1), 'GB')

In [None]:
# import random
# from random import choices
# import numpy as np
# import pandas as pd

# import torch
# from torch_geometric.datasets import TUDataset
# from torch_geometric.loader import DataLoader
# from torch_geometric.transforms import OneHotDegree

In [None]:
# !pip install tensorboardX
# !pip install networkx
# !pip install tensorflow
# !pip install dgl

In [None]:
# !pip install torch 
# !pip install torch_geometric 
# !pip install torch_scatter
# !pip install pymetis 
# !pip install ogb

In [None]:
# num_clients = 4
# dataset = PygNodePropPredDataset(name='ogbn-arxiv')
# graph = dataset[0]
# num_nodes = graph.num_nodes
# num_edges = graph.num_edges
# num_edges, num_nodes

In [None]:
# nx_graph = utils.to_networkx(graph)
# partitions = pymetis.part_graph(num_clients, adjacency=nx.to_dict_of_lists(nx_graph))

In [None]:
# partitions_np = np.array(partitions[1])
# partition_tensor = torch.from_numpy(partitions_np)
# subgraphs = []
# for i in range(num_clients):
#     nodes = (partition_tensor == i).nonzero(as_tuple=True)[0]
#     subgraph = graph.subgraph(nodes)
#     subgraphs.append(subgraph)
#     print(f'Number of nodes = {subgraph.num_nodes} and Number of edges = {subgraph.num_edges}')

In [3]:
# !wget https://s3-ap-southeast-1.amazonaws.com/he-public-data/datasetb2d9982.zip

--2023-04-22 13:37:48--  https://s3-ap-southeast-1.amazonaws.com/he-public-data/datasetb2d9982.zip
Resolving s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)... 52.219.124.178
Connecting to s3-ap-southeast-1.amazonaws.com (s3-ap-southeast-1.amazonaws.com)|52.219.124.178|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 895569552 (854M) [binary/octet-stream]
Saving to: ‘datasetb2d9982.zip’


2023-04-22 13:40:10 (6.29 MB/s) - ‘datasetb2d9982.zip’ saved [895569552/895569552]



In [4]:
# from zipfile import ZipFile
  
# # loading the temp.zip and creating a zip object
# with ZipFile('datasetb2d9982.zip', 'r') as zObject:
#     zObject.extractall()

In [4]:
# import pandas as pd
# import numpy as np

In [5]:
# train = pd.read_csv('dataset/train.csv', index_col='PRODUCT_ID')
# train.head()

Unnamed: 0_level_0,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH
PRODUCT_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,,1650,2125.98
2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",,2755,393.7
2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537,748.031495
1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996,787.401574
283658,The United Empire Loyalists: A Chronicle of th...,,,6112,598.424


In [6]:

# train['DESCRIPTION'].fillna(train['BULLET_POINTS'], inplace=True)

# # Fill description with title
# train['DESCRIPTION'].fillna(train['TITLE'], inplace=True)

# # Fill title with description
# train['TITLE'].fillna(train['DESCRIPTION'], inplace=True)

# # Replace remaining NaN values with "Undefined"
# train.fillna('Undefined', inplace=True)

In [7]:
# from scipy.stats import entropy
# import pandas as pd

# train['product_type_entropy'] = train.groupby('PRODUCT_TYPE_ID')['PRODUCT_LENGTH'].apply(lambda x: entropy(x.value_counts(normalize=True)))

# # Fill NaN values in product_type_entropy column with a default value (e.g., 0)
# train['product_type_entropy'].fillna(0, inplace=True)

# # Rank the unique values of PRODUCT_TYPE_ID based on entropy
# ranked_product_types = train.groupby('PRODUCT_TYPE_ID')['product_type_entropy'].rank(ascending=False, method='dense')

# # Create a DataFrame with one-hot encoding
# one_hot_encoding = pd.pivot_table(train, index='PRODUCT_TYPE_ID', columns=ranked_product_types.astype(int), values='product_type_entropy', aggfunc=lambda x: 1).fillna(0).astype(int)

# # # Create a dictionary to map PRODUCT_TYPE_ID to one-hot vector encoding
# # product_type_id_to_one_hot = dict(zip(one_hot_encoding.index, one_hot_encoding.values.tolist()))

# # # Apply the one-hot encoding to the PRODUCT_TYPE_ID column in the DataFrame
# # train_temp = pd.concat([train.drop('product_type_entropy', axis=1), one_hot_encoding], axis=1)


In [8]:
# one_hot_encoding

product_type_entropy,1,2,3,4,5,6,7,8,9,10,...,274,275,276,277,278,279,280,281,282,283
PRODUCT_TYPE_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1,1,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13416,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13417,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13418,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
13419,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# # Create a dictionary to map PRODUCT_TYPE_ID to one-hot vector encoding
# product_type_id_to_one_hot = dict(zip(one_hot_encoding.index, one_hot_encoding.values.tolist()))

# # Apply the one-hot encoding to the PRODUCT_TYPE_ID column in the DataFrame
# def map_product_type_id_to_one_hot(product_type_id):
#     return product_type_id_to_one_hot.get(product_type_id, [0, 0, 0])

# # Apply the function to the PRODUCT_TYPE_ID column to get the one-hot vector
# train['one_hot_encoding'] = train['PRODUCT_TYPE_ID'].apply(map_product_type_id_to_one_hot)

# # Split the one-hot encoding into separate columns for each element of the vector
# train_temp = pd.concat([train.drop('one_hot_encoding', axis=1), train['one_hot_encoding'].apply(pd.Series)], axis=1)


In [None]:
# train_temp.head()

In [7]:
# train_temp.head()

Unnamed: 0,TITLE,BULLET_POINTS,DESCRIPTION,PRODUCT_TYPE_ID,PRODUCT_LENGTH,1,2,3,4,5,...,274,275,276,277,278,279,280,281,282,283
1925202,ArtzFolio Tulip Flowers Blackout Curtain for D...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,[LUXURIOUS & APPEALING: Beautiful custom-made ...,1650.0,2125.98,,,,,,...,,,,,,,,,,
2673191,Marks & Spencer Girls' Pyjama Sets T86_2561C_N...,"[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...","[Harry Potter Hedwig Pyjamas (6-16 Yrs),100% c...",2755.0,393.7,,,,,,...,,,,,,,,,,
2765088,PRIKNIK Horn Red Electric Air Horn Compressor ...,"[Loud Dual Tone Trumpet Horn, Compatible With ...","Specifications: Color: Red, Material: Aluminiu...",7537.0,748.031495,,,,,,...,,,,,,,,,,
1594019,ALISHAH Women's Cotton Ankle Length Leggings C...,[Made By 95%cotton and 5% Lycra which gives yo...,AISHAH Women's Lycra Cotton Ankel Leggings. Br...,2996.0,787.401574,,,,,,...,,,,,,,,,,
283658,The United Empire Loyalists: A Chronicle of th...,Undefined,The United Empire Loyalists: A Chronicle of th...,6112.0,598.424,,,,,,...,,,,,,,,,,


In [None]:
# from scipy.stats import entropy


# # Calculate entropy for each unique value in the PRODUCT_TYPE_ID column
# product_type_entropy = train.groupby('PRODUCT_TYPE_ID').apply(lambda x: entropy(x['PRODUCT_LENGTH'].value_counts(normalize=True)))

# # Sort the unique values of PRODUCT_TYPE_ID based on entropy in descending order
# sorted_product_type_ids = product_type_entropy.sort_values(ascending=False).index

# # Assign one-hot vector encoding based on the sorted PRODUCT_TYPE_ID values
# one_hot_encoding = np.eye(len(sorted_product_type_ids), dtype=int)

# # Create a dictionary to map PRODUCT_TYPE_ID to one-hot vector encoding
# product_type_id_to_one_hot = dict(zip(sorted_product_type_ids, one_hot_encoding))

# # Apply the one-hot encoding to the PRODUCT_TYPE_ID column in the DataFrame
# train['one_hot_encoding'] = train['PRODUCT_TYPE_ID'].map(product_type_id_to_one_hot)

# # Split the one-hot encoding into separate columns for each element of the vector
# train_temp = pd.concat([train.drop('one_hot_encoding', axis=1), train['one_hot_encoding'].apply(pd.Series)], axis=1)