In [90]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [91]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [92]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [93]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        webs_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            print(i)
            print(i)
            webs_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    print(webs_to_remove)
   
    for i in reversed(webs_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [94]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

3
3
3 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/wisconsin/http:^^www.cs.wisc.edu^ not found
5
5
5 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/wisconsin/http:^^www.cs.wisc.edu^condor^next.html not found
[]


In [95]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [96]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [97]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [98]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    print(pages_to_remove)
   
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [99]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

3 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/wisconsin/http:^^www.cs.wisc.edu^ not found
5 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/wisconsin/http:^^www.cs.wisc.edu^condor^next.html not found
[]


In [100]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [101]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [102]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [103]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            # print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    print(pages_to_remove)
   
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [104]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

[]


In [105]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [106]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [107]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [108]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            # print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
   
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [109]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

5
Data(x=[264, 1703], edge_index=[2, 936], y=[264], num_nodes=264, train_id=[158], val_id=[52], test_id=[52], train_mask=[264], val_mask=[264], test_mask=[264])
3
Data(x=[263, 1703], edge_index=[2, 692], y=[263], num_nodes=263, train_id=[157], val_id=[51], test_id=[51], train_mask=[263], val_mask=[263], test_mask=[263])


In [110]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [111]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [112]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [113]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
   
    for i in reversed(pages_to_remove):
        data = delete_vacant_webpage(data, i)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [114]:
data, clean_text = get_raw_text_webkb('cornell', use_text=True, seed=0)

12 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/cornell/http:^^www.cs.cornell.edu^ not found


In [115]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [116]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [117]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [118]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
   
    for i in reversed(pages_to_remove):
        data = delete_vacant_webpage(data, i)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [119]:
data, clean_text = get_raw_text_webkb('cornell', use_text=True, seed=0)

12 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/cornell/http:^^www.cs.cornell.edu^ not found


In [120]:
print(data)

Data(x=[194, 1703], edge_index=[2, 381], y=[194], num_nodes=194, train_id=[116], val_id=[38], test_id=[38], train_mask=[194], val_mask=[194], test_mask=[194])


In [121]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [122]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [123]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [124]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
   
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [125]:
data, clean_text = get_raw_text_webkb('cornell', use_text=True, seed=0)

12 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/cornell/http:^^www.cs.cornell.edu^ not found
12
Data(x=[194, 1703], edge_index=[2, 381], y=[194], num_nodes=194, train_id=[116], val_id=[38], test_id=[38], train_mask=[194], val_mask=[194], test_mask=[194])


In [126]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [127]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [128]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [129]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [130]:
data, clean_text = get_raw_text_webkb('cornell', use_text=True, seed=0)

12 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/cornell/http:^^www.cs.cornell.edu^ not found
Data(x=[195, 1703], edge_index=[2, 569], y=[195], num_nodes=195, train_id=[117], val_id=[39], test_id=[39], train_mask=[195], val_mask=[195], test_mask=[195])
12
Data(x=[194, 1703], edge_index=[2, 381], y=[194], num_nodes=194, train_id=[116], val_id=[38], test_id=[38], train_mask=[194], val_mask=[194], test_mask=[194])


In [131]:
data, clean_text = get_raw_text_webkb('washington', use_text=True, seed=0)

0 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^metacrawler.cs.washington.edu:8080^ not found
1 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^ not found
4 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^142^currentqtr^ not found
5 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^143^currentqtr^ not found
8 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^322^currentqtr^ not found
14 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^370^currentqtr^ not found
16 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^373^95a^index.html.95a^ not found
18 /storage/qiaoyr/TAPE/dataset/we

In [132]:
data, clean_text = get_raw_text_webkb('texas', use_text=True, seed=0)

0 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/texas/http:^^www.cs.utexas.edu^ not found
186 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/texas/http:^^www.ma.utexas.edu^users^bshults^atp^ not found
Data(x=[187, 1703], edge_index=[2, 578], y=[187], num_nodes=187, train_id=[112], val_id=[37], test_id=[38], train_mask=[187], val_mask=[187], test_mask=[187])
186


IndexError: index 186 is out of bounds for axis 0 with size 112

In [133]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [134]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [135]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [136]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    pages_to_remove = []
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [137]:
data, clean_text = get_raw_text_webkb('texas', use_text=True, seed=0)

0 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/texas/http:^^www.cs.utexas.edu^ not found
186 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/texas/http:^^www.ma.utexas.edu^users^bshults^atp^ not found
Data(x=[187, 1703], edge_index=[2, 578], y=[187], num_nodes=187, train_id=[112], val_id=[37], test_id=[38], train_mask=[187], val_mask=[187], test_mask=[187])


In [138]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [139]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [140]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [141]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    pages_to_remove = []
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [142]:
data, clean_text = get_raw_text_webkb('texas', use_text=True, seed=0)

0 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/texas/http:^^www.cs.utexas.edu^ not found
Data(x=[187, 1703], edge_index=[2, 578], y=[187], num_nodes=187, train_id=[112], val_id=[37], test_id=[38], train_mask=[187], val_mask=[187], test_mask=[187])


In [143]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [144]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [145]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [146]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    pages_to_remove = []
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [147]:
data, clean_text = get_raw_text_webkb('texas', use_text=True, seed=0)

0 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/texas/http:^^www.cs.utexas.edu^ not found
Data(x=[187, 1703], edge_index=[2, 578], y=[187], num_nodes=187, train_id=[112], val_id=[37], test_id=[38], train_mask=[187], val_mask=[187], test_mask=[187])


In [148]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [149]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [150]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [151]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    # pages_to_remove = []
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [152]:
data, clean_text = get_raw_text_webkb('texas', use_text=True, seed=0)

0 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/texas/http:^^www.cs.utexas.edu^ not found
Data(x=[187, 1703], edge_index=[2, 578], y=[187], num_nodes=187, train_id=[112], val_id=[37], test_id=[38], train_mask=[187], val_mask=[187], test_mask=[187])
0
Data(x=[186, 1703], edge_index=[2, 370], y=[186], num_nodes=186, train_id=[111], val_id=[36], test_id=[37], train_mask=[186], val_mask=[186], test_mask=[186])


In [153]:
data, clean_text = get_raw_text_webkb('washington', use_text=True, seed=0)

0 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^metacrawler.cs.washington.edu:8080^ not found
1 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^ not found
4 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^142^currentqtr^ not found
5 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^143^currentqtr^ not found
8 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^322^currentqtr^ not found
14 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^370^currentqtr^ not found
16 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^education^courses^373^95a^index.html.95a^ not found
18 /storage/qiaoyr/TAPE/dataset/we

In [154]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [155]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [156]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [157]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    pages_to_remove = []
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [158]:
data, clean_text = get_raw_text_webkb('washington', use_text=True, seed=0)

1 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^ not found
152 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/student/washington/http:^^www.cs.washington.edu^homes^montgmry^ not found
156 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^notkin^ not found
170 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^ruzzo^ not found
171 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^salesin^ not found
178 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^shapiro^ not found
214 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^research^community-networks^ not found
227 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^resear

In [159]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [160]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [161]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [162]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    pages_to_remove = []
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [163]:
data, clean_text = get_raw_text_webkb('washington', use_text=True, seed=0)

1 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^ not found
152 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/student/washington/http:^^www.cs.washington.edu^homes^montgmry^ not found
156 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^notkin^ not found
170 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^ruzzo^ not found
171 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^salesin^ not found
178 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^shapiro^ not found
214 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^research^community-networks^ not found
227 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^resear

In [164]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [165]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [166]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [167]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    data.train_id = np.delete(data.train_id, i)
    data.val_id = np.delete(data.val_id, i)
    data.test_id = np.delete(data.test_id, i)
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    pages_to_remove = []
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [168]:
data, clean_text = get_raw_text_webkb('washington', use_text=True, seed=0)

1 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^ not found
152 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/student/washington/http:^^www.cs.washington.edu^homes^montgmry^ not found
156 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^notkin^ not found
170 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^ruzzo^ not found
171 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^salesin^ not found
178 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^shapiro^ not found
214 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^research^community-networks^ not found
227 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^resear

In [169]:
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    elif data_name == 'washington':
        pages_to_remove = [1, 152, 156,170,171,178,214,227]
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [170]:
data, clean_text = get_raw_text_webkb('washington', use_text=True, seed=0)

1 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^ not found
152 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/student/washington/http:^^www.cs.washington.edu^homes^montgmry^ not found
156 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^notkin^ not found
170 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^ruzzo^ not found
171 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^salesin^ not found
178 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^shapiro^ not found
214 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^research^community-networks^ not found
227 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^resear

IndexError: index 227 is out of bounds for axis 0 with size 138

In [171]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

3 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/wisconsin/http:^^www.cs.wisc.edu^ not found
5 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/wisconsin/http:^^www.cs.wisc.edu^condor^next.html not found
Data(x=[265, 1703], edge_index=[2, 938], y=[265], num_nodes=265, train_id=[159], val_id=[53], test_id=[53], train_mask=[265], val_mask=[265], test_mask=[265])
5
Data(x=[264, 1703], edge_index=[2, 936], y=[264], num_nodes=264, train_id=[158], val_id=[52], test_id=[52], train_mask=[264], val_mask=[264], test_mask=[264])
3
Data(x=[263, 1703], edge_index=[2, 692], y=[263], num_nodes=263, train_id=[157], val_id=[51], test_id=[51], train_mask=[263], val_mask=[263], test_mask=[263])


In [172]:
data, clean_text = get_raw_text_webkb('washington', use_text=True, seed=0)

1 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^ not found
152 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/student/washington/http:^^www.cs.washington.edu^homes^montgmry^ not found
156 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^notkin^ not found
170 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^ruzzo^ not found
171 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^salesin^ not found
178 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^shapiro^ not found
214 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^research^community-networks^ not found
227 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^resear

IndexError: index 227 is out of bounds for axis 0 with size 138

In [173]:
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    # data.train_id = np.delete(data.train_id, i)
    data.train_id = np.array(data.train_mask.nonzero().flatten())
    data.val_id = np.array(data.val_mask.nonzero().flatten())
    data.test_id = np.array(data.test_mask.nonzero().flatten())
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data

In [174]:
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    elif data_name == 'washington':
        pages_to_remove = [1, 152, 156,170,171,178,214,227]
    print(data)
    for i in reversed(pages_to_remove):
        print(i)
        data = delete_vacant_webpage(data, i)
        print(data)

    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [175]:
data, clean_text = get_raw_text_webkb('washington', use_text=True, seed=0)

1 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/washington/http:^^www.cs.washington.edu^ not found
152 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/student/washington/http:^^www.cs.washington.edu^homes^montgmry^ not found
156 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^notkin^ not found
170 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^ruzzo^ not found
171 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^salesin^ not found
178 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/faculty/washington/http:^^www.cs.washington.edu^homes^shapiro^ not found
214 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^research^community-networks^ not found
227 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/washington/http:^^www.cs.washington.edu^resear

In [176]:
print(data)
print(len(clean_text))

Data(x=[222, 1703], edge_index=[2, 518], y=[222], num_nodes=222, train_id=[133], val_id=[43], test_id=[46], train_mask=[222], val_mask=[222], test_mask=[222])
222


In [177]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

3 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/wisconsin/http:^^www.cs.wisc.edu^ not found
5 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/wisconsin/http:^^www.cs.wisc.edu^condor^next.html not found
Data(x=[265, 1703], edge_index=[2, 938], y=[265], num_nodes=265, train_id=[159], val_id=[53], test_id=[53], train_mask=[265], val_mask=[265], test_mask=[265])
5
Data(x=[264, 1703], edge_index=[2, 936], y=[264], num_nodes=264, train_id=[158], val_id=[53], test_id=[53], train_mask=[264], val_mask=[264], test_mask=[264])
3
Data(x=[263, 1703], edge_index=[2, 692], y=[263], num_nodes=263, train_id=[157], val_id=[53], test_id=[53], train_mask=[263], val_mask=[263], test_mask=[263])


In [178]:
print(data)
print(len(clean_text))

Data(x=[263, 1703], edge_index=[2, 692], y=[263], num_nodes=263, train_id=[157], val_id=[53], test_id=[53], train_mask=[263], val_mask=[263], test_mask=[263])
263


In [179]:
data, clean_text = get_raw_text_webkb('texas', use_text=True, seed=0)

0 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/texas/http:^^www.cs.utexas.edu^ not found
Data(x=[187, 1703], edge_index=[2, 578], y=[187], num_nodes=187, train_id=[112], val_id=[37], test_id=[38], train_mask=[187], val_mask=[187], test_mask=[187])
0
Data(x=[186, 1703], edge_index=[2, 370], y=[186], num_nodes=186, train_id=[112], val_id=[36], test_id=[38], train_mask=[186], val_mask=[186], test_mask=[186])


In [180]:
print(data)
print(len(clean_text))

Data(x=[186, 1703], edge_index=[2, 370], y=[186], num_nodes=186, train_id=[112], val_id=[36], test_id=[38], train_mask=[186], val_mask=[186], test_mask=[186])
186


In [181]:
data, clean_text = get_raw_text_webkb('cornell', use_text=True, seed=0)

12 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/cornell/http:^^www.cs.cornell.edu^ not found
Data(x=[195, 1703], edge_index=[2, 569], y=[195], num_nodes=195, train_id=[117], val_id=[39], test_id=[39], train_mask=[195], val_mask=[195], test_mask=[195])
12
Data(x=[194, 1703], edge_index=[2, 381], y=[194], num_nodes=194, train_id=[116], val_id=[39], test_id=[39], train_mask=[194], val_mask=[194], test_mask=[194])


In [182]:
print(data)
print(len(clean_text))

Data(x=[194, 1703], edge_index=[2, 381], y=[194], num_nodes=194, train_id=[116], val_id=[39], test_id=[39], train_mask=[194], val_mask=[194], test_mask=[194])
194


In [183]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

3 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/wisconsin/http:^^www.cs.wisc.edu^ not found
5 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/wisconsin/http:^^www.cs.wisc.edu^condor^next.html not found
Data(x=[265, 1703], edge_index=[2, 938], y=[265], num_nodes=265, train_id=[159], val_id=[53], test_id=[53], train_mask=[265], val_mask=[265], test_mask=[265])
5
Data(x=[264, 1703], edge_index=[2, 936], y=[264], num_nodes=264, train_id=[158], val_id=[53], test_id=[53], train_mask=[264], val_mask=[264], test_mask=[264])
3
Data(x=[263, 1703], edge_index=[2, 692], y=[263], num_nodes=263, train_id=[157], val_id=[53], test_id=[53], train_mask=[263], val_mask=[263], test_mask=[263])


In [184]:
print(data)
print(len(clean_text))

Data(x=[263, 1703], edge_index=[2, 692], y=[263], num_nodes=263, train_id=[157], val_id=[53], test_id=[53], train_mask=[263], val_mask=[263], test_mask=[263])
263


In [185]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

3 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/course/wisconsin/http:^^www.cs.wisc.edu^ not found
5 /storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw/project/wisconsin/http:^^www.cs.wisc.edu^condor^next.html not found
Data(x=[265, 1703], edge_index=[2, 938], y=[265], num_nodes=265, train_id=[159], val_id=[53], test_id=[53], train_mask=[265], val_mask=[265], test_mask=[265])
5
Data(x=[264, 1703], edge_index=[2, 936], y=[264], num_nodes=264, train_id=[158], val_id=[53], test_id=[53], train_mask=[264], val_mask=[264], test_mask=[264])
3
Data(x=[263, 1703], edge_index=[2, 692], y=[263], num_nodes=263, train_id=[157], val_id=[53], test_id=[53], train_mask=[263], val_mask=[263], test_mask=[263])


In [186]:
print(data)
print(len(clean_text))

Data(x=[263, 1703], edge_index=[2, 692], y=[263], num_nodes=263, train_id=[157], val_id=[53], test_id=[53], train_mask=[263], val_mask=[263], test_mask=[263])
263


In [187]:
import torch
edge_index = data.edge_index
# 假设 edge_index 是一个大小为 (2, edge_num) 的张量
# 假设 n 是节点的数量

# 获取节点的数量 n
n = data.nodes_num  # 你需要将这个值替换为你实际的节点数量

# 检查 edge_index 中的边是否超出节点范围
out_of_range_edges = (edge_index[0] < 0) | (edge_index[0] >= n) | (edge_index[1] < 0) | (edge_index[1] >= n)

# 如果 out_of_range_edges 中存在 True 值，表示有边超出节点范围
if out_of_range_edges.any():
    print("存在超出节点范围的边。")
else:
    print("所有边都在节点范围内。")

AttributeError: 'GlobalStorage' object has no attribute 'nodes_num'

In [188]:
import torch
edge_index = data.edge_index
# 假设 edge_index 是一个大小为 (2, edge_num) 的张量
# 假设 n 是节点的数量

# 获取节点的数量 n
n = data.num_nodes  # 你需要将这个值替换为你实际的节点数量

# 检查 edge_index 中的边是否超出节点范围
out_of_range_edges = (edge_index[0] < 0) | (edge_index[0] >= n) | (edge_index[1] < 0) | (edge_index[1] >= n)

# 如果 out_of_range_edges 中存在 True 值，表示有边超出节点范围
if out_of_range_edges.any():
    print("存在超出节点范围的边。")
else:
    print("所有边都在节点范围内。")

存在超出节点范围的边。


In [189]:
import torch
edge_index = data.edge_index
# 假设 edge_index 是一个大小为 (2, edge_num) 的张量
# 假设 n 是节点的数量

# 获取节点的数量 n
n = data.num_nodes  # 你需要将这个值替换为你实际的节点数量

# 检查 edge_index 中的边是否超出节点范围
out_of_range_edges = (edge_index[0] < 0) | (edge_index[0] >= n) | (edge_index[1] < 0) | (edge_index[1] >= n)

# 如果 out_of_range_edges 中存在 True 值，表示有边超出节点范围
if out_of_range_edges.any():
    print(out_of_range_edges)
    print("存在超出节点范围的边。")
else:
    print("所有边都在节点范围内。")

tensor([False, False, False,  True, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, False, False,
        False, False, False, False, False, False, False, False, 

In [190]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [191]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [192]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [193]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''

"\ndef get_raw_text_webkb(data_name, use_text=False, seed=0):\n    data, data_webpage_url = get_webkb_casestudy(data_name, seed)\n    if not use_text:\n        return data, None\n    text = []\n    clean_text = []\n    category_list = ['course', 'faculty', 'student','project', 'staff']\n    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'\n    # print(data.y.shape)\n    # for category in category_list:\n        # webpages = os.listdir('{}/{}'.format(path, category))\n    for i, url in enumerate(data_webpage_url):\n        label = data.y[i]\n        url = url.replace('/', '^')\n        if not url.endswith('.html'):\n            url += '^'\n        try:\n            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)\n            t = open(file_path, 'r', errors='ignore').read()\n            text.append(t)\n        except:\n            print(i, file_path, 'not found') ###TODO\n            text.append('')\n    for t in text:\n        clean = html_process

In [194]:
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    # data.train_id = np.delete(data.train_id, i)
    data.train_id = np.array(data.train_mask.nonzero().flatten())
    data.val_id = np.array(data.val_mask.nonzero().flatten())
    data.test_id = np.array(data.test_mask.nonzero().flatten())
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data

In [195]:
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            # print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    elif data_name == 'washington':
        pages_to_remove = [1, 152, 156,170,171,178,214,227]

    for i in reversed(pages_to_remove):
        data = delete_vacant_webpage(data, i)
    edge_index = data.edge_index
    out_of_range_edges = (edge_index[0] < 0) | (edge_index[0] >= n) | (edge_index[1] < 0) | (edge_index[1] >= n)
    data.edge_index = data.edge_index[:,~out_of_range_edges]
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [196]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

In [197]:
print(data)
print(len(clean_text))

Data(x=[263, 1703], edge_index=[2, 690], y=[263], num_nodes=263, train_id=[157], val_id=[53], test_id=[53], train_mask=[263], val_mask=[263], test_mask=[263])
263


In [198]:
import torch
edge_index = data.edge_index
# 假设 edge_index 是一个大小为 (2, edge_num) 的张量
# 假设 n 是节点的数量

# 获取节点的数量 n
n = data.num_nodes  # 你需要将这个值替换为你实际的节点数量

# 检查 edge_index 中的边是否超出节点范围
out_of_range_edges = (edge_index[0] < 0) | (edge_index[0] >= n) | (edge_index[1] < 0) | (edge_index[1] >= n)

# 如果 out_of_range_edges 中存在 True 值，表示有边超出节点范围
if out_of_range_edges.any():
    print(out_of_range_edges)
    print("存在超出节点范围的边。")
else:
    print("所有边都在节点范围内。")

所有边都在节点范围内。


In [199]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [200]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [201]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [202]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''

"\ndef get_raw_text_webkb(data_name, use_text=False, seed=0):\n    data, data_webpage_url = get_webkb_casestudy(data_name, seed)\n    if not use_text:\n        return data, None\n    text = []\n    clean_text = []\n    category_list = ['course', 'faculty', 'student','project', 'staff']\n    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'\n    # print(data.y.shape)\n    # for category in category_list:\n        # webpages = os.listdir('{}/{}'.format(path, category))\n    for i, url in enumerate(data_webpage_url):\n        label = data.y[i]\n        url = url.replace('/', '^')\n        if not url.endswith('.html'):\n            url += '^'\n        try:\n            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)\n            t = open(file_path, 'r', errors='ignore').read()\n            text.append(t)\n        except:\n            print(i, file_path, 'not found') ###TODO\n            text.append('')\n    for t in text:\n        clean = html_process

In [203]:
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    # data.train_id = np.delete(data.train_id, i)
    data.train_id = np.array(data.train_mask.nonzero().flatten())
    data.val_id = np.array(data.val_mask.nonzero().flatten())
    data.test_id = np.array(data.test_mask.nonzero().flatten())
    data.num_nodes -= 1
    #mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    #data.edge_index = data.edge_index[:,~mask] 
    return data

In [204]:
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            # print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    elif data_name == 'washington':
        pages_to_remove = [1, 152, 156,170,171,178,214,227]

    for i in reversed(pages_to_remove):
        data = delete_vacant_webpage(data, i)
    edge_index = data.edge_index
    out_of_range_edges = (edge_index[0] < 0) | (edge_index[0] >= data.num_nodes) | (edge_index[1] < 0) | (edge_index[1] >= data.num_nodes)
    print(out_of_range_edges.sum())
    print(data.edge_index.shape)
    data.edge_index = data.edge_index[:,~out_of_range_edges]
    print(data.edge_index.shape)
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [205]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

tensor(4)
torch.Size([2, 938])
torch.Size([2, 934])


In [206]:
import numpy as np
import torch
import random
import torch_geometric.transforms as T
from torch_geometric.data import Data
import pandas as pd
import os
from pathlib import Path
import re


def parse_webkb(data_name):
    path = f'/storage/qiaoyr/TAPE/dataset/web_kb/WebKB/{data_name}'
    webpage_features_labels = np.genfromtxt("{}.content".format(path), dtype=np.dtype(str))
    data_X = webpage_features_labels[:, 1:-1].astype(np.float32)
    labels = webpage_features_labels[:, -1]
    #print(labels)
    class_map = {x: i for i, x in enumerate(['course', 'faculty', 'student','project', 'staff'])}  
    #print(class_map)
    data_Y = np.array([class_map[x] for x in labels])
    data_webpage_url = webpage_features_labels[:, 0]
    # data_webpage_id = np.arange(len(data_webpage_url))
    data_webpage_id_map = {x: i for i, x in enumerate(data_webpage_url)}
    edges_unordered = np.genfromtxt("{}.cites".format(path), dtype=np.dtype(str))
    '''
    for i in range(edges_unordered.shape[0]):
        if edges_unordered[i][0] == edges_unordered[i][1]:
            print('self loop:',edges_unordered[i][0])
    '''
    edges = np.array(list(map(data_webpage_id_map.get, edges_unordered.flatten())), dtype=np.int32).reshape(edges_unordered.shape)
    #print(edges.shape)
    data_edges = np.array(edges[~(edges == None).max(1)], dtype=np.int32)
    #print(data_edges.shape)
    data_edges = np.vstack((data_edges, np.fliplr(data_edges)))
    #print(data_edges.shape)

    return data_X, data_Y, data_webpage_url, np.unique(data_edges, axis=0).transpose()

In [207]:
'''
X, Y, webpage_id, edges = parse_wisconsin()
print(X.shape)
print(Y.shape)
print(webpage_id.shape)
print(edges.shape)
'''

'\nX, Y, webpage_id, edges = parse_wisconsin()\nprint(X.shape)\nprint(Y.shape)\nprint(webpage_id.shape)\nprint(edges.shape)\n'

In [208]:
def get_webkb_casestudy(data_name, SEED=0):
    data_X, data_Y, data_webpage_url, data_edges = parse_webkb(data_name)

    torch.manual_seed(SEED)
    if torch.cuda.is_available():
        torch.cuda.manual_seed(SEED)
    np.random.seed(SEED)  # Numpy module.
    random.seed(SEED)  # Python random module.

    # load data
    data = Data(x=torch.tensor(data_X).float(),
                 edge_index=torch.tensor(data_edges).long(), 
                 y=torch.tensor(data_Y).long(),
                 num_nodes=len(data_Y))
    # split data
    node_id = np.arange(data.num_nodes)
    np.random.shuffle(node_id)

    data.train_id = np.sort(node_id[:int(data.num_nodes * 0.6)])
    data.val_id = np.sort(
        node_id[int(data.num_nodes * 0.6):int(data.num_nodes * 0.8)])
    data.test_id = np.sort(node_id[int(data.num_nodes * 0.8):])

    data.train_mask = torch.tensor(
        [x in data.train_id for x in range(data.num_nodes)])
    data.val_mask = torch.tensor(
        [x in data.val_id for x in range(data.num_nodes)])
    data.test_mask = torch.tensor(
        [x in data.test_id for x in range(data.num_nodes)])
    
    return data, data_webpage_url

In [209]:
def html_process(input_string):
    # 使用正则表达式去掉所有 HTML 标签
    lines = input_string.split('\n')
    clean_text = ' '.join(lines[6:])

    #non_empty_lines = [line for line in clean_text if line.strip()]
    
    #tag_list = ['<.*?>', r'<ahref\s*=\s*".*?"\s*>', r'<a\shref\s*=\s*".*?"\s*>', r'<meta *.html>', r'<img src*>', r'<IMG SRC*">', r'<bodyBACKGROUND*>', r'<imgsrc*>', r'<AHREF*>', '\n']
    tag_list = ['<.*?>', '\n', r'<a\s+href\s*=\s*".*?"\s*>', r'<IMG\s+SRC\s*=\s*".*?"\s+ALT\s*=\s*".*?"\s*>']
    for tag in tag_list:
        clean_text = re.sub(tag, '', clean_text, flags=re.IGNORECASE)
    clean_text = re.sub(r'\s+', ' ', clean_text)
    return clean_text
'''
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        if not url.endswith('.html'):
            url += '^'
        try:
            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        except:
            print(i, file_path, 'not found') ###TODO
            text.append('')
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text
'''

"\ndef get_raw_text_webkb(data_name, use_text=False, seed=0):\n    data, data_webpage_url = get_webkb_casestudy(data_name, seed)\n    if not use_text:\n        return data, None\n    text = []\n    clean_text = []\n    category_list = ['course', 'faculty', 'student','project', 'staff']\n    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'\n    # print(data.y.shape)\n    # for category in category_list:\n        # webpages = os.listdir('{}/{}'.format(path, category))\n    for i, url in enumerate(data_webpage_url):\n        label = data.y[i]\n        url = url.replace('/', '^')\n        if not url.endswith('.html'):\n            url += '^'\n        try:\n            file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)\n            t = open(file_path, 'r', errors='ignore').read()\n            text.append(t)\n        except:\n            print(i, file_path, 'not found') ###TODO\n            text.append('')\n    for t in text:\n        clean = html_process

In [210]:
def delete_vacant_webpage(data, i):
    data.y = torch.cat((data.y[:i], data.y[(i+1):]))
    # data.edge_index = torch.cat((data.edge_index[:,:i], data.edge_index[:,(i+1):]), dim=1)
    data.x = torch.cat((data.x[:i], data.x[(i+1):]))
    data.train_mask = torch.cat((data.train_mask[:i], data.train_mask[(i+1):]))
    data.val_mask = torch.cat((data.val_mask[:i], data.val_mask[(i+1):]))
    data.test_mask = torch.cat((data.test_mask[:i], data.test_mask[(i+1):]))
    # data.train_id = np.delete(data.train_id, i)
    data.train_id = np.array(data.train_mask.nonzero().flatten())
    data.val_id = np.array(data.val_mask.nonzero().flatten())
    data.test_id = np.array(data.test_mask.nonzero().flatten())
    data.num_nodes -= 1
    mask = (data.edge_index[0] == i) | (data.edge_index[1] == i)
    data.edge_index = data.edge_index[:,~mask] 
    return data

In [211]:
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            # print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    elif data_name == 'washington':
        pages_to_remove = [1, 152, 156,170,171,178,214,227]

    for i in reversed(pages_to_remove):
        data = delete_vacant_webpage(data, i)
    edge_index = data.edge_index
    out_of_range_edges = (edge_index[0] < 0) | (edge_index[0] >= data.num_nodes) | (edge_index[1] < 0) | (edge_index[1] >= data.num_nodes)
    print(out_of_range_edges.sum())
    print(data.edge_index.shape)
    data.edge_index = data.edge_index[:,~out_of_range_edges]
    print(data.edge_index.shape)
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [212]:
def get_raw_text_webkb(data_name, use_text=False, seed=0):
    data, data_webpage_url = get_webkb_casestudy(data_name, seed)
    if not use_text:
        return data, None
    text = []
    clean_text = []
    category_list = ['course', 'faculty', 'student','project', 'staff']
    path = '/storage/qiaoyr/TAPE/dataset/web_kb_orig/webkb_raw'
    # print(data.y.shape)
    # for category in category_list:
        # webpages = os.listdir('{}/{}'.format(path, category))
    for i, url in enumerate(data_webpage_url):
        label = data.y[i]
        url = url.replace('/', '^')
        pages_to_remove = []
        if not url.endswith('.html'):
            url += '^'
        file_path = '{}/{}/{}/{}'.format(path, category_list[label], data_name, url)
        if os.path.exists(file_path):
            t = open(file_path, 'r', errors='ignore').read()
            text.append(t)
        else:
            pages_to_remove.append(i)
            # print(i, file_path, 'not found') ###TODO
            # text.append('')
            
    if data_name == 'wisconsin':
        pages_to_remove = [3,5]
    elif data_name == 'cornell':
        pages_to_remove = [12]
    elif data_name == 'texas':
        pages_to_remove = [0]
    elif data_name == 'washington':
        pages_to_remove = [1, 152, 156,170,171,178,214,227]

    for i in reversed(pages_to_remove):
        data = delete_vacant_webpage(data, i)
    edge_index = data.edge_index
    out_of_range_edges = (edge_index[0] < 0) | (edge_index[0] >= data.num_nodes) | (edge_index[1] < 0) | (edge_index[1] >= data.num_nodes)
    print(out_of_range_edges.sum())
    print(data.edge_index.shape)
    data.edge_index = data.edge_index[:,~out_of_range_edges]
    print(data.edge_index.shape)
    for t in text:
        clean = html_process(t)
        clean_text.append(clean)
    return data, clean_text

In [213]:
data, clean_text = get_raw_text_webkb('wisconsin', use_text=True, seed=0)

tensor(2)
torch.Size([2, 692])
torch.Size([2, 690])
