<a href="https://colab.research.google.com/github/NormBill/Data_Fine-grained-wafer-defect-classification-based-on-backdoor-adjustment-and-contrastive-learning/blob/main/Cross_Market.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

#Data Processing

In [1]:
# import dataset XMarket
import os
import urllib.request as urlreq

tgt_markets = ['ca', 'de', 'fr', 'in', 'jp', 'mx', 'uk', 'us']
tgt_cat = 'Electronics'

fix_url = 'https://ciir.cs.umass.edu/downloads/XMarket/FULL/'
orig_data_dl = 'DATA2/orig_data'
proc_data_out = 'DATA2/proc_data'
if not os.path.exists(orig_data_dl):
    os.makedirs(orig_data_dl)
if not os.path.exists(proc_data_out):
    os.makedirs(proc_data_out)
    
for tgt_market in tgt_markets:
    cur_url = f'{fix_url}{tgt_market}/{tgt_cat}/ratings_{tgt_market}_{tgt_cat}.txt.gz'
    urlreq.urlretrieve(cur_url, f'{orig_data_dl}/ratings_{tgt_market}_{tgt_cat}.txt.gz')
    print(f'Done: {cur_url}')
    cur_url_review = f'{fix_url}{tgt_market}/{tgt_cat}/reviews_{tgt_market}_{tgt_cat}.json.gz'
    urlreq.urlretrieve(cur_url_review, f'{orig_data_dl}/reviews_{tgt_market}_{tgt_cat}.json.gz')
    print(f'Done: {cur_url_review}')

Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/ca/Electronics/ratings_ca_Electronics.txt.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/ca/Electronics/reviews_ca_Electronics.json.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/de/Electronics/ratings_de_Electronics.txt.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/de/Electronics/reviews_de_Electronics.json.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/fr/Electronics/ratings_fr_Electronics.txt.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/fr/Electronics/reviews_fr_Electronics.json.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/in/Electronics/ratings_in_Electronics.txt.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/in/Electronics/reviews_in_Electronics.json.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/jp/Electronics/ratings_jp_Electronics.txt.gz
Done: https://ciir.cs.umass.edu/downloads/XMarket/FULL/jp/Electronics/reviews_jp_Electronics.jso

In [2]:
# Data preprocessing
# set thr for K-core data cleaning. We use 5core, +1 valid and +1 test-> so at this point we filter 7core
# set thresholds for item interaction 
user_thr = 7
item_thr = 7

# one can iterate a few times, we only perform one time filter
def get_kcore(ratings_all, user_thr, item_thr, repeat=1):
    for i in range(repeat):
        ratings_all.reset_index(drop=True, inplace=True)
        ratings_all = ratings_all.loc[ratings_all.groupby("itemId").filter(lambda x: len(x) >= item_thr).index]
        ratings_all.reset_index(drop=True, inplace=True)
        ratings_all = ratings_all.loc[ratings_all.groupby("userId").filter(lambda x: len(x) >= user_thr).index]
        ratings_all.reset_index(drop=True, inplace=True)
    return ratings_all

def rating_stats(ratings_all):
    n_rating = ratings_all.shape[0]
    n_user = len(set(ratings_all['userId'].unique()))
    n_item = len(set(ratings_all['itemId'].unique()))
    
    if (n_user*n_item)!=0:
        density = round((n_rating/(n_user*n_item) )*100, 5)
    else:
        density = 0
        
    return { '#users': n_user,
        '#items': n_item,
        '#rates': n_rating,
        'dens\%': density,
        }

In [3]:
# Load and Clean US (base) —— rating 
# import pandas as pd

# tgt_market = 'us'
# us_ratings_file = f'{orig_data_dl}/ratings_{tgt_market}_{tgt_cat}.txt.gz'

# us_df = pd.read_csv(us_ratings_file, compression='gzip', header=None, sep=' ', names=["userId", "itemId", "rate", "date"] )
# us_df_7core = get_kcore(us_df, user_thr=user_thr, item_thr=item_thr)

# us_user_thr = 10
# us_item_thr = 25
# us_df_10core = get_kcore(us_df, user_thr=us_user_thr, item_thr=us_item_thr)

# # write us data 
# us_df_7core.to_csv(f'{proc_data_out}/{tgt_market}_5core.txt', index=False, sep=' ')
# us_df_10core.to_csv(f'{proc_data_out}/{tgt_market}_10core.txt', index=False, sep=' ')

# print('US Market data stats:')
# print(rating_stats(us_df))
# print(rating_stats(us_df_7core))
# print(rating_stats(us_df_10core))
import pandas as pd

tgt_market = 'us'
us_ratings_file = f'{orig_data_dl}/ratings_{tgt_market}_{tgt_cat}.txt.gz'

us_df = pd.read_csv(us_ratings_file, compression='gzip', header=None, sep=' ', names=["userId", "itemId", "rate", "date"] )

# Random sample 0.5% of the data
us_df_sample = us_df.sample(frac=0.005)

# write us sampled data 
us_df_sample.to_csv(f'{proc_data_out}/{tgt_market}_sample.txt', index=False, sep=' ')

print('US Market data stats:')
print(rating_stats(us_df))
print(rating_stats(us_df_sample))

US Market data stats:
{'#users': 2784128, '#items': 35943, '#rates': 4169476, 'dens\\%': 0.00417}
{'#users': 20732, '#items': 12939, '#rates': 20847, 'dens\\%': 0.00777}


In [4]:
# Load and Clean target —— rating
# us_items_set = set(us_df_10core['itemId'].unique())

# for tgt_market in tgt_markets:
#     if tgt_market=='us':
#         continue
#     #read market ratings
#     cur_ratings_file = f'{orig_data_dl}/ratings_{tgt_market}_{tgt_cat}.txt.gz'
#     cur_df = pd.read_csv(cur_ratings_file, compression='gzip', header=None, sep=' ', names=["userId", "itemId", "rate", "date"] )
    
#     # item exist for us 
#     cur_df = cur_df.loc[cur_df['itemId'].isin( us_items_set )] 
#     cur_df_7core = get_kcore(cur_df, user_thr=user_thr, item_thr=item_thr)
#     cur_df_7core.to_csv(f'{proc_data_out}/{tgt_market}_5core.txt', index=False, sep=' ')
#     print(f'\n-- {tgt_market} stats: ')
#     print(rating_stats(cur_df_7core))
us_items_set = set(us_df_sample['itemId'].unique())

for tgt_market in tgt_markets:
    if tgt_market=='us':
        continue
    # read market ratings
    cur_ratings_file = f'{orig_data_dl}/ratings_{tgt_market}_{tgt_cat}.txt.gz'
    cur_df = pd.read_csv(cur_ratings_file, compression='gzip', header=None, sep=' ', names=["userId", "itemId", "rate", "date"] )
    
    # items that exist in US
    cur_df = cur_df.loc[cur_df['itemId'].isin( us_items_set )] 
    
    # Random sample 0.5% of the data
    cur_df_sample = cur_df.sample(frac=0.005)
    
    cur_df_sample.to_csv(f'{proc_data_out}/{tgt_market}_sample.txt', index=False, sep=' ')
    print(f'\n-- {tgt_market} stats: ')
    print(rating_stats(cur_df_sample))


-- ca stats: 
{'#users': 1730, '#items': 982, '#rates': 1734, 'dens\\%': 0.10207}

-- de stats: 
{'#users': 1267, '#items': 523, '#rates': 1269, 'dens\\%': 0.19151}

-- fr stats: 
{'#users': 921, '#items': 392, '#rates': 922, 'dens\\%': 0.25538}

-- in stats: 
{'#users': 661, '#items': 213, '#rates': 662, 'dens\\%': 0.47019}

-- jp stats: 
{'#users': 418, '#items': 223, '#rates': 418, 'dens\\%': 0.44843}

-- mx stats: 
{'#users': 446, '#items': 268, '#rates': 451, 'dens\\%': 0.37732}

-- uk stats: 
{'#users': 1823, '#items': 764, '#rates': 1831, 'dens\\%': 0.13146}


In [6]:
# Load and Clean US (base) —— review
import numpy as np  
from itertools import islice
import gzip
import json

tgt_market = 'us'
us_reviews_file = f'{orig_data_dl}/reviews_{tgt_market}_{tgt_cat}.json.gz'

# read market reviews
sample_ratio = 0.03  

# unzip
with gzip.open(us_reviews_file, 'rt') as f:
  # read json by line
  lines = list(islice(f, int(sample_ratio * sum(1 for _ in f))))

cur = [json.loads(line) for line in lines]


us_review_sum = []
for row in cur:
    for dict in row[0]:
      us_review_sum.append(dict["summary"])

us_review_sum = pd.DataFrame(us_review_sum)    
us_review_sum.to_csv(f'{proc_data_out}/{tgt_market}_reviews.txt', index=False, sep=' ')


In [7]:
# Load and Clean target —— review 
from itertools import islice

us_items_set = pd.DataFrame(us_items_set)

for tgt_market in tgt_markets:
    if tgt_market=='us':
        continue
    # read market reviews
    cur_reviews_file = f'{orig_data_dl}/reviews_{tgt_market}_{tgt_cat}.json.gz'
    sample_ratio = 0.03  

    # unzip
    with gzip.open(cur_reviews_file, 'rt') as f:
        # read json by line
        lines = list(islice(f, int(sample_ratio * sum(1 for _ in f))))

    cur = [json.loads(line) for line in lines]
    
    for row in cur:
      row[0] = [dict for dict in row[0] if dict["asin"] in us_items_set]


    cur_review_sum = []
    for row in cur:
        for dict in row:
          cur_review_sum.append(dict["summary"])

    cur_review_sum = pd.DataFrame(cur_review_sum)
    
    cur_review_sum.to_csv(f'{proc_data_out}/{tgt_market}_reviews.txt', index=False, sep=' ')

#Model Construction

##Predefinition

In [9]:
!pip install args
!pip install bpemb

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting args
  Downloading args-0.1.0.tar.gz (3.0 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: args
  Building wheel for args (setup.py) ... [?25l[?25hdone
  Created wheel for args: filename=args-0.1.0-py3-none-any.whl size=3320 sha256=e67770df62fee54bf790afed990b538332752918f85bc5eff4829f7f3a93d403
  Stored in directory: /root/.cache/pip/wheels/18/d7/bc/7b88d8405d97070a1a62712fd639ea0ad8d14b3dd74075cca6
Successfully built args
Installing collected packages: args
Successfully installed args-0.1.0
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting bpemb
  Downloading bpemb-0.3.4-py3-none-any.whl (19 kB)
Collecting sentencepiece (from bpemb)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━

In [14]:
import argparse

data_info_n2m = {
	"mind_0_0shot": {
		'user_n': 1000,
		'news_n': 5000,
		'max_title_token': 30,
	},

	"mind_200_4shot":{
		'user_n': 2500,
		'news_n': 8000,
		'max_title_token': 30,
	},

	"adressa": {
		'user_n' : 10000,
		'news_n' : 2097,
		'max_title_token': 30,
	}
}

data_info_m2n = {
	"mind_0_0shot": {
		'user_n': 1000,
		'news_n': 708,
		'max_title_token': 30,
	},

	"mind_200_2shot": {
		'user_n': 1000,
		'news_n': 646,
		'max_title_token': 30,
	},

	"mind_200_4shot":{
		'user_n': 1000,
		'news_n': 708,
		'max_title_token': 30,
	},

	"mind_200_6shot": {
		'user_n': 1000,
		'news_n': 692,
		'max_title_token': 30,
	},

	"mind_5500_10shot": {
		'user_n': 13500,
		'news_n': 1862,
		'max_title_token': 30,
	},

	"adressa": {
		'user_n' : 10000,
		'news_n' : 31099,
		'max_title_token': 30,
	}
}


pretrained_embed = 300
deepwalk_embed = 300
embed_d = 300

def read_args(db='mind', lr=0.0002):

	parser = argparse.ArgumentParser()
	parser.add_argument('--few_shot', type=str, default='_0_0shot')  # ''/'_100'/'_500'/'_1000'
	parser.add_argument('--db', type = str, default = db,
				   help = 'node net dimension')

	parser.add_argument('--embed_d', type = int, default = embed_d,
				   help = 'embedding dimension')

	parser.add_argument('--lr', type = int, default = lr,
				   help = 'learning rate')

	parser.add_argument('--batch_s', type = int, default = 20000,
				   help = 'batch size')

	if db == 'mind':
		mini_batch_s = 80
	else:
		mini_batch_s = 80

	parser.add_argument('--mini_batch_s', type = int, default = mini_batch_s,
				   help = 'mini batch size')

	parser.add_argument('--train_iter_n', type = int, default = 10,
				   help = 'max number of training iteration')
	parser.add_argument("--random_seed", default = 42, type = int)

	parser.add_argument('--save_model_freq', type = float, default = 1,
				   help = 'number of iterations to save model')
	parser.add_argument("--cuda", default = 2, type = int)
	parser.add_argument("--checkpoint", default = '', type=str)
	parser.add_argument("--npratio", default=4, type=int)

	"""
	Some other parameters needed to be test
	"""
	parser.add_argument("--save_emb", default=0, type=int)

	# ablation studies for different modules
	parser.add_argument("--use_PLM", default=1, type=int)
	parser.add_argument("--use_KG", default=0, type=int)
	# few-shot setting
	parser.add_argument("--few_shot_method", default=2, type=int)  # 0-only mind 1-mind+adressa 2-ours
	# other domains
	parser.add_argument("--range", default="Model/engTonor", type=str) # Model/data  Model/engTonor

	parser.add_argument("--align_mode", default="no_freeze", type=str)

	parser.add_argument("--loss_weight", default=0.2, type=float)
	parser.add_argument("--loss_weight_align", default=1, type=float)
	parser.add_argument("--news_cls_iter", default=1, type=int)
	# target domain sim
	parser.add_argument("--target_domain_sim", default=0.6, type=float)

	# top-n plm news
	parser.add_argument("--topn", default=1, type=int)


	args = parser.parse_args()

	if db == 'mind':
		data_key = db + args.few_shot
	else:
		data_key = db

	range = args.range
	if range == 'Model/data' or range == 'Model/new_data' or range != 'Model/engTonor':
		data_info = data_info_n2m
		if data_key in data_info:
			args.A_n = data_info[data_key]['user_n']
			args.P_n = data_info[data_key]['news_n']
			args.max_title_token = data_info[data_key]['max_title_token']
		else:
			args.A_n = data_info["mind_200_4shot"]['user_n']
			args.P_n = data_info["mind_200_4shot"]['news_n']
			args.max_title_token = data_info["mind_200_4shot"]['max_title_token']
	else:
		data_info = data_info_m2n

		if data_key in data_info:
			args.A_n = data_info[data_key]['user_n']
			args.P_n = data_info[data_key]['news_n']
			args.max_title_token = data_info[data_key]['max_title_token']
		else:
			args.A_n = data_info["mind_5500_10shot"]['user_n']
			args.P_n = data_info["mind_5500_10shot"]['news_n']
			args.max_title_token = data_info["mind_5500_10shot"]['max_title_token']

	args.data_path = '../{}/{}/'.format(range, data_key)
	args.model_path = './model_save/{}/'.format(data_key)
	return args

In [15]:
class AdditiveAttention(torch.nn.Module):
    def __init__(self, in_dim=100, v_size=200):
        super().__init__()

        self.in_dim = in_dim
        self.v_size = v_size
        # self.v = torch.nn.Parameter(torch.rand(self.v_size))
        self.proj = nn.Sequential(nn.Linear(self.in_dim, self.v_size), nn.Tanh())
        self.proj_v = nn.Linear(self.v_size, 1)

    def forward(self, context, mask=None):
        """Additive Attention

        Args:
            context (tensor): [B, seq_len, in_dim]

        Returns:
            outputs, weights: [B, seq_len, out_dim], [B, seq_len]
        """
        # weights = self.proj(context) @ self.v
        weights = self.proj_v(self.proj(context)).squeeze(-1)    # B*seq

        if mask is not None:
            weights = weights.masked_fill(mask, float("-inf"))
        weights = torch.softmax(weights, dim=-1) # [B, seq_len]
        return torch.bmm(weights.unsqueeze(1), context).squeeze(1), weights # [B, 1, seq_len], [B, seq_len, dim]


In [17]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import tensor
import numpy as np
import math
from bpemb import BPEmb
import random
from tqdm import tqdm
import copy
import pandas as pd
import pdb
class PredLayer(nn.Module):
    """
    Prediction layer (cross_entropy or adaptive_softmax).
    """
    def __init__(self, n_words, emb_dim, asm=False):
        super().__init__()
        self.n_words = n_words
        self.emb_dim = emb_dim

        # if asm is False:
        self.proj = nn.Linear(self.emb_dim, self.n_words, bias=True)
        # else:
        #     self.proj = nn.AdaptiveLogSoftmaxWithLoss(
        #         in_features=self.emb_dim,
        #         n_classes=self.n_words,
        #         cutoffs=params.asm_cutoffs,
        #         div_value=params.asm_div_value,
        #         head_bias=True,  # default is False
        #     )

    def forward(self, x, y, get_scores=False):
        """
        Compute the loss, and optionally the scores.
        """
        scores = self.proj(x).view(-1, self.n_words)
        loss = F.cross_entropy(scores, y, reduction='mean')
        return scores, loss

##Review Embedding


In [21]:
bpemb = BPEmb(lang="en", vs=1000, dim=300)

file_path = "/content/DATA2/proc_data/us_reviews.txt"
with open(file_path, "r", encoding="utf-8") as file:
    sentences = [line.strip() for line in file]

# Embed the sentences
embeddings = []
for sentence in sentences:
    encoded_sentence = bpemb.encode(sentence)
    embedding = np.mean(bpemb.vectors[encoded_sentence], axis=0)
    embeddings.append(embedding)

# Convert the embeddings list to a NumPy array
embeddings = np.array(embeddings)

# Save the embeddings to a text file
output_file = "us_embeddings.txt"
np.savetxt(output_file, embeddings)

print("Embeddings have been saved to the file:", output_file)

Embeddings have been saved to the file: us_embeddings.txt


In [29]:
bpemb = BPEmb(lang="en", vs=1000, dim=300)

for tgt_market in tgt_markets:
    file_path = f'/content/DATA2/proc_data/{tgt_market}_reviews.txt'
    with open(file_path, "r", encoding="utf-8") as file:
        sentences = [line.strip() for line in file]

    # Embed the sentences
    embeddings = []
    for sentence in sentences:
      embedding = bpemb.embed(sentence)
      mean_embedding = np.mean(embedding, axis=0)
      embeddings.append(mean_embedding)

    # Convert the embeddings list to a NumPy array
    embeddings = np.array(embeddings)

    # Save the embeddings to a text file
    output_file = f'{tgt_market}_embeddings.txt'
    np.savetxt(output_file, embeddings)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = um.true_divide(


##HetGNN Model