In [None]:
# Mount Google Drive so the files under /content/drive become readable
from google.colab import drive

drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Install required libraries
!pip install transformers torch jieba
!pip install chardet
!pip install lime

Collecting lime
  Downloading lime-0.2.0.1.tar.gz (275 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m275.7/275.7 kB[0m [31m1.3 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: lime
  Building wheel for lime (setup.py) ... [?25l[?25hdone
  Created wheel for lime: filename=lime-0.2.0.1-py3-none-any.whl size=283834 sha256=6b934a2252f9cd7df01bf5075ff2c5921148e2b1b327347e02ad01f70369b841
  Stored in directory: /root/.cache/pip/wheels/fd/a2/af/9ac0a1a85a27f314a06b39e1f492bee1547d52549a4606ed89
Successfully built lime
Installing collected packages: lime
Successfully installed lime-0.2.0.1


In [None]:
# Import necessary libraries
import torch
from transformers import DistilBertTokenizer, DistilBertForSequenceClassification, AdamW
from torch.utils.data import Dataset, DataLoader
import pandas as pd
import jieba
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from tqdm.notebook import tqdm
import os
import glob
import chardet

In [None]:
# Function to load and preprocess the dataset
def _read_text(path, encodings):
    """Return the text of ``path`` decoded with the first encoding that works, else None."""
    for candidate in encodings:
        try:
            with open(path, 'r', encoding=candidate) as f:
                # Read the entire content of the file as one string
                return f.read()
        except UnicodeDecodeError:
            continue
    return None


def load_dataset(directory, encoding='gb18030', fallback_encodings=('utf-8',)):
    """Load every ``*.txt`` file in ``directory`` as one review with a sentiment label.

    Args:
        directory: Folder that directly contains the review ``.txt`` files.
        encoding: Encoding tried first for every file.
        fallback_encodings: Encodings tried, in order, when ``encoding`` cannot
            decode a file. Pass ``()`` to disable the fallback.

    Returns:
        tuple[list[str], list[int]]: The stripped, non-empty review texts and
        their labels (1 = positive, 0 = negative), ordered by file path.

    A file is labelled positive when 'pos' occurs in its file name or in the
    name of the folder that directly contains it. Directories further up the
    path are ignored so an unrelated ancestor folder cannot flip the label.
    Files that no encoding can decode are reported and skipped.
    """
    # str() keeps pathlib.Path arguments working; escape() stops characters such
    # as '[' in the directory name from being read as glob wildcards.
    pattern = f"{glob.escape(str(directory))}/*.txt"
    reviews = []
    labels = []
    # glob returns files in filesystem order; sorting makes the result (and any
    # seeded split built from it) the same on every run.
    for file in sorted(glob.glob(pattern)):
        content = _read_text(file, (encoding, *fallback_encodings))
        if content is None:
            print(f"Error decoding file: {file}")
            continue
        content = content.strip()
        if not content:  # Skip files that are empty after stripping
            continue
        file_name = os.path.basename(file)
        parent_name = os.path.basename(os.path.dirname(file))
        label = 1 if ('pos' in file_name or 'pos' in parent_name) else 0
        reviews.append(content)
        labels.append(label)
    return reviews, labels


In [None]:
# Read the positive and negative review folders from the mounted Drive
pos_dir = '/content/drive/My Drive/China hotel review/pos'
neg_dir = '/content/drive/My Drive/China hotel review/neg'
(pos_reviews, pos_labels), (neg_reviews, neg_labels) = load_dataset(pos_dir), load_dataset(neg_dir)

# Concatenate both classes: positives first, negatives after
reviews = [*pos_reviews, *neg_reviews]
labels = [*pos_labels, *neg_labels]

# Hold out 20% of the corpus for evaluation; the fixed random_state seeds the shuffle
train_texts, eval_texts, train_labels, eval_labels = train_test_split(
    reviews,
    labels,
    test_size=0.2,
    random_state=42,
)

Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.159.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.2836.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.1480.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.2174.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.1687.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.681.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.147.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.748.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.818.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.1895.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.169.txt
Error decoding file: /content/drive/My Drive/China hotel review/pos/pos.1888.txt
Error decoding file: /content/driv

In [None]:
# Show the corpus size and a few samples per class. Printing every review
# floods the cell output and bloats the saved notebook.
print(f"{len(pos_reviews)} positive reviews, first 3: {pos_reviews[:3]}")
print(f"{len(neg_reviews)} negative reviews, first 3: {neg_reviews[:3]}")

['酒店环境,房间档次都很不错,服务水平专业.住了两天,感觉确实比较不错,二楼自助烧烤给人宾至如归的感觉', '对这次携程网提供的世贸广场酒店很满意，服务态度一流，环境也不错，只有房间的装修感觉稍微久了点。\n\n补充点评 2008年2月10日 ： 要补充的是：我们这次住的是豪华套间，房间的电视稍微小了点，要是液晶的就好了。呵呵', '比较经济,早餐还可以,就是离市区有点距离.', '四月清明假期去的，早餐一般，停车场不够，晚上晚一点回去就要停到对面没人管的空地。', '住了这家饭店，感觉还不错，没有网络上评论的那么差，开始看到网络上的评论还担心这家饭店不好，看来是多余了！刚进大厅竟然在显示屏上有自己的名字，真的感觉很惊喜！这里服务员很热情，床虽然小了一点，但很干净整洁，服务也很周到！这的地理位置也不错，靠近市中心，出门就有去莫高窟的车，这次敦煌之行还算愉快！！', '物有所值,看来安徽人民币江苏人民是在得多', '春节特价入住套房，含早/晚餐，服务非常到位，只是由于酒店有些老，但是设施还算不错！只不过这种春节价位希望酒店能够实现春节特价周末常态化，这样会吸引大部分旅客！', '这是第二次入住了，因为出差需要，在安庆，确实也没有其他地方可住，酒店整体环境优雅，房间设施一般，不过卫生还可以，每次入住后，习惯检查一下床面是否整洁，近期出差，入住的酒店中，只有这一家未发现床上有残留的头发等异物。', '酒店感觉还行，不过说实话不如苏州及上海那边的，不过早点很好呵呵，服务态度也很好的。交通也很方便，就是现在周边正在进行整修，修好后应该是不错的选择。', '房间比较小，打车不是很方便。\n\n还有早饭种类比较少', '设施有点老化了，但毕竟是闹中取静的地方！洗漱用的东东配的挺齐全的！', '管理非常好！细节服务非常到位，值得推荐！感觉得到是一个老牌的酒店．', '房间一般，隔音感觉不是很好，地理位置不错，就在江边，靠江的房间开穿远眺就能看见大佛。服务态度餐厅的比前台好很多，早餐品种少了点。', '非常好。国内最好的酒店之一。各方面都不错。', '目前是新昌最好的酒店，环境和服务质量都非常好，国内长度免费和免费宽带还是蛮有吸引力的，离大佛寺仅咫尺之遥，太方便了\n\n补充点评 2008年5月4日 ： 酒店一楼中餐厅内的白斩鸡和水库剁椒鱼头是不能不尝的？住了两晚，吃了两餐。特色的

In [None]:
# Word-segment text with jieba before it is handed to the DistilBERT tokenizer
def jieba_tokenize(texts):
    """Segment every text with jieba and join its pieces with single spaces.

    Args:
        texts: Iterable of strings to segment.

    Returns:
        list[str]: One space-joined string per input text, in input order.
    """
    segmented = []
    for raw_text in texts:
        pieces = jieba.cut(raw_text)
        segmented.append(" ".join(pieces))
    return segmented

# Load the tokenizer that belongs to the 'distilbert-base-uncased' checkpoint.
# NOTE(review): this checkpoint is presumably pre-trained mainly on English text;
# confirm it is suitable for Chinese reviews. A different checkpoint would have
# to be set here and where the model is loaded at the same time.
distilbert_tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Segment both splits with jieba. The results are stored back under the same
# names, so re-running this cell segments already-segmented text again.
train_texts, eval_texts = jieba_tokenize(train_texts), jieba_tokenize(eval_texts)

tokenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

Building prefix dict from the default dictionary ...
DEBUG:jieba:Building prefix dict from the default dictionary ...
Dumping model to file cache /tmp/jieba.cache
DEBUG:jieba:Dumping model to file cache /tmp/jieba.cache
Loading model cost 1.073 seconds.
DEBUG:jieba:Loading model cost 1.073 seconds.
Prefix dict has been built successfully.
DEBUG:jieba:Prefix dict has been built successfully.


In [None]:
# Dataset class
class ChineseSentimentDataset(Dataset):
    """Torch dataset that pairs tokenizer encodings of reviews with their labels.

    All reviews are tokenized once in the constructor (truncated to at most 512
    tokens, padded by the tokenizer); items are converted to tensors on access.
    """

    def __init__(self, reviews, labels, tokenizer):
        """Tokenize ``reviews`` with ``tokenizer`` and keep ``labels`` alongside."""
        encoded = tokenizer(reviews, truncation=True, padding=True, max_length=512)
        self.encodings = encoded
        self.labels = labels

    def __len__(self):
        """Number of examples, taken from the number of labels."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return a dict of tensors for example ``idx``: every encoding field plus 'labels'."""
        item = {}
        for field, values in self.encodings.items():
            item[field] = torch.tensor(values[idx])
        item['labels'] = torch.tensor(self.labels[idx])
        return item

# Build the training and evaluation datasets with the shared tokenizer
train_dataset = ChineseSentimentDataset(
    reviews=train_texts,
    labels=train_labels,
    tokenizer=distilbert_tokenizer,
)
eval_dataset = ChineseSentimentDataset(
    reviews=eval_texts,
    labels=eval_labels,
    tokenizer=distilbert_tokenizer,
)

In [None]:
# Seed torch's global RNG before anything random happens, so that what it
# controls (training shuffle order, newly initialised classifier weights) is
# repeatable across Restart & Run All.
torch.manual_seed(42)

# DataLoaders
train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
eval_loader = DataLoader(eval_dataset, batch_size=16, shuffle=False)

# Load DistilBERT model with a 2-class classification head
model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', num_labels=2)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# The trailing semicolon stops the notebook from printing the full module repr
model.to(device);


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['pre_classifier.weight', 'classifier.weight', 'classifier.bias', 'pre_classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
 

In [None]:
# Evaluation function
def evaluate(model, eval_loader):
    """Compute, print and return the classification accuracy of ``model``.

    Args:
        model: Model called as ``model(input_ids, attention_mask=...)`` whose
            output exposes ``.logits``. It is switched to eval mode.
        eval_loader: Iterable of batches with 'input_ids', 'attention_mask'
            and 'labels' tensors.

    Returns:
        float: Fraction of correctly predicted examples, or ``nan`` when the
        loader yields no examples.

    Batches are moved to the notebook-level ``device`` before the forward pass.
    """
    model.eval()
    total, correct = 0, 0
    with torch.no_grad():
        for batch in tqdm(eval_loader, desc="Evaluating"):
            inputs = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['labels'].to(device)

            outputs = model(inputs, attention_mask=attention_mask)
            # Predicted class = index of the largest logit in each row
            predictions = torch.argmax(outputs.logits, dim=1)
            correct += (predictions == labels).sum().item()
            total += labels.size(0)

    if total == 0:
        # Accuracy over zero examples is undefined; report it instead of dividing by zero
        print('Accuracy: n/a (no evaluation examples)')
        return float('nan')

    accuracy = correct / total
    print(f'Accuracy: {accuracy:.4f}')
    return accuracy

In [None]:
# Accuracy on the evaluation split at this point of the notebook
evaluate(model=model, eval_loader=eval_loader);

Evaluating:   0%|          | 0/73 [00:00<?, ?it/s]

Accuracy: 0.5233


In [None]:
def predict_sentiment(user_input, model, tokenizer):
    """Classify one review as 'positive' or 'negative'.

    Args:
        user_input: Raw review text.
        model: Sequence classifier called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``. It is switched to eval mode.
        tokenizer: Tokenizer callable that returns 'input_ids' and
            'attention_mask' tensors when given ``return_tensors='pt'``.

    Returns:
        tuple[str, torch.Tensor]: The sentiment ('positive' when class index 1
        has the highest probability, otherwise 'negative') and the softmax
        probabilities with one row for the input.

    Tensors are moved to the notebook-level ``device`` before the forward pass.
    """
    # Apply the same jieba segmentation used for the training texts
    preprocessed_input = jieba_tokenize([user_input])[0]

    # A single sequence needs no padding: padding it to 512 tokens only makes
    # the forward pass process masked-out positions.
    encoding = tokenizer(preprocessed_input,
                         add_special_tokens=True,
                         return_tensors='pt',
                         max_length=512,
                         truncation=True)

    # Extract input_ids and attention mask
    input_ids = encoding['input_ids'].to(device)
    attention_mask = encoding['attention_mask'].to(device)

    # Getting the model's prediction
    model.eval()
    with torch.no_grad():
        logits = model(input_ids, attention_mask=attention_mask).logits
        # Converting logits to probabilities
        probabilities = torch.nn.functional.softmax(logits, dim=1)

    # Converting probabilities to sentiment
    predicted_class = torch.argmax(probabilities, dim=1).item()
    sentiment = 'positive' if predicted_class == 1 else 'negative'

    return sentiment, probabilities

# Try the classifier on a short sample review
user_review = "这家酒店很好"  # Example Chinese text
sentiment, probs = predict_sentiment(
    user_review,
    model=model,
    tokenizer=distilbert_tokenizer,
)
print(f"Predicted Sentiment: {sentiment}, Probabilities: {probs}")


Predicted Sentiment: negative, Probabilities: tensor([[0.5043, 0.4957]], device='cuda:0')


In [None]:
# Fine-tuning the model
# torch.optim.AdamW is used instead of transformers.AdamW, which transformers
# deprecated in favour of the PyTorch implementation. eps and weight_decay are
# set explicitly to the defaults of the transformers optimizer so the
# hyper-parameters stay the same as before.
optimizer = torch.optim.AdamW(model.parameters(), lr=1e-5, eps=1e-6, weight_decay=0.0)
model.train()
for epoch in range(10):  # Adjust epochs based on your dataset and training needs
    epoch_loss, num_batches = 0.0, 0
    for batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}"):
        optimizer.zero_grad()
        inputs = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['labels'].to(device)

        # Passing labels makes the model return the training loss
        outputs = model(inputs, attention_mask=attention_mask, labels=labels)
        loss = outputs.loss
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
        num_batches += 1

    # Report the mean loss per epoch so convergence is visible in the output
    print(f"Epoch {epoch + 1}: mean training loss {epoch_loss / max(num_batches, 1):.4f}")




Training Epoch 1:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 2:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 3:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 4:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 5:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 6:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 7:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 8:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 9:   0%|          | 0/290 [00:00<?, ?it/s]

Training Epoch 10:   0%|          | 0/290 [00:00<?, ?it/s]

In [None]:
# Accuracy on the evaluation split after the fine-tuning loop
evaluate(model=model, eval_loader=eval_loader);

Evaluating:   0%|          | 0/73 [00:00<?, ?it/s]

Accuracy: 0.7810


In [None]:
# Classify the same short sample review again
user_review = "这家酒店很好"  # Example Chinese text
sentiment, probs = predict_sentiment(
    user_review,
    model=model,
    tokenizer=distilbert_tokenizer,
)
print(f"Predicted Sentiment: {sentiment}, Probabilities: {probs}")

Predicted Sentiment: positive, Probabilities: tensor([[0.4163, 0.5837]], device='cuda:0')


In [None]:
# Classify a review typed in by the user.
# A prompt tells the reader what the empty input box is for, and blank input
# is rejected instead of being scored as if it were a review.
user_review = input("Enter a hotel review in Chinese: ").strip()
if user_review:
    sentiment, probs = predict_sentiment(user_review, model, distilbert_tokenizer)
    print(f"Predicted Sentiment: {sentiment}, Probabilities: {probs}")
else:
    print("No review entered; nothing to classify.")



这家酒店的位置非常方便，靠近市中心，但是房间隔音效果不是很好，晚上有时会听到外面的噪音。服务态度很友好，但是早餐种类比较少。
Predicted Sentiment: positive, Probabilities: tensor([[0.0460, 0.9540]], device='cuda:0')


In [None]:
#!pip install lime
import lime
from lime.lime_text import LimeTextExplainer

def predict_proba(texts, model, tokenizer, device, batch_size=32):
    """Return class probabilities for ``texts``; used as LIME's prediction function.

    Args:
        texts: A single string or an iterable of strings.
        model: Sequence classifier called as ``model(input_ids, attention_mask=...)``
            whose output exposes ``.logits``. It is switched to eval mode.
        tokenizer: Tokenizer callable returning 'input_ids' and 'attention_mask'
            tensors for a list of strings.
        device: Device the input tensors are moved to.
        batch_size: Number of texts per forward pass. The explainer hands over
            all of its samples in one call (``num_samples`` defaults to 1000 in
            ``explain_with_lime``), so they are processed in chunks to bound
            memory use.

    Returns:
        numpy.ndarray: Softmax probabilities, one row per input text.

    Raises:
        ValueError: If ``texts`` is empty or ``batch_size`` is not positive.
    """
    texts = [texts] if isinstance(texts, str) else list(texts)
    if not texts:
        raise ValueError("texts must contain at least one string")
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer")

    model.eval()
    chunks = []
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            inputs = tokenizer(texts[start:start + batch_size], return_tensors='pt',
                               padding=True, truncation=True, max_length=512)
            input_ids = inputs['input_ids'].to(device)
            attention_mask = inputs['attention_mask'].to(device)

            logits = model(input_ids, attention_mask=attention_mask).logits
            # Move each chunk to the CPU right away so results do not pile up on the device
            chunks.append(torch.nn.functional.softmax(logits, dim=1).cpu())
    return torch.cat(chunks, dim=0).numpy()

def explain_with_lime(text, model, tokenizer, device, num_features=10, num_samples=1000):
    """Build a LIME text explanation for the model's prediction on ``text``.

    Args:
        text: The text to explain.
        model, tokenizer, device: Forwarded unchanged to ``predict_proba``.
        num_features: Passed to ``explain_instance`` as ``num_features``.
        num_samples: Passed to ``explain_instance`` as ``num_samples``.

    Returns:
        The explanation object returned by ``LimeTextExplainer.explain_instance``.

    NOTE(review): the explainer is created with its default text splitting;
    confirm that this yields the feature granularity wanted for Chinese text.
    """
    def classifier_fn(candidate_texts):
        # Adapter giving LIME a one-argument prediction function
        return predict_proba(candidate_texts, model, tokenizer, device)

    lime_explainer = LimeTextExplainer(class_names=['Negative', 'Positive'])
    return lime_explainer.explain_instance(
        text,
        classifier_fn,
        num_features=num_features,
        num_samples=num_samples,
    )

# Explain the prediction for a review with mixed praise and complaints
text = "这家酒店的位置非常方便，靠近市中心，但是房间隔音效果不是很好，晚上有时会听到外面的噪音。服务态度很友好，但是早餐种类比较少。"  # Replace with your text
explanation = explain_with_lime(
    text,
    model=model,
    tokenizer=distilbert_tokenizer,
    device=device,
)

# Show the (feature, weight) pairs as text, then the rendered explanation
print(explanation.as_list())
explanation.show_in_notebook(text=True)


[('靠近市中心', 0.3675393909414371), ('但是房间隔音效果不是很好', -0.05061193648604859), ('晚上有时会听到外面的噪音', 0.04686197210777947), ('这家酒店的位置非常方便', 0.04548557485903894), ('但是早餐种类比较少', 0.010594251334498607), ('服务态度很友好', -0.00920675938161886)]


In [None]:
!pip install shap
import shap
import torch

def get_embeddings(text, model, tokenizer, device):
    """Run ``text`` through ``model.distilbert`` and return its first output as numpy.

    Args:
        text: Input passed to the tokenizer (truncated to at most 512 tokens).
        model: Object exposing a ``distilbert`` sub-model.
        tokenizer: Tokenizer callable returning 'input_ids' and 'attention_mask' tensors.
        device: Device the input tensors are moved to.

    Returns:
        numpy.ndarray: Element 0 of the ``model.distilbert`` output, moved to the CPU.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True, max_length=512)
    token_ids = encoded['input_ids'].to(device)
    mask = encoded['attention_mask'].to(device)

    with torch.no_grad():
        # Index 0 selects the first element of the encoder output
        encoder_output = model.distilbert(token_ids, attention_mask=mask)
        embeddings = encoder_output[0]
    return embeddings.cpu().numpy()

def shap_values_on_embeddings(text, model, tokenizer, device, nsamples=100):
    """Estimate SHAP values of the classifier head over the first-token hidden state.

    The previous version handed the float hidden states to ``model(x)``, which
    treats its first argument as token ids. It also gave KernelExplainer a 3-D
    array and used the instance as its own background. This version explains
    the part of the model that actually consumes hidden states:
    ``pre_classifier`` -> ReLU -> ``classifier``, applied to the hidden state at
    position 0, with an all-zero vector as the background.

    Args:
        text: Text to explain.
        model: Model exposing ``distilbert``, ``pre_classifier`` and ``classifier``.
        tokenizer: Tokenizer forwarded to ``get_embeddings``.
        device: Device used for the forward passes.
        nsamples: Number of model evaluations KernelExplainer may use.

    Returns:
        Whatever ``KernelExplainer.shap_values`` returns for the single
        first-token feature vector (attributions per hidden dimension).

    NOTE(review): the head composition mirrors DistilBertForSequenceClassification
    with dropout treated as identity (inference); confirm against the installed
    transformers version. The all-zero background is a simple baseline choice.
    """
    # Get hidden states for the text, then keep only position 0 -> (1, hidden_dim)
    hidden_states = get_embeddings(text, model, tokenizer, device)
    cls_features = hidden_states[:, 0, :]
    background = torch.zeros(1, cls_features.shape[1]).numpy()

    def classify_from_features(features):
        # KernelExplainer passes numpy rows and expects a numpy array back
        with torch.no_grad():
            x = torch.as_tensor(features, dtype=torch.float32, device=device)
            pooled = torch.relu(model.pre_classifier(x))
            return model.classifier(pooled).cpu().numpy()

    explainer = shap.KernelExplainer(model=classify_from_features, data=background)
    shap_values = explainer.shap_values(cls_features, nsamples=nsamples)  # Adjust nsamples as needed

    return shap_values

# Run the SHAP helper on a review with mixed praise and complaints
text = "这家酒店的位置非常方便，靠近市中心，但是房间隔音效果不是很好，晚上有时会听到外面的噪音。服务态度很友好，但是早餐种类比较少。"
model.to(device)  # Ensure model is on the right device
shap_values = shap_values_on_embeddings(
    text,
    model=model,
    tokenizer=distilbert_tokenizer,
    device=device,
)

# Interpretation (this part is quite abstract as it relates to embedding dimensions)
print(shap_values)

