Packages to install

In [None]:
!pip install pyserini
!pip install dice-ml
!pip install faiss-cpu

Collecting pyserini
  Downloading pyserini-0.22.0-py3-none-any.whl (140.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m140.5/140.5 MB[0m [31m8.4 MB/s[0m eta [36m0:00:00[0m
Collecting pyjnius>=1.4.0 (from pyserini)
  Downloading pyjnius-1.6.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.5/1.5 MB[0m [31m82.9 MB/s[0m eta [36m0:00:00[0m
Collecting transformers>=4.6.0 (from pyserini)
  Downloading transformers-4.34.0-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m122.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting sentencepiece>=0.1.95 (from pyserini)
  Downloading sentencepiece-0.1.99-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m83.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting nmslib>=2.1.1 (from pys

Loading MSMARCO-Dev-Set

In [None]:
import os

# JAVA_HOME has to be exported BEFORE pyserini is imported: pyserini reaches Lucene
# through a JVM bridge, and setting the variable after the import is too late to
# influence which JVM gets loaded.
# NOTE(review): the path below is the Debian/Ubuntu (Colab) location of OpenJDK 11 —
# adjust it on other systems.
os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

import math

import pyserini
from pyserini.index.lucene import IndexReader
from pyserini.search import LuceneSearcher
from pyserini.search import get_topics
from sklearn.feature_extraction.text import TfidfVectorizer

# Query set: the MS MARCO passage dev-subset topics shipped with pyserini.
topics = get_topics('msmarco-passage-dev-subset')
print(f'{len(topics)} queries total')

# Index reader over the prebuilt MS MARCO v1 passage index (downloaded on first use).
# `total_documents` is the collection size used later as the numerator of the IDF.
indexer = IndexReader.from_prebuilt_index('msmarco-v1-passage')
total_documents = indexer.stats()["documents"]

6980 queries total
Downloading index at https://rgw.cs.uwaterloo.ca/pyserini/indexes/lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz...


lucene-index.msmarco-v1-passage.20221004.252b5e.tar.gz: 2.02GB [00:28, 77.1MB/s]                            


Retrieval

In [None]:
import json
from pyserini.search import LuceneSearcher
# Number of passages to retrieve for the query.
topK = 30

query = 'average rent in california'

searcher = LuceneSearcher.from_prebuilt_index('msmarco-v1-passage')
hits = searcher.search(query, topK)

# Collect the text (truncated to 1000 characters) of every returned passage.
# Iterating over `hits` itself, rather than over range(topK), avoids an IndexError
# when the searcher returns fewer than topK results.
documents = []
for hit in hits:
    jsondoc = json.loads(hit.raw)
    documents.append(jsondoc["contents"][:1000])

# Whitespace-tokenise each passage and re-join with single spaces, which collapses
# any run of whitespace inside the passage text.
tokenized_documents = [doc.split() for doc in documents]
preprocessed_documents = [' '.join(doc) for doc in tokenized_documents]

Extracting Top-K words from each document

In [None]:
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
import numpy as np
import scipy.sparse as sp
from sklearn.feature_extraction.text import TfidfVectorizer, ENGLISH_STOP_WORDS



def calculate_idf(word):
    """Return a smoothed inverse document frequency for ``word``.

    The document frequency is looked up through the module-level ``indexer`` and
    combined with the module-level ``total_documents`` as
    ``log(total_documents / (df + 1))``; the ``+ 1`` keeps the denominator
    non-zero for terms that never occur in the index.

    Args:
        word: The term to look up.

    Returns:
        The IDF value as a float.
    """
    # get_term_counts yields (document frequency, collection frequency);
    # only the document frequency is needed here.
    doc_freq, _collection_freq = indexer.get_term_counts(word)
    return math.log(total_documents / (doc_freq + 1))


def extract_topk_words(document_str, k=None):
    """Pick the highest-scoring words of one document and return them with their counts.

    Every non-stop-word term of ``document_str`` is scored as
    ``term frequency * calculate_idf(term)`` and the terms are ranked by that score.

    Args:
        document_str: Text of a single document.
        k: How many top-ranked words to keep. When omitted, the module-level
            ``word_k`` is used, which preserves the original call signature.

    Returns:
        A ``(top_words, vector)`` pair of equal-length lists. ``top_words`` holds the
        selected terms in descending score order. ``vector`` holds, for each of those
        terms, its unnormalised term frequency in the document — NOT the TF-IDF
        score used for ranking.
    """
    if k is None:
        # Fall back to the notebook-level setting so existing callers keep working.
        k = word_k

    # use_idf=False and norm=None make the vectorizer emit plain term counts; the IDF
    # is taken from the full index via calculate_idf instead of this one document.
    tfidf_vectorizer = TfidfVectorizer(stop_words=list(ENGLISH_STOP_WORDS), use_idf=False, norm=None)
    tfidf = tfidf_vectorizer.fit_transform([document_str])

    # Only one document was fitted, so row 0 holds one count per feature,
    # aligned with get_feature_names_out().
    counts = tfidf.toarray()[0]
    feature_names = tfidf_vectorizer.get_feature_names_out()

    scores = [count * calculate_idf(feature) for feature, count in zip(feature_names, counts)]

    # sorted() is stable, so terms with equal scores keep their vocabulary order.
    ranked = sorted(zip(feature_names, scores, counts), key=lambda item: item[1], reverse=True)[:k]

    top_words = [word for word, _score, _count in ranked]
    vector = [count for _word, _score, count in ranked]

    return top_words, vector


# Number of highest-scoring words kept per passage (read by extract_topk_words).
word_k = 10

# Run the extraction once per passage, then split the (words, vector) pairs into
# two parallel lists that share the ordering of preprocessed_documents.
extracted_pairs = [extract_topk_words(paragraph) for paragraph in preprocessed_documents]
top_words_list = [words for words, _ in extracted_pairs]
vectors = [weights for _, weights in extracted_pairs]


# Print the resulting top 10 words and vectors for each paragraph
#for i in range(len(preprocessed_documents)):
    #print(f"Top 10 words for paragraph {i+1}: {top_words_list[i]}")
    #print(f"TF-IDF vector features for paragraph {i+1}: {vectors[i]}")




Prepare Counterfactual Input

In [None]:
import pandas as pd

# Rank cut-off for the relevance label. It is applied with an inclusive `.loc`
# slice below, so rows 0..threshold (threshold + 1 passages) are labelled 1.
threshold = 9

# Parse the raw JSON of every retrieved passage. Iterating over `hits` directly
# (instead of range(topK)) avoids an IndexError when fewer than topK hits exist.
doc_ar = [json.loads(hit.raw) for hit in hits]

doc_df = pd.json_normalize(doc_ar)
doc_df['top10_words'] = top_words_list
doc_df['tfidf_vector'] = vectors
doc_df = doc_df.drop("contents", axis=1)

# Rebuild one pseudo-document per passage in which each selected word is repeated
# as many times as its stored count, so that a single vectorizer fitted on all
# pseudo-documents yields count features over a shared vocabulary.
# Distinct loop variables are used for the outer and inner level; the original code
# reused `i` for both, which only worked by accident.
corpus = []
for words, counts in zip(doc_df["top10_words"], doc_df["tfidf_vector"]):
    repeated_words = []
    for word, count in zip(words, counts):
        repeated_words.extend([word] * int(count))
    corpus.append(" ".join(repeated_words))

# use_idf=False and norm=None: the matrix contains plain term counts.
tfidf_vectorizer = TfidfVectorizer(stop_words=list(ENGLISH_STOP_WORDS), use_idf=False, norm=None)
tfidf = tfidf_vectorizer.fit_transform(corpus)

vocab_size = len(tfidf_vectorizer.get_feature_names_out())

print(tfidf.shape)

# One dense row (numpy array) per passage, stored as an object column.
xx = tfidf.toarray()
feature_vector = list(xx)

doc_df["Feature_Vector"] = feature_vector
doc_df = doc_df.drop("tfidf_vector", axis=1)

# Label the top-ranked passages as relevant (1) and the rest as non-relevant (0).
# NOTE: `.loc[:threshold]` is label-based and includes the row labelled `threshold`.
doc_df["true_label"] = 0
doc_df.loc[:threshold, 'true_label'] = 1

(30, 152)


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report



# Vocabulary terms, in the column order of the fitted vectorizer.
vocabulary = tfidf_vectorizer.get_feature_names_out()

# Expand the per-row feature arrays into a frame with one column per vocabulary term.
df_split = pd.DataFrame(doc_df['Feature_Vector'].tolist())

# Sanity check: both numbers printed below should be identical.
print(len(vocabulary), len(feature_vector[0]), sep="\n")

152
152


In [None]:
# Name each feature column after its vocabulary term.
df_split = df_split.rename(columns=dict(zip(df_split.columns, vocabulary)))
df_concat = pd.concat([doc_df, df_split], axis=1)

# Use the vocabulary columns directly as the feature matrix. The previous positional
# slice `df_concat.iloc[:, 7:]` assumed exactly 7 leading metadata columns and would
# silently drop the first feature columns whenever doc_df has fewer.
# NOTE(review): doc_df appears to have fewer than 7 columns here — confirm.
X = df_split
y = df_concat['true_label']
dataset = pd.concat([X, y], axis=1)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

model1 = RandomForestClassifier(max_depth=100, random_state=42)
model1.fit(X_train, y_train)

# The two reports below are computed on the TRAINING split, so they measure fit,
# not generalisation.
y_pred1 = model1.predict(X_train)

print("Accuracy:", accuracy_score(y_train, y_pred1))

print(classification_report(y_train, y_pred1))

# Held-out accuracy, reported separately so the training figure is not mistaken
# for the model's real performance.
print("Test accuracy:", accuracy_score(y_test, model1.predict(X_test)))

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00        16
           1       1.00      1.00      1.00         8

    accuracy                           1.00        24
   macro avg       1.00      1.00      1.00        24
weighted avg       1.00      1.00      1.00        24



In [None]:
import dice_ml
import warnings
# Silence all warnings for the remainder of the session to keep the DiCE output readable.
# NOTE(review): this is a blanket filter and also hides warnings from later cells.
warnings.filterwarnings('ignore')


# Wrap the data and the trained classifier for DiCE. An empty `continuous_features`
# list means every feature column is handled as categorical.
d = dice_ml.Data(dataframe=dataset, continuous_features=[], outcome_name='true_label')
m = dice_ml.Model(model=model1, backend="sklearn")
exp = dice_ml.Dice(d, m, method="random")

# Explain the second row of the held-out split: ask for two counterfactuals that flip
# the predicted class. The random search is seeded so the cell is reproducible.
e1 = exp.generate_counterfactuals(X_test[1:2], total_CFs=2, desired_class="opposite", random_seed=42)
e1.visualize_as_dataframe(display_sparse_df=True, show_only_changes=True)


100%|██████████| 1/1 [00:00<00:00,  1.72it/s]

Query instance (original outcome : 0)





Unnamed: 0,1197,1458,1529,160,166,167,179,180,1940,1940s,...,trend,trulia,twice,twoâ,uncommonâ,unemployed,versus,view,yearly,true_label
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0



Diverse Counterfactual set (new outcome: 1.0)


Unnamed: 0,1197,1458,1529,160,166,167,179,180,1940,1940s,...,trend,trulia,twice,twoâ,uncommonâ,unemployed,versus,view,yearly,true_label
0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,2.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
1,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-
