In [None]:
!pip install pyserini
!pip install dice-ml
!pip install faiss-cpu



In [None]:
import pyserini
import os
from pyserini.search import get_topics
from pyserini.search import LuceneSearcher
import math
import ast
from pyserini.index.lucene import IndexReader
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS
from wordcloud import WordCloud
import matplotlib.pyplot as plt
from spacy.lang.de.stop_words import STOP_WORDS
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score,classification_report
import dice_ml
from dice_ml.utils import helpers
import numpy as np
import random
import warnings


os.environ["JAVA_HOME"] = "/usr/lib/jvm/java-11-openjdk-amd64"

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
data = pd.read_csv("/content/drive/MyDrive/dataset.csv")

In [None]:
KD=30 # Top KD docs used for ranking and making feature vectors and train classifier
KW=10 # Top KW words taken out of each KD documents for feature vector and vocabulary
T=10 # Threshold for classifier
no=9 # choose which query to use

In [None]:
data

Unnamed: 0.1,Unnamed: 0,query_id,query,Rank,Score,Doc_id,Document
0,0,992839,definition of curfew,24,8.0823,3807532,What curfew rules should you set for your teen...
1,1,992839,definition of curfew,17,8.266,8191267,"An additional 6 percent, or 23 cities, were co..."
2,2,992839,definition of curfew,12,8.6025,5207344,The definition of universal is relating to or ...
3,3,992839,definition of curfew,26,8.044699,3807525,Curfew rules can be adjusted as your teen grow...
4,4,992839,definition of curfew,19,8.1833,2620523,Discuss is defined as to talk about and to con...
5,5,1013965,"what type of wave is electromagnetic,",15,12.8568,2646538,FIND OUT MORE. Many different kinds of energy ...
6,6,1013965,"what type of wave is electromagnetic,",19,12.7204,6615903,The electromagnetic spectrum contains many typ...
7,7,1013965,"what type of wave is electromagnetic,",24,12.6659,752147,Sound waves and electromagnetic waves are diff...
8,8,1013965,"what type of wave is electromagnetic,",17,12.7295,1742690,"In transverse waves, the oscillations (vibrati..."
9,9,1013965,"what type of wave is electromagnetic,",18,12.7237,1839,Difference Between Electromagnetic Waves and R...


In [None]:
queries = data['query'].unique()
query = queries[no]

df = data[data['query'] == query]
df

Unnamed: 0.1,Unnamed: 0,query_id,query,Rank,Score,Doc_id,Document
45,45,844658,what is the role of lipids in a cell,28,10.3397,6159679,Answers. Best Answer: The cell membrane is pri...
46,46,844658,what is the role of lipids in a cell,25,10.5129,2043703,The membrane that surrounds a cell is made up ...
47,47,844658,what is the role of lipids in a cell,18,10.8006,7129659,Lipids Examples. Lipids are the fatty or waxy ...
48,48,844658,what is the role of lipids in a cell,17,10.808,5480522,"The Lipid Bilayer, page 2. The lipid bilayer i..."
49,49,844658,what is the role of lipids in a cell,15,10.8396,1762072,Lipid Bilayer Structure. The lipid bilayer is ...


In [None]:
searcher = LuceneSearcher.from_prebuilt_index('msmarco-passage')

In [None]:
hits = searcher.search(query,KD)
import json
for i in range(0, KD):
    jsondoc = json.loads(hits[i].raw)
    print(f'{i+1:2} {hits[i].score:.5f} {jsondoc["id"]} {jsondoc["contents"][:1000]}..')

 1 11.75180 670137 Lipid Bilayer Structure. The lipid bilayer is a universal component of all cell membranes. Its role is critical because its structural components provide the barrier that marks the boundaries of a cell. The structure is called a lipid bilayer because it is composed of two layers of fat cells organized in two sheets.ipid Bilayer Structure. The lipid bilayer is a universal component of all cell membranes. Its role is critical because its structural components provide the barrier that marks the boundaries of a cell. The structure is called a lipid bilayer because it is composed of two layers of fat cells organized in two sheets...
 2 11.67630 942066 Lipid Bilayer Structure. The lipid bilayer is a universal component of all cell membranes. Its role is critical because its structural components provide the barrier that marks the boundaries of a cell. The structure is called a lipid bilayer because it is composed of two layers of fat cells organized in two sheets.ts role i

In [None]:
documents = []
indexer = IndexReader.from_prebuilt_index('msmarco-v1-passage')
total_documents = indexer.stats()["documents"]
def calculate_idf(word):
    df,cf = indexer.get_term_counts(word)

    idf = math.log(total_documents / (df + 1))

    return idf
for i in range(0,KD):
    jsondoc = json.loads(hits[i].raw)
    documents.append(jsondoc["contents"][:1000])
tokenized_documents = [doc.split() for doc in documents]
preprocessed_documents = [' '.join(doc) for doc in tokenized_documents]
total_docs = indexer.stats().get('documents')

from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, ENGLISH_STOP_WORDS

vectors = []
top_words_list = []
tfidf_vectorizer = TfidfVectorizer(stop_words = list(ENGLISH_STOP_WORDS),use_idf = False)
for paragraph in preprocessed_documents:
    tfidf = tfidf_vectorizer.fit_transform([paragraph])
    feature_names = tfidf_vectorizer.get_feature_names_out()
    # tfidf_ms = calculate_idf(feature_names)
    tfidf_scores = zip(feature_names, tfidf.sum(axis=0).tolist()[0])
    sorted_scores = sorted(tfidf_scores, key=lambda x: x[1], reverse=True)
    top_words = [word for word, score in sorted_scores[:KW]]
    top_words_list.append(top_words)
    vector = [tfidf[0, tfidf_vectorizer.vocabulary_[word]] for word in top_words]
    vectors.append(vector)


for i in range(len(preprocessed_documents)):
    print(f"Top 10 words for paragraph {i+1}: {top_words_list[i]}")
    print(f"TF-IDF vector features for paragraph {i+1}: {vectors[i]}")

Top 10 words for paragraph 1: ['bilayer', 'lipid', 'cell', 'structure', 'barrier', 'boundaries', 'called', 'cells', 'component', 'components']
TF-IDF vector features for paragraph 1: [0.4656903154237997, 0.3880752628531664, 0.3104602102825331, 0.3104602102825331, 0.15523010514126656, 0.15523010514126656, 0.15523010514126656, 0.15523010514126656, 0.15523010514126656, 0.15523010514126656]
Top 10 words for paragraph 2: ['bilayer', 'lipid', 'cell', 'structure', 'barrier', 'boundaries', 'called', 'cells', 'components', 'composed']
TF-IDF vector features for paragraph 2: [0.3746343246326776, 0.3746343246326776, 0.2809757434745082, 0.2809757434745082, 0.1873171623163388, 0.1873171623163388, 0.1873171623163388, 0.1873171623163388, 0.1873171623163388, 0.1873171623163388]
Top 10 words for paragraph 3: ['lipids', 'fats', 'hydrogen', 'molecules', 'oxygen', 'atoms', 'carbohydrates', 'carbon', 'elements', 'energy']
TF-IDF vector features for paragraph 3: [0.6445033866354896, 0.2864459496157732, 0.21

In [None]:


pd.set_option('display.max_columns', None)

doc_ar = []
for i in range(0,KD):

  jsonload = json.loads(hits[i].raw)
  doc_ar.append(jsonload)

doc_df = pd.json_normalize(doc_ar)
doc_df

Unnamed: 0,id,contents
0,670137,Lipid Bilayer Structure. The lipid bilayer is ...
1,942066,Lipid Bilayer Structure. The lipid bilayer is ...
2,777761,What are lipids? Lipids are one of the four ma...
3,3949941,Cell Membrane Structure. The cell membrane is ...
4,6599489,What is the role of membrane renewal vesicles?...
5,2480684,Lipid Bilayer Structure. The lipid bilayer is ...
6,2823459,Lipids important to the body What are lipids? ...
7,942065,The lipid bilayer is a universal component of ...
8,4418215,Lipids important to the body What are lipids? ...
9,8534840,The Lipid Bilayer. Lipid Bilayer Structure. Th...


In [None]:
doc_df['top10_words'] = top_words_list
doc_df['tfidf_vector'] = vectors
doc_df['Score'] = 0
for i in range(0,KD):
  doc_df.loc[i, 'Score'] = hits[i].score
doc_df['test'] = doc_df['top10_words'].apply(lambda x: ' '.join(x))


In [None]:
combined_text = ' '.join(doc_df['test'])
words = combined_text.split()
words

['bilayer',
 'lipid',
 'cell',
 'structure',
 'barrier',
 'boundaries',
 'called',
 'cells',
 'component',
 'components',
 'bilayer',
 'lipid',
 'cell',
 'structure',
 'barrier',
 'boundaries',
 'called',
 'cells',
 'components',
 'composed',
 'lipids',
 'fats',
 'hydrogen',
 'molecules',
 'oxygen',
 'atoms',
 'carbohydrates',
 'carbon',
 'elements',
 'energy',
 'membrane',
 'lipids',
 'proteins',
 'cell',
 '20',
 '80',
 'body',
 'composed',
 'depending',
 'location',
 'cell',
 'membrane',
 'proteins',
 'add',
 'allows',
 'carrier',
 'change',
 'changing',
 'lipids',
 'new',
 'bilayer',
 'lipid',
 'cell',
 'structure',
 'cells',
 'barrier',
 'boundaries',
 'called',
 'component',
 'components',
 'important',
 'lipids',
 'body',
 'acids',
 'lipid',
 'basic',
 'bile',
 'blocks',
 'building',
 'cell',
 'bilayer',
 'cell',
 'lipid',
 'barrier',
 'boundaries',
 'called',
 'cells',
 'components',
 'composed',
 'critical',
 'important',
 'lipids',
 'body',
 'acids',
 'lipid',
 'basic',
 'bile

In [None]:
vocabulary = (list(set(words)))

print(len(vocabulary))


119


In [None]:
doc_df

Unnamed: 0,id,contents,top10_words,tfidf_vector,Score,test
0,670137,Lipid Bilayer Structure. The lipid bilayer is ...,"[bilayer, lipid, cell, structure, barrier, bou...","[0.4656903154237997, 0.3880752628531664, 0.310...",11.7518,bilayer lipid cell structure barrier boundarie...
1,942066,Lipid Bilayer Structure. The lipid bilayer is ...,"[bilayer, lipid, cell, structure, barrier, bou...","[0.3746343246326776, 0.3746343246326776, 0.280...",11.6763,bilayer lipid cell structure barrier boundarie...
2,777761,What are lipids? Lipids are one of the four ma...,"[lipids, fats, hydrogen, molecules, oxygen, at...","[0.6445033866354896, 0.2864459496157732, 0.214...",11.3958,lipids fats hydrogen molecules oxygen atoms ca...
3,3949941,Cell Membrane Structure. The cell membrane is ...,"[membrane, lipids, proteins, cell, 20, 80, bod...","[0.5183210553488161, 0.34554737023254406, 0.34...",11.3532,membrane lipids proteins cell 20 80 body compo...
4,6599489,What is the role of membrane renewal vesicles?...,"[cell, membrane, proteins, add, allows, carrie...","[0.6099942813304187, 0.457495710997814, 0.3049...",11.3128,cell membrane proteins add allows carrier chan...
5,2480684,Lipid Bilayer Structure. The lipid bilayer is ...,"[bilayer, lipid, cell, structure, cells, barri...","[0.45291081365783825, 0.45291081365783825, 0.3...",11.274,bilayer lipid cell structure cells barrier bou...
6,2823459,Lipids important to the body What are lipids? ...,"[important, lipids, body, acids, lipid, basic,...","[0.4390570399587614, 0.4390570399587614, 0.329...",11.2625,important lipids body acids lipid basic bile b...
7,942065,The lipid bilayer is a universal component of ...,"[bilayer, cell, lipid, barrier, boundaries, ca...","[0.30779350562554625, 0.30779350562554625, 0.3...",11.1917,bilayer cell lipid barrier boundaries called c...
8,4418215,Lipids important to the body What are lipids? ...,"[important, lipids, body, acids, lipid, basic,...","[0.4338609156373123, 0.4338609156373123, 0.325...",11.1407,important lipids body acids lipid basic bile b...
9,8534840,The Lipid Bilayer. Lipid Bilayer Structure. Th...,"[bilayer, lipid, cell, structure, barrier, bou...","[0.5252257314388902, 0.5252257314388902, 0.262...",11.0974,bilayer lipid cell structure barrier boundarie...


In [None]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer1 = CountVectorizer(vocabulary=vocabulary)


matrix1 = vectorizer1.fit_transform(doc_df['test'])
array1 = matrix1.toarray()
size = array1.shape[1]

In [None]:
array1D = [array1[i] for i in range(KD)]
doc_df['feature_vector'] = array1D
doc_df.pop('test')
doc_df["true_label"] = 0
doc_df.loc[:T-1, 'true_label'] = 1
test = doc_df["feature_vector"]
testt = pd.DataFrame(test)
doc_df['feature_vector'] = doc_df['feature_vector'].apply(lambda x: np.array(x))
df_split = pd.DataFrame(testt['feature_vector'].tolist())
df_split = df_split.rename(columns=dict(zip(df_split.columns, vocabulary)))
df_concat = pd.concat([doc_df, df_split], axis=1)
df_concat

Unnamed: 0,id,contents,top10_words,tfidf_vector,Score,feature_vector,true_label,cell,critical,lipid,crucial,plasma,abundant,rafts,immunity,cellular,adhesion,boundaries,cells,carbohydrate,abundance,eukaryotic,proteins,structure,allergic,plays,atoms,vacuoles,hydrogen,john,component,plant,biomolecules,elements,activities,vacuole,carbon,components,carbohydrates,signalling,called,body,actually,water,human,basic,20,bile,oxygen,prominent,maintaining,include,assist,role,bad,building,hormones,fact,cholesterol,changing,article,80,lipids,agents,percent,white,allows,membraneâ,certain,homeostasis,depending,consists,attached,barrier,believed,carrier,change,molecule,group,just,form,energy,fats,phospholipid,markers,terms,fatty,adrenaline,50,serve,functions,make,animal,aka,provide,antigen,examples,blocks,storage,important,forming,acids,counterparts,membranes,function,bilayer,explains,biochemical,contributes,new,receptor,acid,boundary,add,composed,bcr,fluid,asthma,molecules,location,activation,formation,biosynthesis,region,blood,membrane
0,670137,Lipid Bilayer Structure. The lipid bilayer is ...,"[bilayer, lipid, cell, structure, barrier, bou...","[0.4656903154237997, 0.3880752628531664, 0.310...",11.7518,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,942066,Lipid Bilayer Structure. The lipid bilayer is ...,"[bilayer, lipid, cell, structure, barrier, bou...","[0.3746343246326776, 0.3746343246326776, 0.280...",11.6763,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,777761,What are lipids? Lipids are one of the four ma...,"[lipids, fats, hydrogen, molecules, oxygen, at...","[0.6445033866354896, 0.2864459496157732, 0.214...",11.3958,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,3949941,Cell Membrane Structure. The cell membrane is ...,"[membrane, lipids, proteins, cell, 20, 80, bod...","[0.5183210553488161, 0.34554737023254406, 0.34...",11.3532,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1
4,6599489,What is the role of membrane renewal vesicles?...,"[cell, membrane, proteins, add, allows, carrie...","[0.6099942813304187, 0.457495710997814, 0.3049...",11.3128,"[1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
5,2480684,Lipid Bilayer Structure. The lipid bilayer is ...,"[bilayer, lipid, cell, structure, cells, barri...","[0.45291081365783825, 0.45291081365783825, 0.3...",11.274,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,2823459,Lipids important to the body What are lipids? ...,"[important, lipids, body, acids, lipid, basic,...","[0.4390570399587614, 0.4390570399587614, 0.329...",11.2625,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,942065,The lipid bilayer is a universal component of ...,"[bilayer, cell, lipid, barrier, boundaries, ca...","[0.30779350562554625, 0.30779350562554625, 0.3...",11.1917,"[1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",1,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
8,4418215,Lipids important to the body What are lipids? ...,"[important, lipids, body, acids, lipid, basic,...","[0.4338609156373123, 0.4338609156373123, 0.325...",11.1407,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",1,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,8534840,The Lipid Bilayer. Lipid Bilayer Structure. Th...,"[bilayer, lipid, cell, structure, barrier, bou...","[0.5252257314388902, 0.5252257314388902, 0.262...",11.0974,"[1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, ...",1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
X = df_concat.iloc[:, 7:]
y = df_concat['true_label']
dataset = pd.concat([X,y],axis=1)

In [None]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
model1 = RandomForestClassifier(max_depth=100, random_state=42)
model1.fit(X_train, y_train)

In [None]:
y_pred1 = model1.predict(X_train)

print("Accuracy:", accuracy_score(y_train, y_pred1))

print(classification_report(y_train, y_pred1))



Accuracy: 0.9166666666666666
              precision    recall  f1-score   support

           0       1.00      0.88      0.93        16
           1       0.80      1.00      0.89         8

    accuracy                           0.92        24
   macro avg       0.90      0.94      0.91        24
weighted avg       0.93      0.92      0.92        24



In [None]:
y_pred2 = model1.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred2))

print(classification_report(y_test, y_pred2))


Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00         4
           1       1.00      1.00      1.00         2

    accuracy                           1.00         6
   macro avg       1.00      1.00      1.00         6
weighted avg       1.00      1.00      1.00         6



In [None]:
X

Unnamed: 0,cell,critical,lipid,crucial,plasma,abundant,rafts,immunity,cellular,adhesion,boundaries,cells,carbohydrate,abundance,eukaryotic,proteins,structure,allergic,plays,atoms,vacuoles,hydrogen,john,component,plant,biomolecules,elements,activities,vacuole,carbon,components,carbohydrates,signalling,called,body,actually,water,human,basic,20,bile,oxygen,prominent,maintaining,include,assist,role,bad,building,hormones,fact,cholesterol,changing,article,80,lipids,agents,percent,white,allows,membraneâ,certain,homeostasis,depending,consists,attached,barrier,believed,carrier,change,molecule,group,just,form,energy,fats,phospholipid,markers,terms,fatty,adrenaline,50,serve,functions,make,animal,aka,provide,antigen,examples,blocks,storage,important,forming,acids,counterparts,membranes,function,bilayer,explains,biochemical,contributes,new,receptor,acid,boundary,add,composed,bcr,fluid,asthma,molecules,location,activation,formation,biosynthesis,region,blood,membrane
0,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0
3,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,1
4,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1
5,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
6,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
7,1,1,1,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0
8,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,1,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
9,1,0,1,0,0,0,0,0,0,0,1,1,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
df

Unnamed: 0.1,Unnamed: 0,query_id,query,Rank,Score,Doc_id,Document
45,45,844658,what is the role of lipids in a cell,28,10.3397,6159679,Answers. Best Answer: The cell membrane is pri...
46,46,844658,what is the role of lipids in a cell,25,10.5129,2043703,The membrane that surrounds a cell is made up ...
47,47,844658,what is the role of lipids in a cell,18,10.8006,7129659,Lipids Examples. Lipids are the fatty or waxy ...
48,48,844658,what is the role of lipids in a cell,17,10.808,5480522,"The Lipid Bilayer, page 2. The lipid bilayer i..."
49,49,844658,what is the role of lipids in a cell,15,10.8396,1762072,Lipid Bilayer Structure. The lipid bilayer is ...


In [None]:
doc_ranks = np.array(df[['Rank']]).flatten()

doc_ranks

array([28, 25, 18, 17, 15])

In [None]:
T = X[23:24]
a = []
for column in T.columns:
    if T[column].any():
        a.append(column)
a

['cell',
 'lipid',
 'abundant',
 'plays',
 'body',
 'role',
 'bad',
 'hormones',
 'cholesterol',
 'adrenaline']

In [None]:
doc_fv = X[23:24]
doc_fv

Unnamed: 0,cell,critical,lipid,crucial,plasma,abundant,rafts,immunity,cellular,adhesion,boundaries,cells,carbohydrate,abundance,eukaryotic,proteins,structure,allergic,plays,atoms,vacuoles,hydrogen,john,component,plant,biomolecules,elements,activities,vacuole,carbon,components,carbohydrates,signalling,called,body,actually,water,human,basic,20,bile,oxygen,prominent,maintaining,include,assist,role,bad,building,hormones,fact,cholesterol,changing,article,80,lipids,agents,percent,white,allows,membraneâ,certain,homeostasis,depending,consists,attached,barrier,believed,carrier,change,molecule,group,just,form,energy,fats,phospholipid,markers,terms,fatty,adrenaline,50,serve,functions,make,animal,aka,provide,antigen,examples,blocks,storage,important,forming,acids,counterparts,membranes,function,bilayer,explains,biochemical,contributes,new,receptor,acid,boundary,add,composed,bcr,fluid,asthma,molecules,location,activation,formation,biosynthesis,region,blood,membrane
23,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:

doc_fv = X[23:24]
d = dice_ml.Data(dataframe=dataset, continuous_features=[], outcome_name='true_label')
m = dice_ml.Model(model=model1, backend="sklearn")
exp = dice_ml.Dice(d, m, method="random")
e1 = exp.generate_counterfactuals(doc_fv, total_CFs=1, desired_class="opposite")
xx=json.loads(e1.to_json())
cf = xx["cfs_list"][0][0]



100%|██████████| 1/1 [00:01<00:00,  1.04s/it]


In [None]:
final_cf = cf[:-1]

print(final_cf)

['0', '1', 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, '0', 0, 0, 0, '0', 0, 0, 1, 1, 0, 1, '1', 1, 0, 0, 0, 0, 0, 0, 0, 0, '1', 0, 0, 0, 0, 0, 0, 0, 0, 0, '0', 0, 0, 0, '1', 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, '1', 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, '0', '1', 0, 0, 0, 0, '1', 0, 0, 0, 0, 0, 0, 0, 0, 0, '1', 0]


In [None]:
cf_df = pd.DataFrame([final_cf], columns=X.columns)
cf_df



Unnamed: 0,cell,critical,lipid,crucial,plasma,abundant,rafts,immunity,cellular,adhesion,boundaries,cells,carbohydrate,abundance,eukaryotic,proteins,structure,allergic,plays,atoms,vacuoles,hydrogen,john,component,plant,biomolecules,elements,activities,vacuole,carbon,components,carbohydrates,signalling,called,body,actually,water,human,basic,20,bile,oxygen,prominent,maintaining,include,assist,role,bad,building,hormones,fact,cholesterol,changing,article,80,lipids,agents,percent,white,allows,membraneâ,certain,homeostasis,depending,consists,attached,barrier,believed,carrier,change,molecule,group,just,form,energy,fats,phospholipid,markers,terms,fatty,adrenaline,50,serve,functions,make,animal,aka,provide,antigen,examples,blocks,storage,important,forming,acids,counterparts,membranes,function,bilayer,explains,biochemical,contributes,new,receptor,acid,boundary,add,composed,bcr,fluid,asthma,molecules,location,activation,formation,biosynthesis,region,blood,membrane
0,0,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,1,0,1,1,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0


In [None]:
aa = []
for column in cf_df.columns:
    if cf_df[column].any():
        aa.append(column)
aa

['cell',
 'critical',
 'lipid',
 'abundant',
 'plays',
 'body',
 '20',
 'maintaining',
 'role',
 'bad',
 'hormones',
 'fact',
 'cholesterol',
 'membraneâ',
 'molecule',
 'energy',
 'adrenaline',
 'blocks',
 'contributes',
 'new',
 'composed',
 'blood']

In [None]:
def find_words(arr):
  a = []
  for column in arr.columns:
    if arr[column].any():
        a.append(column)
  return a

In [None]:
df

Unnamed: 0.1,Unnamed: 0,query_id,query,Rank,Score,Doc_id,Document
45,45,844658,what is the role of lipids in a cell,28,10.3397,6159679,Answers. Best Answer: The cell membrane is pri...
46,46,844658,what is the role of lipids in a cell,25,10.5129,2043703,The membrane that surrounds a cell is made up ...
47,47,844658,what is the role of lipids in a cell,18,10.8006,7129659,Lipids Examples. Lipids are the fatty or waxy ...
48,48,844658,what is the role of lipids in a cell,17,10.808,5480522,"The Lipid Bilayer, page 2. The lipid bilayer i..."
49,49,844658,what is the role of lipids in a cell,15,10.8396,1762072,Lipid Bilayer Structure. The lipid bilayer is ...


In [None]:
new_pred = []
new_cf = []
words_new_dice = []
for i in doc_ranks:
  doc_fv = X[i-1:i]
  d = dice_ml.Data(dataframe=dataset, continuous_features=[], outcome_name='true_label')
  m = dice_ml.Model(model=model1, backend="sklearn")
  exp = dice_ml.Dice(d, m, method="random")
  e1 = exp.generate_counterfactuals(doc_fv, total_CFs=1, desired_class="opposite")
  xx=json.loads(e1.to_json())
  cf = xx["cfs_list"][0][0]
  cf = cf[:-1]
  cf_df = pd.DataFrame([cf], columns=X.columns)
  class_out = model1.predict(cf_df)

  new_pred.append(class_out)




  jsontest = json.loads(hits[i-1].raw)
  texts = (jsontest["contents"][:1000])
  words_doc_fv = find_words(doc_fv)
  words_cf = find_words(cf_df)
  words_new = list(set(words_cf) - set(words_doc_fv))
  words_new = ' '.join(words_new)
  words_new_dice.append(words_new)
  new_cff = texts + words_new
  new_cf.append(new_cff)







100%|██████████| 1/1 [00:00<00:00,  2.50it/s]
100%|██████████| 1/1 [00:00<00:00,  1.14it/s]
100%|██████████| 1/1 [00:00<00:00,  1.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.49it/s]
100%|██████████| 1/1 [00:00<00:00,  2.57it/s]


In [None]:
print(new_pred)
print(new_cf)
print(words_new_dice)

[array([1]), array([1]), array([1]), array([1]), array([0])]
['Answers. Best Answer: The cell membrane is primarily composed of a mix of proteins and lipids. Depending on the membraneâ\x80\x99s location and role in the body, lipids can make up anywhere from 20 to 80 percent of the membrane, with the remainder being proteins.epending on the membraneâ\x80\x99s location and role in the body, lipids can make up anywhere from 20 to 80 percent of the membrane, with the remainder being proteins.molecules group', 'The membrane that surrounds a cell is made up of proteins and lipids. Depending on the membraneâ\x80\x99s location and role in the body, lipids can make up anywhere from 20 to 80 percent of the membrane, with the remainder being proteins. Cholesterol, which is not found in plant cells, is a type of lipid that helps stiffen the membrane.change hormones percent basic immunity changing allows', 'Lipids Examples. Lipids are the fatty or waxy molecules that make up the basic structure of 

In [None]:
df['Classifier_Prediction'] = [item for sublist in new_pred for item in sublist]
df['CF_generatedd'] = new_cf
df['New_words'] = words_new_dice
df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['Classifier_Prediction'] = [item for sublist in new_pred for item in sublist]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['CF_generatedd'] = new_cf
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['New_words'] = words_new_dice


Unnamed: 0.1,Unnamed: 0,query_id,query,Rank,Score,Doc_id,Document,Classifier_Prediction,CF_generatedd,New_words
45,45,844658,what is the role of lipids in a cell,28,10.3397,6159679,Answers. Best Answer: The cell membrane is pri...,1,Answers. Best Answer: The cell membrane is pri...,molecules group
46,46,844658,what is the role of lipids in a cell,25,10.5129,2043703,The membrane that surrounds a cell is made up ...,1,The membrane that surrounds a cell is made up ...,change hormones percent basic immunity changin...
47,47,844658,what is the role of lipids in a cell,18,10.8006,7129659,Lipids Examples. Lipids are the fatty or waxy ...,1,Lipids Examples. Lipids are the fatty or waxy ...,molecule components carbohydrates blocks prote...
48,48,844658,what is the role of lipids in a cell,17,10.808,5480522,"The Lipid Bilayer, page 2. The lipid bilayer i...",1,"The Lipid Bilayer, page 2. The lipid bilayer i...",critical counterparts
49,49,844658,what is the role of lipids in a cell,15,10.8396,1762072,Lipid Bilayer Structure. The lipid bilayer is ...,0,Lipid Bilayer Structure. The lipid bilayer is ...,fluid


In [None]:
df.to_csv(f'query{no}.csv')

In [None]:
filenames = ["/content/query0.csv", "/content/query1.csv", "/content/query2.csv","/content/query3.csv",'/content/query4.csv','/content/query5.csv','/content/query6.csv','/content/query7.csv','/content/query8.csv','/content/query9.csv']  # Add more filenames to this list as needed

combined_cf_df = pd.concat([pd.read_csv(f) for f in filenames], ignore_index=True)
combined_cf_df

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,query_id,query,Rank,Score,Doc_id,Document,Classifier_Prediction,CF_generatedd,New_words
0,0,0,992839,definition of curfew,24,8.0823,3807532,What curfew rules should you set for your teen...,1,What curfew rules should you set for your teen...,lift definition inside 27 00 cambridge day 2a
1,1,1,992839,definition of curfew,17,8.266,8191267,"An additional 6 percent, or 23 cities, were co...",0,"An additional 6 percent, or 23 cities, were co...",designated appears consequences children 53 00...
2,2,2,992839,definition of curfew,12,8.6025,5207344,The definition of universal is relating to or ...,1,The definition of universal is relating to or ...,county designated applying days authorities ac...
3,3,3,992839,definition of curfew,26,8.044699,3807525,Curfew rules can be adjusted as your teen grow...,0,Curfew rules can be adjusted as your teen grow...,dictionary academic effect 25 00 cambridge pm ...
4,4,4,992839,definition of curfew,19,8.1833,2620523,Discuss is defined as to talk about and to con...,0,Discuss is defined as to talk about and to con...,14th enacted 37 activities 00 cambridge 1897
5,5,5,1013965,"what type of wave is electromagnetic,",15,12.8568,2646538,FIND OUT MORE. Many different kinds of energy ...,1,FIND OUT MORE. Many different kinds of energy ...,rays classified visible electric highest trans...
6,6,6,1013965,"what type of wave is electromagnetic,",19,12.7204,6615903,The electromagnetic spectrum contains many typ...,0,The electromagnetic spectrum contains many typ...,include properties air including 10 field dete...
7,7,7,1013965,"what type of wave is electromagnetic,",24,12.6659,752147,Sound waves and electromagnetic waves are diff...,1,Sound waves and electromagnetic waves are diff...,video 10 causes detection called right definit...
8,8,8,1013965,"what type of wave is electromagnetic,",17,12.7295,1742690,"In transverse waves, the oscillations (vibrati...",1,"In transverse waves, the oscillations (vibrati...",according vacuum
9,9,9,1013965,"what type of wave is electromagnetic,",18,12.7237,1839,Difference Between Electromagnetic Waves and R...,1,Difference Between Electromagnetic Waves and R...,beach frequency devices extremely magnetic gir...


In [None]:
combined_cf_df.to_csv("combined_cf.csv", index=False)
