In [1]:
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Load the document
loader = TextLoader('speech.txt')
document = loader.load()

# Split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=25)
docs = text_splitter.split_documents(document)

# Initialize the embedding with a valid model
try:
    embedding = OllamaEmbeddings(model_name='llama2')  # Specify the correct model name
except ValueError as e:
    print(f"Error initializing embedding: {e}")
    print("Exiting due to failure in model initialization.")
    exit(1)

# Create the FAISS vector store from documents
try:
    db = FAISS.from_documents(docs, embedding)
except ValueError as e:
    print(f"Error creating FAISS vector store: {e}")
    print("Exiting due to failure in creating FAISS vector store.")
    exit(1)

print("FAISS vector store created successfully.")


Error initializing embedding: 1 validation error for OllamaEmbeddings
model_name
  extra fields not permitted (type=value_error.extra)
Exiting due to failure in model initialization.


NameError: name 'embedding' is not defined

In [2]:
import logging
from langchain_community.document_loaders import TextLoader
from langchain_community.vectorstores import FAISS
from langchain_community.embeddings import OllamaEmbeddings
from langchain_text_splitters import CharacterTextSplitter

# Configure logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Load the document
loader = TextLoader('speech.txt')
document = loader.load()

# Split the document into chunks
text_splitter = CharacterTextSplitter(chunk_size=1000, chunk_overlap=25)
docs = text_splitter.split_documents(document)

# Initialize the embedding
try:
    embedding = OllamaEmbeddings(model='gemma:2b')  # Initialize without model_name
    logger.info("Embedding model initialized successfully.")
except ValueError as e:
    logger.error(f"Error initializing embedding: {e}")
    logger.info("Exiting due to failure in model initialization.")
    exit(1)

# Create the FAISS vector store from documents
try:
    db = FAISS.from_documents(docs, embedding)
    logger.info("FAISS vector store created successfully.")
except ValueError as e:
    logger.error(f"Error creating FAISS vector store: {e}")
    if "404" in str(e):
        logger.error("Model not found. Please ensure the model is available.")
    logger.info("Exiting due to failure in creating FAISS vector store.")
    exit(1)


INFO:__main__:Embedding model initialized successfully.
INFO:faiss.loader:Loading faiss with AVX2 support.
INFO:faiss.loader:Successfully loaded faiss with AVX2 support.
INFO:__main__:FAISS vector store created successfully.


In [3]:
query='what does speaker believe to'


In [4]:
dosc=db.similarity_search(query)

In [5]:
print(docs)

[Document(metadata={'source': 'speech.txt'}, page_content='The world must be made safe for democracy. Its peace must be planted upon the tested foundations of political liberty. We have no selfish ends to serve. We desire no conquest, no dominion. We seek no indemnities for ourselves, no material compensation for the sacrifices we shall freely make. We are but one of the champions of the rights of mankind. We shall be satisfied when those rights have been made as secure as the faith and the freedom of nations can make them.\n\nJust because we fight without rancor and without selfish object, seeking nothing for ourselves but what we shall wish to share with all free peoples, we shall, I feel confident, conduct our operations as belligerents without passion and ourselves observe with proud punctilio the principles of right and of fair play we profess to be fighting for.\n\n…'), Document(metadata={'source': 'speech.txt'}, page_content='…\n\nIt will be all the easier for us to conduct ours

In [6]:
# this allow it to work with other langchain method
retiver=db.as_retriever()
docs=retiver.invoke(query)
docs[0].page_content

'We have borne with their present government through all these bitter months because of that friendship—exercising a patience and forbearance which would otherwise have been impossible. We shall, happily, still have an opportunity to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors and to the government in the hour of test. They are, most of them, as true and loyal Americans as if they had never known any other fealty or allegiance. They will be prompt to stand with us in rebuking and restraining the few who may be of a different mind and purpose. If there should be disloyalty, it will be dealt with with a firm hand of stern repression; but, if it lifts its head at all, it will lift it only here and there and without countenance except from a lawless and malignant few.'

In [7]:
doc_score=db.similarity_search_with_score(query)
print(doc_score)

[(Document(metadata={'source': 'speech.txt'}, page_content='We have borne with their present government through all these bitter months because of that friendship—exercising a patience and forbearance which would otherwise have been impossible. We shall, happily, still have an opportunity to prove that friendship in our daily attitude and actions toward the millions of men and women of German birth and native sympathy who live among us and share our life, and we shall be proud to prove it toward all who are in fact loyal to their neighbors and to the government in the hour of test. They are, most of them, as true and loyal Americans as if they had never known any other fealty or allegiance. They will be prompt to stand with us in rebuking and restraining the few who may be of a different mind and purpose. If there should be disloyalty, it will be dealt with with a firm hand of stern repression; but, if it lifts its head at all, it will lift it only here and there and without countenanc

In [8]:
embedding_vector=embedding.embed_documents(query)
embedding_vector

[[-0.5765013694763184,
  -1.7552490234375,
  0.014628933742642403,
  -0.02671516314148903,
  -0.020177148282527924,
  -0.5226941108703613,
  0.5958667993545532,
  0.07543151080608368,
  3.0704197883605957,
  -1.614870548248291,
  0.6001216173171997,
  -0.7855664491653442,
  -0.23010052740573883,
  0.8946802020072937,
  0.651324450969696,
  1.232951283454895,
  0.5071627497673035,
  -1.9598907232284546,
  0.46549561619758606,
  -0.8398616313934326,
  -0.030219009146094322,
  -0.011305670253932476,
  -0.6374347805976868,
  -1.218986988067627,
  0.07336674630641937,
  1.755157709121704,
  3.0102014541625977,
  -1.0671836137771606,
  2.1704540252685547,
  1.9374130964279175,
  -0.2982863187789917,
  0.6008247137069702,
  0.20298047363758087,
  -0.14382335543632507,
  0.03152168542146683,
  -1.6451146602630615,
  1.748512625694275,
  -0.6271026134490967,
  0.14330480992794037,
  -0.09662926197052002,
  -0.4357972741127014,
  -2.6340980529785156,
  0.8591344952583313,
  0.5270428657531738,
 

In [11]:
docs_score_vector=db.similarity_search_by_vector(embedding_vector)

In [10]:
embedding_vector=embedding.embed_query(query)
embedding_vector

[-0.12818007171154022,
 -1.7380225658416748,
 -0.6366466283798218,
 2.2799556255340576,
 0.44607070088386536,
 1.0427672863006592,
 0.31115421652793884,
 -0.0669110119342804,
 0.05927254259586334,
 0.6438223719596863,
 1.1168874502182007,
 0.6517317891120911,
 1.8048912286758423,
 0.1694083958864212,
 0.5401955246925354,
 -0.1454387605190277,
 7.449321746826172,
 1.5383806228637695,
 -0.0141797736287117,
 -0.37289246916770935,
 -0.391854852437973,
 0.30739471316337585,
 0.8384457230567932,
 0.010279931128025055,
 -0.43966078758239746,
 0.10551584511995316,
 -0.028239896520972252,
 -0.5971493124961853,
 -0.008015882223844528,
 -0.3118875324726105,
 0.49002113938331604,
 0.7042258977890015,
 -0.41978713870048523,
 0.06343997269868851,
 -0.9439117312431335,
 -0.45556068420410156,
 -0.24267731606960297,
 -0.5116889476776123,
 0.35439521074295044,
 -1.0039399862289429,
 0.6277124881744385,
 0.5890437960624695,
 -0.05449380353093147,
 -1.488599181175232,
 -1.2997413873672485,
 1.146688818931

In [None]:
['male', 'female']

In [13]:
from sklearn.feature_extraction.text import CountVectorizer

# Sample text data
documents = [
    "Python is an amazing programming language and java programming.",
    "Python is great for data science.",
    "I love coding in Python.",
    "Machine learning and data science are fascinating."
]

# Create a CountVectorizer instance
vectorizer = CountVectorizer()

# Fit and transform the documents to get the BoW representation
X = vectorizer.fit_transform(documents)

# Get feature names (words)
feature_names = vectorizer.get_feature_names_out()

# Convert the result to an array
X_array = X.toarray()

# Display the BoW representation
print("Feature names:", feature_names)
print("BoW representation (as array):")
print(X_array)


Feature names: ['amazing' 'an' 'and' 'are' 'coding' 'data' 'fascinating' 'for' 'great'
 'in' 'is' 'java' 'language' 'learning' 'love' 'machine' 'programming'
 'python' 'science']
BoW representation (as array):
[[1 1 1 0 0 0 0 0 0 0 1 1 1 0 0 0 2 1 0]
 [0 0 0 0 0 1 0 1 1 0 1 0 0 0 0 0 0 1 1]
 [0 0 0 0 1 0 0 0 0 1 0 0 0 0 1 0 0 1 0]
 [0 0 1 1 0 1 1 0 0 0 0 0 0 1 0 1 0 0 1]]
