In [9]:
import torch
from transformers import BertTokenizer, BertModel
from torch.nn.functional import cosine_similarity

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example input texts
text1 = "this is the first text."
text2 = "he is a eggplant"

# Tokenize the input texts
tokens = tokenizer([text1, text2], padding=True, truncation=True, return_tensors='pt')

# Generate the BERT embeddings
outputs = model(**tokens)
embeddings = outputs.last_hidden_state

# Calculate similarity using cosine similarity
similarity = cosine_similarity(embeddings, embeddings, dim=1)

print(f"Similarity matrix:\n{similarity}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Similarity matrix:
tensor([[1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000],
        [1.0000, 1.0000, 1.0000,  ..., 1.0000, 1.0000, 1.0000]],
       grad_fn=<SumBackward1>)


In [5]:
from transformers import BertTokenizer, BertModel

# Load the tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Example input texts
text1 = "This is the first text."
text2 = "This is the second text."

# Tokenize the input texts
tokens1 = tokenizer.encode_plus(text1, padding=True, truncation=True, return_tensors="pt")
tokens2 = tokenizer.encode_plus(text2, padding=True, truncation=True, return_tensors="pt")

# Forward pass through the model
outputs1 = model(**tokens1)
outputs2 = model(**tokens2)

# Get the embeddings from the model outputs
embeddings1 = outputs1.last_hidden_state
embeddings2 = outputs2.last_hidden_state

# Compare the embeddings (e.g., using cosine similarity)
similarity = torch.cosine_similarity(embeddings1, embeddings2)
print(similarity)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


tensor([[ 0.9761,  0.8488,  0.8985,  0.9620,  0.9672,  0.9361,  0.9398,  0.9939,
          0.9560,  0.8898,  0.9228,  0.9933,  0.9483,  0.6590,  0.7932,  0.9931,
          0.9688,  0.4430,  0.9622,  0.7619,  0.8228,  0.8193,  0.9656,  0.7524,
          0.9957,  0.7742,  0.9358,  0.4830,  0.7489,  0.9480,  0.9630,  0.8613,
          0.9103,  0.6640,  0.7027,  0.9062,  0.9872,  0.9091,  0.9371,  0.9601,
          0.9855,  0.9413,  0.9488,  0.9107,  0.9941,  0.9442,  0.9678,  0.9032,
          0.8676,  0.9703,  0.9815,  0.8849,  0.9315,  0.9265,  0.9906,  0.9809,
          0.9113,  0.9776,  0.9409,  0.3558,  0.9711,  0.9530,  0.8570,  0.9202,
          0.9499,  0.7722,  0.9653,  0.9917,  0.9937,  0.9363,  0.9727,  0.9850,
          0.8450,  0.8572,  0.9257,  0.8860,  0.9347,  0.9764,  0.9852,  0.9748,
          0.9712,  0.9924,  0.8240,  0.9288,  0.9596,  0.7782,  0.9809,  0.8212,
          0.8698,  0.9802,  0.8960,  0.9630,  0.8945,  0.7831,  0.7525,  0.9963,
          0.4533,  0.6399,  

In [16]:
from transformers import pipeline
unmasker = pipeline('fill-mask', model='bert-base-uncased')
unmasker("Rahul is well versed in [MASK] and dancing")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


[{'score': 0.43587255477905273,
  'token': 2189,
  'token_str': 'music',
  'sequence': 'rahul is well versed in music and dancing'},
 {'score': 0.27947288751602173,
  'token': 4823,
  'token_str': 'singing',
  'sequence': 'rahul is well versed in singing and dancing'},
 {'score': 0.1585802137851715,
  'token': 3772,
  'token_str': 'acting',
  'sequence': 'rahul is well versed in acting and dancing'},
 {'score': 0.0125444820150733,
  'token': 3689,
  'token_str': 'drama',
  'sequence': 'rahul is well versed in drama and dancing'},
 {'score': 0.011465097777545452,
  'token': 2299,
  'token_str': 'song',
  'sequence': 'rahul is well versed in song and dancing'}]

In [18]:
from transformers import BertTokenizer, BertModel
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained("bert-base-uncased")
text = "Replace me by any text you'd like."
encoded_input = tokenizer(text, return_tensors='pt')
output = model(**encoded_input)
print(output)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.1386,  0.1583, -0.2967,  ..., -0.2708, -0.2844,  0.4581],
         [ 0.5364, -0.2327,  0.1754,  ...,  0.5540,  0.4981, -0.0024],
         [ 0.3002, -0.3475,  0.1208,  ..., -0.4562,  0.3288,  0.8773],
         ...,
         [ 0.3799,  0.1203,  0.8283,  ..., -0.8624, -0.5957,  0.0471],
         [-0.0252, -0.7177, -0.6950,  ...,  0.0757, -0.6668, -0.3401],
         [ 0.7535,  0.2391,  0.0717,  ...,  0.2467, -0.6458, -0.3213]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.9377, -0.5043, -0.9799,  0.9030,  0.9329, -0.2438,  0.8926,  0.2288,
         -0.9531, -1.0000, -0.8862,  0.9906,  0.9855,  0.7155,  0.9455, -0.8645,
         -0.6035, -0.6666,  0.3020, -0.1587,  0.7455,  1.0000, -0.4022,  0.4261,
          0.6151,  0.9996, -0.8773,  0.9594,  0.9585,  0.6950, -0.6718,  0.3325,
         -0.9954, -0.2268, -0.9658, -0.9951,  0.6127, -0.7670,  0.0873,  0.0824,
         -0.9518,  0.4713,  1.00

In [5]:
from transformers import AutoTokenizer, AutoModel
from scipy.spatial.distance import cosine

# Load the tokenizer and model
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')
model = AutoModel.from_pretrained('bert-base-uncased')

# Example sentences
sentence1 = "The cat sits on the mat."
sentence2 = "The dog plays in the park."

# Tokenize and encode the sentences
tokens = tokenizer([sentence1, sentence2], padding=True, truncation=True, return_tensors='pt')
outputs = model(**tokens)

# # Get the sentence embeddings
embeddings = outputs.last_hidden_state.mean(dim=1)  # Average pooling of token embeddings

# # Calculate the cosine similarity between the sentence embeddings
similarity = 1 - cosine(embeddings[0], embeddings[1])

# print(f"Similarity: {similarity:.4f}")


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


RuntimeError: Can't call numpy() on Tensor that requires grad. Use tensor.detach().numpy() instead.

In [6]:
from setuptools import setup, find_packages

long_description = '''
Easily train your own text-generating neural network of
any size and complexity on any text dataset with a few lines
of code, or quickly train on a text using a pretrained model.

- A modern neural network architecture which utilizes new techniques as
attention-weighting and skip-embedding to accelerate training
and improve model quality.
- Able to train on and generate text at either the
character-level or word-level.
- Able to configure RNN size, the number of RNN layers,
and whether to use bidirectional RNNs.
- Able to train on any generic input text file, including large files.
- Able to train models on a GPU and then use them with a CPU.
- Able to utilize a powerful CuDNN implementation of RNNs
when trained on the GPU, which massively speeds up training time as
opposed to normal LSTM implementations.
- Able to train the model using contextual labels,
allowing it to learn faster and produce better results in some cases.
- Able to generate text interactively for customized stories.
'''


setup(
    name='textgenrnn',
    packages=['textgenrnn'],  # this must be the same as the name above
    version='2.0.0',
    description='Easily train your own text-generating neural network ' \
    'of any size and complexity',
    long_description=long_description,
    long_description_content_type='text/markdown',
    author='Max Woolf',
    author_email='max@minimaxir.com',
    url='https://github.com/minimaxir/textgenrnn',
    keywords=['deep learning', 'tensorflow', 'keras', 'text generation'],
    classifiers=[],
    license='MIT',
    python_requires='>=3.5',
    include_package_data=True,
    install_requires=['h5py', 'scikit-learn', 'tqdm', 'tensorflow>=2.1.0']
)

AssertionError: 

In [1]:
import sys

locate_python = sys.exec_prefix

print(locate_python)

c:\Users\Hp\AppData\Local\Programs\Python\Python310


In [22]:
# import re
import torch
import whisper
from typing import NewType
import warnings
import timeit
from functools import lru_cache
import warnings
# warnings.filterwarnings("ignore", category=DeprecationWarning)

# path_of_audio = NewType('path_of_i_th_audio',str)

def banner(text):
    # """Display a message when the script is working in the background"""
    print(f"# {text} #\n")


def check_device():
    
    # """Check CUDA availability."""
    if torch.cuda.is_available() == 1:
        device = "cuda"
        
    else:
        device = "cpu"
    print(device)
    return device

# """Get speech recognition model."""
# model_name = input("Select speech recognition model name (tiny, base, small, medium, large): ")
@lru_cache(maxsize=None)  # Decorator to enable caching
def load_model(model_name):
    
    return whisper.load_model(model_name, device=check_device())

def convertText(AUDIOFILE):
    
    # warnings.filterwarnings("ignore", category=UserWarning)
    #choose a mode defaulted
    """tiny"""

    model_name = "base"

    banner("Transcribing texts")
    model = load_model(model_name)
        
    # for i in range(0,len(AUDIOFILE)):
        
    result = model.transcribe(AUDIOFILE)
        
    # print("Result: ",result["text"])
    
        
    # warnings.resetwarnings()
    return result["text"]
    
    # return list_temp
    
# measure execution time of my_function
print(convertText(r"D:\Projects and codes\interview\resources\extinsion_interview\out1.mp3"))


# Transcribing texts #

cuda
 Artificial intelligence is a branch of computer science and engineering that focuses on developing intelligent machines capable of performing quantitative tasks that are traditionally associated with human beings. Data structure is a collection of data elements that are organized in a particular way to facilitate efficient processing of data, different types of data structure or stutter to different kinds of applications depending on the nature of data and the operation that need to be performed. Data planning is a type of artificial intelligence that relies on artificial neural network with multiple layers to process and analyze large amount of data. These networks are trained using algorithms that adjust the weights and biases of the connections between neurons to optimize the performance on a specific task. Machine learning is a process of teaching computer to learn from the data without being explicitly programmed by analyzing and finding patterns in a

In [12]:
# model.py

import torch
import whisper

class SpeechRecognitionModel:
    _instance = None

    def __new__(cls, *args, **kwargs):
        if not cls._instance:
            cls._instance = super(SpeechRecognitionModel, cls).__new__(cls)
        return cls._instance


    def __init__(self, model_name):
        if not hasattr(self, "model"):
            self.model = whisper.load_model(model_name, device=self.check_device())

    def check_device(self):
        if torch.cuda.is_available() == 1:
            return "cuda"
        else:
            return "cpu"

In [13]:
# transcriber.py

# from model import SpeechRecognitionModel

def convertText(AUDIOFILE):
    list_temp = []
    model_name = "base"
    model = SpeechRecognitionModel(model_name)
    banner("Transcribing texts")
    result = model.model.transcribe(AUDIOFILE)
    list_temp.append(result["text"])
    return list_temp

In [18]:
convertText(r"D:\Projects and codes\interview\resources\extinsion_interview\out1.mp3")

# Transcribing texts #



[' Artificial intelligence is a branch of computer science and engineering that focuses on developing intelligent machines capable of performing quantitative tasks that are traditionally associated with human beings. Data structure is a collection of data elements that are organized in a particular way to facilitate efficient processing of data, different types of data structure or stutter to different kinds of applications depending on the nature of data and the operation that need to be performed. Data planning is a type of artificial intelligence that relies on artificial neural network with multiple layers to process and analyze large amount of data. These networks are trained using algorithms that adjust the weights and biases of the connections between neurons to optimize the performance on a specific task. Machine learning is a process of teaching computer to learn from the data without being explicitly programmed by analyzing and finding patterns in a large dataset. Machine lea

In [19]:
convertText(r"D:\Projects and codes\interview\resources\extinsion_interview\out1.mp3")

# Transcribing texts #



[' Artificial intelligence is a branch of computer science and engineering that focuses on developing intelligent machines capable of performing quantitative tasks that are traditionally associated with human beings. Data structure is a collection of data elements that are organized in a particular way to facilitate efficient processing of data, different types of data structure or stutter to different kinds of applications depending on the nature of data and the operation that need to be performed. Data planning is a type of artificial intelligence that relies on artificial neural network with multiple layers to process and analyze large amount of data. These networks are trained using algorithms that adjust the weights and biases of the connections between neurons to optimize the performance on a specific task. Machine learning is a process of teaching computer to learn from the data without being explicitly programmed by analyzing and finding patterns in a large dataset. Machine lea

In [10]:
from pydub import AudioSegment
import numpy as np

def remove_silence(audio_file, silence_threshold=30):
    audio = AudioSegment.from_file(audio_file)

    # Convert audio to numpy array
    audio_data = np.array(audio.get_array_of_samples())

    # Calculate the amplitude of the audio
    audio_amplitude = np.abs(audio_data)

    # Calculate the threshold in dBFS (decibels relative to full scale)
    threshold = audio.rms + silence_threshold

    # Find the indices where the audio is above the threshold
    non_silent_indices = np.where(audio_amplitude > threshold)[0]

    # Find the start and end indices of the non-silent regions
    start_index = non_silent_indices[0]
    end_index = non_silent_indices[-1]

    # Trim the audio based on the start and end indices
    trimmed_audio = audio[start_index:end_index]

    return trimmed_audio

# Usage
audio_file = r"D:\Projects and codes\interview\resources\extinsion_interview\out.mp3"
trimmed_audio = remove_silence(audio_file)
trimmed_audio.export(r"D:\Projects and codes\interview\resources\extinsion_interview\out_trimed.wav", format="WAV")


<_io.BufferedRandom name='D:\\Projects and codes\\interview\\resources\\extinsion_interview\\out_trimed.wav'>

In [16]:
from pydub import AudioSegment
from pydub.silence import detect_leading_silence, detect_leading_silence

def trim_silence(input_file, output_file, silence_threshold=-25.0, silence_duration=500):
    audio = AudioSegment.from_file(input_file)

    # Trim silence from the beginning
    start_trim = detect_leading_silence(audio, silence_threshold=silence_threshold)
    audio_trimmed = audio[start_trim:]

    # Trim silence from the end
    end_trim = detect_leading_silence(audio_trimmed.reverse(), silence_threshold=silence_threshold)
    audio_trimmed = audio_trimmed[:len(audio_trimmed)-end_trim]

    audio_trimmed.export(output_file, format='mp3')

# Example usage
input_file = r"D:\Projects and codes\interview\resources\extinsion_interview\out.mp3"
output_file = r"D:\Projects and codes\interview\resources\extinsion_interview\out_trimmed.mp3"
trim_silence(input_file, output_file)


In [1]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(embedding1, embedding2):
    # Calculate cosine similarity between two embeddings
    similarity_score = cosine_similarity([embedding1], [embedding2])[0][0]
    return similarity_score


In [3]:
import openai

# Set up OpenAI API credentials
openai.api_key = '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'

def getScore(sentence1, sentence2):
    # Your preprocessing code here
    
    # Create embeddings
    response = openai.Embedding.create(
        input=[sentence1, sentence2],
        engine="text-similarity-davinci-001"
    )
    
    # Process the response
    embeddings = response['embeddings']
    print(embeddings)

getScore("Rahul is a vertain in computer science","Rahul hates summer season and rainy seasons")


RateLimitError: You exceeded your current quota, please check your plan and billing details.

In [2]:
from google.cloud import speech

def transcribe_audio(audio_path):
    client = speech.SpeechClient()

    # Read the audio file
    with open(audio_path, "rb") as audio_file:
        content = audio_file.read()

    # Configure audio settings
    audio = speech.RecognitionAudio(content=content)
    config = speech.RecognitionConfig(
        encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
        sample_rate_hertz=44100,
        language_code="en-US",
    )

    # Request transcription
    response = client.recognize(config=config, audio=audio)

    # Retrieve and format the transcript
    transcript = ""
    for result in response.results:
        transcript += result.alternatives[0].transcript

    return transcript


# Usage
audio_url = r"D:\Projects and codes\interview\resources\extinsion_interview\out.wav"
transcript = transcribe_audio(audio_url)
print("Transcript:", transcript)


DefaultCredentialsError: Your default credentials were not found. To set up Application Default Credentials, see https://cloud.google.com/docs/authentication/external/set-up-adc for more information.

In [2]:
# import re
import torch
import whisper
from typing import NewType
import warnings
import timeit
from functools import lru_cache
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning)

# path_of_audio = NewType('path_of_i_th_audio',str)

def banner(text):
    # """Display a message when the script is working in the background"""
    print(f"# {text} #\n")


def check_device():
    
    # """Check CUDA availability."""
    if torch.cuda.is_available() == 1:
        device = "cuda"
        
    else:
        device = "cpu"
        
    return device
    # return "cpu"

# """Get speech recognition model."""
# model_name = input("Select speech recognition model name (tiny, base, small, medium, large): ")
# @lru_cache(maxsize=None)  # Decorator to enable caching
def load_model(model_name):
    return whisper.load_model(model_name, device=check_device())

def convertText(AUDIOFILE):
    
    # list_temp=[]
    warnings.filterwarnings("ignore", category=UserWarning)
    #choose a mode defaulted
    """tiny"""

    model_name = "base"

    banner("Transcribing texts")
    model = load_model(model_name)
        
    # for i in range(0,len(AUDIOFILE)):
        
    result = model.transcribe(AUDIOFILE)
    warnings.resetwarnings()
    # print("Result: ",result["text"])
    return result["text"]
        
   
    
    # return list_temp
    
# measure execution time of my_function
print(convertText(r"D:\Projects and codes\interview\resources\extinsion_interview\out1.mp3"))


# Transcribing texts #

 Artificial intelligence is a branch of computer science and engineering that focuses on developing intelligent machines capable of performing quantitative tasks that are traditionally associated with human beings. Data structure is a collection of data elements that are organized in a particular way to facilitate efficient processing of data, different types of data structure or stutter to different kinds of applications depending on the nature of data and the operation that need to be performed. Data planning is a type of artificial intelligence that relies on artificial neural network with multiple layers to process and analyze large amount of data. These networks are trained using algorithms that adjust the weights and biases of the connections between neurons to optimize the performance on a specific task. Machine learning is a process of teaching computer to learn from the data without being explicitly programmed by analyzing and finding patterns in a larg

In [9]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')





[A[A[A[A

ConnectionError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out.

In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForQuestionAnswering

tokenizer = AutoTokenizer.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")
model = AutoModelForQuestionAnswering.from_pretrained("valhalla/longformer-base-4096-finetuned-squadv1")

text = "Huggingface has democratized NLP. Huge thanks to Huggingface for this."
question = "What has Huggingface done ?"
encoding = tokenizer(question, text, return_tensors="pt")
input_ids = encoding["input_ids"]

# default is local attention everywhere
# the forward method will automatically set global attention on question tokens
attention_mask = encoding["attention_mask"]

start_scores, end_scores = model(input_ids, attention_mask=attention_mask)
all_tokens = tokenizer.convert_ids_to_tokens(input_ids[0].tolist())

answer_tokens = all_tokens[torch.argmax(start_scores) :torch.argmax(end_scores)+1]
answer = tokenizer.decode(tokenizer.convert_tokens_to_ids(answer_tokens))
# output => democratized NLP


  from .autonotebook import tqdm as notebook_tqdm
Downloading (…)okenizer_config.json: 100%|██████████| 26.0/26.0 [00:00<00:00, 21.2kB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 757/757 [00:00<?, ?B/s] 
Downloading (…)olve/main/vocab.json: 100%|██████████| 798k/798k [00:01<00:00, 789kB/s]
Downloading (…)olve/main/merges.txt: 100%|██████████| 456k/456k [00:00<00:00, 619kB/s]
Downloading (…)cial_tokens_map.json: 100%|██████████| 150/150 [00:00<00:00, 62.2kB/s]
Downloading pytorch_model.bin: 100%|██████████| 595M/595M [07:58<00:00, 1.24MB/s]


TypeError: argmax(): argument 'input' (position 1) must be Tensor, not str

In [11]:
import torch
from transformers import BertForQuestionAnswering
from transformers import BertTokenizer

#Model
model = BertForQuestionAnswering.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')

#Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-large-uncased-whole-word-masking-finetuned-squad')






Downloading (…)lve/main/config.json: 100%|██████████| 443/443 [00:00<00:00, 125kB/s]





[A[A[A[A[A

KeyboardInterrupt: 

In [4]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import cosine

def compare(sentence1, sentence2):
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings = model.encode([sentence1, sentence2])
    similarity_score1 = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    # similarity_score2 = 1 - cosine([embeddings[0]], [embeddings[1]])
    print(similarity_score1)

# Example usage
# sentence1 = 'Artificial Intelligence (AI) refers to the ability of machines to perform tasks that typically require human-like intelligence, such as learning, reasoning, problem-solving, perception, and natural language processing.'
# sentence1 = "Mount Everest is the highest peak in the world, located in the Himalayas. It stands at an elevation of approximately 8,848 meters (29,029 feet) above sea level. Climbing Mount Everest is a challenging and dangerous feat that requires extensive preparation, physical endurance, and mountaineering skills. Many climbers attempt to conquer Everest each year, braving extreme weather conditions and navigating treacherous terrain. The summit offers breathtaking views and a sense of accomplishment for those who reach the top."
sentence1 = "Machine Learning (ML) is a subfield of artificial intelligence that focuses on the development of algorithms and models that allow computers to learn and make predictions or decisions without being explicitly programmed. It involves the study of statistical models and algorithms that enable systems to automatically analyze and interpret complex patterns and relationships in data.In machine learning, computers are trained on large datasets, where they learn from examples and experience to recognize patterns and make informed decisions or predictions. The process involves extracting meaningful features from the data, selecting appropriate algorithms or models, and optimizing them based on the training data. The trained models can then be used to make predictions or take actions on new, unseen data."
# sentence2 = "Artificial intelligence is a branch of computer science and engineering that focuses on developing intelligent machines capable of performing quantitative tasks that are traditionally associated with human beings. Artificial intelligence (AI) refers to the development of computer systems that possess the ability to perform tasks that typically require human intelligence. It encompasses a wide range of techniques and methodologies aimed at creating intelligent machines capable of learning, reasoning, problem-solving, perceiving, and processing natural language. AI systems are designed to emulate human-like cognitive abilities and make autonomous decisions or take actions based on data and patterns. By leveraging algorithms, statistical models, and large datasets, AI enables machines to recognize patterns, make predictions, adapt to changing conditions, and interact with humans in a natural and intelligent manner. AI has applications in various domains, including robotics, healthcare, finance, transportation, and many others, where it has the potential to revolutionize industries, improve efficiency, and enhance decision-making processes."
# sentence1 = "Artificial intelligence (AI) is a branch of computer science that focuses on developing intelligent machines capable of performing tasks that typically require human intelligence. It involves the study of algorithms and models that enable computers to learn, reason, problem-solve, and understand natural language. AI has various applications in fields like robotics, healthcare, finance, and transportation, where it aims to revolutionize industries and improve efficiency."
sentence2 = "Machine learning (ML) is a subset of AI that deals with the development of algorithms and models that enable computers to learn and make predictions based on data. It involves training computers on large datasets to recognize patterns and make informed decisions without being explicitly programmed. ML has applications in areas like image and speech recognition, recommendation systems, and fraud detection, where it plays a vital role in automating tasks and extracting meaningful insights from data."
compare(sentence1, sentence2)
# print("Similarity score:", similarity_score1,similarity_score2)


0.9630601


In [31]:
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean

def compare(sentence1, sentence2):
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    embeddings = model.encode([sentence1, sentence2])
    
    # Calculate Euclidean distance
    distance = euclidean(embeddings[0], embeddings[1])

    # Calculate similarity score
    similarity_score = 1 / (1 + distance)  # Inverse of the distance

    # Alternatively, you can normalize the distance between 0 and 1
    # using min-max normalization
    min_distance = 0.0  # Minimum possible distance
    max_distance = 10.0  # Maximum possible distance

    normalized_distance = (distance - min_distance) / (max_distance - min_distance)

    # Transform the normalized distance to a similarity score between 0 and 100
    similarity_score = 100 * (1 - normalized_distance)
    
    return similarity_score*3.3


# Example usage
# sentence1 = 'Artificial Intelligence (AI) refers to the ability of machines to perform tasks that typically require human-like intelligence, such as learning, reasoning, problem-solving, perception, and natural language processing.'
# sentence2 = "Mount Everest is the highest peak in the world, located in the Himalayas. It stands at an elevation of approximately 8,848 meters (29,029 feet) above sea level. Climbing Mount Everest is a challenging and dangerous feat that requires extensive preparation, physical endurance, and mountaineering skills. Many climbers attempt to conquer Everest each year, braving extreme weather conditions and navigating treacherous terrain. The summit offers breathtaking views and a sense of accomplishment for those who reach the top."
sentence2 = "Artificial intelligence is a branch of computer science and engineering that focuses on developing intelligent machines capable of performing quantitative tasks that are traditionally associated with human beings. Artificial intelligence (AI) refers to the development of computer systems that possess the ability to perform tasks that typically require human intelligence. It encompasses a wide range of techniques and methodologies aimed at creating intelligent machines capable of learning, reasoning, problem-solving, perceiving, and processing natural language. AI systems are designed to emulate human-like cognitive abilities and make autonomous decisions or take actions based on data and patterns. By leveraging algorithms, statistical models, and large datasets, AI enables machines to recognize patterns, make predictions, adapt to changing conditions, and interact with humans in a natural and intelligent manner. AI has applications in various domains, including robotics, healthcare, finance, transportation, and many others, where it has the potential to revolutionize industries, improve efficiency, and enhance decision-making processes."
sentence2 = "Machine learning (ML) is a subset of AI that deals with the development of algorithms and models that enable computers to learn and make predictions based on data. It involves training computers on large datasets to recognize patterns and make informed decisions without being explicitly programmed. ML has applications in areas like image and speech recognition, recommendation systems, and fraud detection, where it plays a vital role in automating tasks and extracting meaningful insights from data."

similarity= compare(sentence1, sentence2)
# print("Cosine Similarity:", similarity_cosine)
print("Euclidean Distance:", similarity if similarity>0 else abs(similarity)/10)


Euclidean Distance: 21.790569305419915


In [8]:
from scipy.spatial.distance import euclidean
from sentence_transformers import SentenceTransformer,util
from sklearn.metrics.pairwise import cosine_similarity
from time import sleep
import random
from math import ceil
def calculate_similarity(sentences_list,sentence):
    # Encode sentences and obtain embeddings
    """ Can even use 'bert-base-nli-stsb-mean-tokens' or 'bert-base-nli-max-tokens'"""
    model = SentenceTransformer('bert-base-nli-mean-tokens')        #180 97 -366 218 -124 -309
    # model = SentenceTransformer('bert-base-nli-stsb-mean-tokens') #177 79 -384 214 -108 -352
    # model = SentenceTransformer('bert-base-nli-max-tokens')       #149 65 -307 157 -109 -277  """ Reject this model """

    sentence_embeddings = model.encode([sentence])
    sentences_embeddings = model.encode(sentences_list)
    # print(sentence_embeddings)
    
    # # Calculate Cosine similarity
    # cos_similarity = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]
    # cosine_similarity_score = (ceil(101 * cos_similarity)) if cos_similarity > 0 else 0
    
    # # Calculate Euclidean distance
    # euclidean_distance = euclidean(embeddings[0], embeddings[1])
    # euclidean_similarity_score = ceil(calculate_similarity_score(abs(euclidean_distance)))
    

    # # Calculate cosine similarity
    # cos_similarity = util.pytorch_cos_sim(embeddings[0], embeddings[1])
    # cosine_similarity_score2 = ceil(cos_similarity.item() * 100) if  ceil(cos_similarity.item() * 100) > 0 else 0
    
    # print(f"Similarity Score = {euclidean_similarity_score} , {cosine_similarity_score} , {cosine_similarity_score2}")
    
    # weighted_similarity = ceil(euclidean_similarity_score * .70 + cosine_similarity_score * .15 + cosine_similarity_score2*.15)
    # print(f"Final similarity is {weighted_similarity}")
    similarity_score = []

    for embedding in sentences_embeddings:
        # Calculate cosine similarity
        cos_similarity = util.pytorch_cos_sim(sentence_embeddings[0], embedding)
        similarity_score_cos = ceil(cos_similarity.item() * 100) if  ceil(cos_similarity.item() * 100) > 0 else 0

        # Calculate Euclidean distance
        euclidean_distance = euclidean(sentence_embeddings[0], embedding)
        similarity_score_euc = ceil(calculate_similarity_score(abs(euclidean_distance)))
        
        cosi_similarity = cosine_similarity([sentence_embeddings[0]], [embedding])[0][0]
        cosine_similarity_score = (ceil(101 * cosi_similarity)) if cosi_similarity > 0 else 0

        # Combine the similarity scores (adjust the weights as desired)
        similarity_score.append(ceil(0.15 * similarity_score_cos + 0.70 * similarity_score_euc + 0.15 * cosine_similarity_score))

    print(similarity_score)
    return max(similarity_score)    
    

def calculate_similarity_score(distance):
    # Normalize the distance between 0 and 1
    min_distance = 0.0  # Minimum possible distance
    max_distance = 10.0  # Maximum possible distance

    normalized_distance = (distance - min_distance) / (max_distance - min_distance)

    # Transform the normalized distance to a similarity score between 0 and 100
    similarity = 100 * (1 - normalized_distance) * 3.3
    # similarity = 100 * (1 - normalized_distance)

    return similarity if similarity > 0 and similarity < 100 else random.randint(97,100) if similarity > 100 else 0
    # return similarity

# sentence1 = 'Artificial Intelligence (AI) refers to the ability of machines to perform tasks that typically require human-like intelligence, such as learning, reasoning, problem-solving, perception, and natural language processing.'
# sentence2 = "Artificial intelligence is a branch of computer science and engineering that focuses on developing intelligent machines capable of performing quantitative tasks that are traditionally associated with human beings. Artificial intelligence (AI) refers to the development of computer systems that possess the ability to perform tasks that typically require human intelligence. It encompasses a wide range of techniques and methodologies aimed at creating intelligent machines capable of learning, reasoning, problem-solving, perceiving, and processing natural language. AI systems are designed to emulate human-like cognitive abilities and make autonomous decisions or take actions based on data and patterns. By leveraging algorithms, statistical models, and large datasets, AI enables machines to recognize patterns, make predictions, adapt to changing conditions, and interact with humans in a natural and intelligent manner. AI has applications in various domains, including robotics, healthcare, finance, transportation, and many others, where it has the potential to revolutionize industries, improve efficiency, and enhance decision-making processes."
sentence1 = ["Machine learning (ML) is a subset of AI that deals with the development of algorithms and models that enable computers to learn and make predictions based on data. It involves training computers on large datasets to recognize patterns and make informed decisions without being explicitly programmed. ML has applications in areas like image and speech recognition, recommendation systems, and fraud detection, where it plays a vital role in automating tasks and extracting meaningful insights from data.",'Machine learning (ML) is a subset of artificial intelligence (AI) that focuses on the development of algorithms and models that enable computers to learn and make predictions based on data. It involves training machines on large datasets to recognize patterns, extract insights, and make informed decisions without being explicitly programmed.']
# sentence2 = "Mount Everest is the highest peak in the world, located in the Himalayas. It stands at an elevation of approximately 8,848 meters (29,029 feet) above sea level. Climbing Mount Everest is a challenging and dangerous feat that requires extensive preparation, physical endurance, and mountaineering skills. Many climbers attempt to conquer Everest each year, braving extreme weather conditions and navigating treacherous terrain. The summit offers breathtaking views and a sense of accomplishment for those who reach the top."
# sentence2 = ""
sentence2 = "Machine Learning has numerous applications across various domains, including image and speech recognition, natural language processing, recommendation systems, and fraud detection. It has revolutionized industries and transformed the way businesses operate. With ML, companies can automate tasks, improve efficiency, and gain valuable insights from their data."
# sentence2 = 'Machine learning (ML) is a subset of artificial intelligence (AI) that focuses on the development of algorithms and models that enable computers to learn and make predictions based on data. It involves training machines on large datasets to recognize patterns, extract insights, and make informed decisions without being explicitly programmed.'
# sentence1 = "My name is Rahul"
# sentence2 = "Rahul is not my name"
calculate_similarity(sentence1, sentence2)
# """Succesfull test"""


[[-4.80075985e-01  7.46048987e-01  8.99167001e-01 -1.11929864e-01
   9.99872833e-02 -6.65144801e-01 -1.31204531e-01  2.94809908e-01
   3.32077533e-01 -2.26748407e-01  1.74631655e-01  6.98763072e-01
   1.07620746e-01  7.90148914e-01 -6.92231119e-01  9.93288085e-02
  -8.81605029e-01 -4.83663261e-01  1.23142436e-01  2.55781144e-01
  -4.30965841e-01 -6.84490204e-01  2.96399087e-01  6.50981009e-01
  -1.38950218e-02  2.50701725e-01 -6.77251160e-01  3.52156669e-01
  -9.63392496e-01  1.99173778e-01 -8.31316948e-01 -1.62037894e-01
   1.04588248e-01 -8.75517011e-01 -7.75868475e-01  9.92624164e-01
   5.93554676e-01  6.52876124e-02 -9.68944728e-02  2.71096349e-01
  -5.14553905e-01 -7.82622844e-02  2.01307952e-01  2.91496664e-01
  -1.42707038e+00 -5.10521650e-01 -1.38222396e+00  4.41120327e-01
   1.43147483e-01  1.84224322e-01 -3.64681035e-01  4.95001674e-01
  -3.44197005e-01 -6.68913126e-01 -2.99958169e-01  6.79977834e-01
   1.20468192e-01 -1.71773791e+00 -1.20194882e-01 -1.01770051e-01
  -4.30761

22

In [2]:
from transformers import BertTokenizer, BertModel
import torch

def calculate_similarity(sentence1, sentence2):
    # Load pre-trained BERT model and tokenizer
    model_name = 'bert-base-uncased'
    tokenizer = BertTokenizer.from_pretrained(model_name)
    model = BertModel.from_pretrained(model_name)

    # Tokenize the sentences
    tokens1 = tokenizer.tokenize(sentence1)
    tokens2 = tokenizer.tokenize(sentence2)

    # Add special tokens and obtain input IDs
    encoded_input = tokenizer.encode_plus(tokens1, tokens2, add_special_tokens=True, padding='longest', truncation=True)
    input_ids = torch.tensor(encoded_input['input_ids']).unsqueeze(0)  # Add batch dimension

    # Forward pass through the BERT model
    with torch.no_grad():
        outputs = model(input_ids)
        embeddings = outputs.last_hidden_state[:, 0, :]  # Extract the embedding for the [CLS] token

    # Calculate cosine similarity between the embeddings
    similarity_score = torch.nn.functional.cosine_similarity(embeddings, embeddings, dim=1)

    return similarity_score.item()


  from .autonotebook import tqdm as notebook_tqdm


In [6]:
sentence1 = 'Artificial Intelligence (AI) refers to the ability of machines to perform tasks that typically require human-like intelligence, such as learning, reasoning, problem-solving, perception, and natural language processing.'
# sentence2 = "Artificial intelligence is a branch of computer science and engineering that focuses on developing intelligent machines capable of performing quantitative tasks that are traditionally associated with human beings. Artificial intelligence (AI) refers to the development of computer systems that possess the ability to perform tasks that typically require human intelligence. It encompasses a wide range of techniques and methodologies aimed at creating intelligent machines capable of learning, reasoning, problem-solving, perceiving, and processing natural language. AI systems are designed to emulate human-like cognitive abilities and make autonomous decisions or take actions based on data and patterns. By leveraging algorithms, statistical models, and large datasets, AI enables machines to recognize patterns, make predictions, adapt to changing conditions, and interact with humans in a natural and intelligent manner. AI has applications in various domains, including robotics, healthcare, finance, transportation, and many others, where it has the potential to revolutionize industries, improve efficiency, and enhance decision-making processes."
# sentence2 = "Machine learning (ML) is a subset of AI that deals with the development of algorithms and models that enable computers to learn and make predictions based on data. It involves training computers on large datasets to recognize patterns and make informed decisions without being explicitly programmed. ML has applications in areas like image and speech recognition, recommendation systems, and fraud detection, where it plays a vital role in automating tasks and extracting meaningful insights from data."
sentence2 = "Mount Everest is the highest peak in the world, located in the Himalayas. It stands at an elevation of approximately 8,848 meters (29,029 feet) above sea level. Climbing Mount Everest is a challenging and dangerous feat that requires extensive preparation, physical endurance, and mountaineering skills. Many climbers attempt to conquer Everest each year, braving extreme weather conditions and navigating treacherous terrain. The summit offers breathtaking views and a sense of accomplishment for those who reach the top."
# sentence2 = ""
calculate_similarity(sentence1,sentence2)

# """Failure"""

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


0.9999998211860657

In [36]:
import tensorflow_hub as hub
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

def calculate_similarity(sentence1, sentence2):
    # Load the Universal Sentence Encoder
    use_model = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

    # Encode the sentences
    embeddings = use_model([sentence1, sentence2])

    # Calculate cosine similarity score
    similarity_score = cosine_similarity([embeddings[0]], [embeddings[1]])[0][0]

    return similarity_score

# Example usage+-
# sentence1 = 'Artificial Intelligence (AI) refers to the ability of machines to perform tasks that typically require human-like intelligence, such as learning, reasoning, problem-solving, perception, and natural language processing.'
# sentence2 = "Machine learning (ML) is a subset of AI that deals with the development of algorithms and models that enable computers to learn and make predictions based on data. It involves training computers on large datasets to recognize patterns and make informed decisions without being explicitly programmed. ML has applications in areas like image and speech recognition, recommendation systems, and fraud detection, where it plays a vital role in automating tasks and extracting meaningful insights from data."
sentence1 = "My name is Rahul"
sentence2 = "Rahul is not my name"
similarity_score = calculate_similarity(sentence1, sentence2)
print(similarity_score)

"""Failure"""


0.8055061


In [None]:
from scipy.spatial.distance import euclidean
from sentence_transformers import SentenceTransformer
import numpy as np

def calculate_similarity(sentence, sentences_list):
    # Encode sentences and obtain embeddings
    model = SentenceTransformer('bert-base-nli-mean-tokens')
    sentence_embeddings = model.encode([sentence])
    sentences_embeddings = model.encode(sentences_list)

    max_similarity_score = 0

    for embedding in sentences_embeddings:
        # Calculate cosine similarity
        cos_similarity = util.pytorch_cos_sim(sentence_embeddings[0], embedding)
        similarity_score_cos = cos_similarity.item()

        # Calculate Euclidean distance
        euclidean_distance = euclidean(sentence_embeddings[0], embedding)
        similarity_score_euc = 1 / (1 + euclidean_distance)

        # Combine the similarity scores (adjust the weights as desired)
        similarity_score = 0.7 * similarity_score_cos + 0.3 * similarity_score_euc

        if similarity_score > max_similarity_score:
            max_similarity_score = similarity_score

    return max_similarity_score


In [4]:
import sys

sys.path.append(r"D:\Projects and codes\interview\resources")

from NLP_Transformer import SimilarityCalculator

obj = SimilarityCalculator()

sentence1 = ['Machine learning is a branch of artificial intelligence that involves developing algorithms and models that can learn from data and make predictions or decisions without being explicitly programmed. Machine learning algorithms use mathematical and statistical techniques to analyze and find patterns within large datasets. There are three main types of machine learning: supervised, unsupervised, and reinforcement learning.', 'Machine learning is the process of teaching computers to learn from data, without being explicitly programmed. By analyzing and finding patterns in large datasets, machine learning algorithms can be used to make predictions or decisions in a variety of applications.', 'At its core, machine learning involves using statistical and mathematical techniques to enable computers to learn and make decisions based on data. This has a wide range of practical applications, from speech recognition and image analysis to fraud detection and personalized recommendations.', 'Machine learning is a subset of artificial intelligence that uses mathematical and statistical techniques to analyze data and recognize patterns, enabling computers to make decisions and predictions without being explicitly programmed. It has practical applications in various fields such as image recognition, natural language processing, fraud detection, and recommendation systems.']
sentence2 = 'Machine learning is the process of teaching computers to learn from data, without being explicitly programmed. By analyzing and finding patterns in large datasets, machine learning algorithms can be used to make predictions or decisions in a variety of applications.'
score = obj.calculate_similarity(sentence1,sentence2)

print(score)

99


In [19]:
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from scipy.spatial.distance import euclidean
from math import ceil

class Nlp_trans_SimCalc:
    def __init__(self, model_name='bert-base-nli-mean-tokens'):
        self.model = SentenceTransformer(model_name)

    def calculate_similarity(self, sentences_list, sentence):
        sentence_embeddings = self.model.encode([sentence])
        sentences_embeddings = self.model.encode(sentences_list)

        similarity_scores = []

        cos_similarities = util.pytorch_cos_sim(sentence_embeddings, sentences_embeddings).squeeze()
        cosine_similarity_scores = [ceil(similarity.item() * 100) if similarity.item() > 0 else 0 for similarity in cos_similarities]

        euclidean_distances = [euclidean(sentence_embeddings[0], embedding) for embedding in sentences_embeddings]
        euclidean_similarity_scores = [self.calculate_similarity_score(abs(distance)) for distance in euclidean_distances]

        for cosine_score, euclidean_score in zip(cosine_similarity_scores, euclidean_similarity_scores):
            similarity_scores.append(ceil(0.15 * cosine_score + 0.70 * euclidean_score + 0.15 * cosine_score))

        return max(similarity_scores)

    @staticmethod
    def calculate_similarity_score(distance):
        min_distance = 0.0  # Minimum possible distance
        max_distance = 10.0  # Maximum possible distance

        normalized_distance = (distance - min_distance) / (max_distance - min_distance)
        similarity = 100 * (1 - normalized_distance) * 3.3

        return similarity if 0 < similarity < 100 else 100 if similarity > 100 else 0

        
        
# Create an instance of the Nlp_eng_SimCalc class
sim_calc = Nlp_trans_SimCalc()

# Sample test cases
sentence1 = ["The cat is sitting on the mat.", "I like to play tennis.", "The sky is blue."]
sentence2 = "The cat is on the mat."
score = sim_calc.calculate_similarity(sentence1, sentence2)
print("Score:", score)

sentence1 = ["The cat is sitting on the mat.", "I like to play tennis.", "The sky is blue."]
sentence2 = "I enjoy playing tennis."
score = sim_calc.calculate_similarity(sentence1, sentence2)
print("Score:", score)

sentence1 = ["The cat is sitting on the mat.", "I like to play tennis.", "The sky is blue."]
sentence2 = "The sky is red."
score = sim_calc.calculate_similarity(sentence1, sentence2)
print("Score:", score)


Score: 100
Score: 100
Score: 8


In [33]:
import warnings
import whisper
import torch
from functools import lru_cache


class SpeechRecognizer:
    def __init__(self, model_name="base"):
        self.model_name = model_name
        self.device = "cpu"
        self.model = self.load_model()

        # Set up warnings filter
        warnings.filterwarnings("ignore", category=UserWarning)

    def banner(self, text):
        print(f"# {text} #\n")

    @lru_cache(maxsize=None)
    def load_model(self):
        return whisper.load_model(self.model_name, device=self.device)

    def convert_text(self, audio_file):
        self.banner("Transcribing texts")
        try:
            result = self.model.transcribe(audio_file)
            return result["text"]
        except Exception as e:
            print(f"Error occurred during audio file conversion: {e}")
            return None

In [31]:
from functools import lru_cache
import warnings
import whisper
import torch



class SpeechRecognizer:
    
    def __init__(self, model_name="base"):
        self.model_name = model_name
        self.device = "cpu"
        self.model = self.load_model()

    def banner(self, text):
        print(f"# {text} #\n")

    @lru_cache(maxsize=None)
    def load_model(self):
        return whisper.load_model(self.model_name, device=self.device)

    def convert_text(self, audio_file):
        warnings.filterwarnings("ignore", category=UserWarning)
        self.banner("Transcribing texts")
        result = self.model.transcribe(audio_file)
        warnings.resetwarnings()
        return result["text"]


In [35]:
def decorator_func(func):
    def wrapper():
        print("Before function execution")
        func()
        print("After function execution")
    return wrapper

@decorator_func
def hello():
    print("Hello, world!")

hello()


Before function execution
Hello, world!
After function execution


In [36]:
# Specify the path to your JSON file
json_file_path = r'D:\Projects and codes\interview\resources\extinsion_interview\test\result.json'

# Open the JSON file in write mode and truncate its content
with open(json_file_path, 'w') as f:
    f.truncate(0)


In [29]:
from transformers import pipeline
from time import sleep
import pymongo

class SentimentAnalyzer:
    def __init__(self):
        self.classifier = pipeline("sentiment-analysis", model="distilbert-base-uncased-finetuned-sst-2-english", revision="af0f99b")


    def analyze_sentiment(self, text):
        result = self.classifier(text, truncation=True)[0]
        sentiment = result['label']
        score = result['score']
        positive_prob = score if sentiment == 'POSITIVE' else 1 - score
        negative_prob = 1 - positive_prob

        return sentiment, positive_prob, negative_prob\
            
    def push_to_Db(self,data):
        
        try:
            # Establish a connection to the MongoDB server
            client = pymongo.MongoClient("mongodb+srv://webinterview:12345@cluster0.unj3vql.mongodb.net/?retryWrites=true&w=majority")

            # Access the desired database
            db = client["main"]

            # Access the desired collection
            collection = db["Review"]

            # Insert the data into the collection
            collection.insert_one(data)

            print("Data successfully pushed to MongoDB.")
        except Exception as error:
            print("Error while connecting to MongoDB:", error)
    
def main():
    
    analyzer = SentimentAnalyzer()
    
    file_path = r"D:\Projects and codes\interview\resources\extinsion_interview\test\review.txt"
    while True:
        try:
            with open(file_path, 'r') as file:
                text = file.read()
                if text != "":
                    print("Got the text")
                    break
                else:
                    print("Waiting for text")
                    sleep(5)
                    
        except FileNotFoundError:
            print(f"File '{file_path}' not found.")
            sleep(5)
            
        
    sentiment, positive_prob, negative_prob = analyzer.analyze_sentiment(text)

    data = {
        "sentiment":sentiment,
        "positive_prob":positive_prob,
        "negative_prob":negative_prob,
        "text":text
    }
    
    print("Review:", sentiment)
    # print("Positive probability:", positive_prob)
    # print("Negative probability:", negative_prob)
    analyzer.push_to_Db(data)
    
if __name__=="__main__":
    main()

Got the text
Review: POSITIVE
Data successfully pushed to MongoDB.


In [16]:
analyzer = SentimentAnalyzer()
text = "However, one aspect where XYZ Software falls short is its customer support. While the software itself is exceptional, the response time and assistance from the support team have been subpar. There have been instances where queries took longer than expected to be addressed, which can be frustrating when encountering critical issues. Improving the responsiveness and effectiveness of customer support would greatly enhance the overall user experience."
sentiment, positive_prob, negative_prob = analyzer.analyze_sentiment(text)

print("Sentiment:", sentiment)
print("Positive probability:", positive_prob)
print("Negative probability:", negative_prob)

Sentiment: NEGATIVE
Positive probability: 0.05779218673706055
Negative probability: 0.9422078132629395


In [31]:
import requests
import json
import urllib.parse

def generate_answers(question, api_key="AIzaSyAni60XCniMwfWiU3ZVHWX2TCFSgDf3N9M", cx="87e385fa22560468e"):

    # Create the search request.
    params = {
        'key': api_key,
        'cx': cx,
        'q': question,
        'num': 10,
        'safe': 'off',
    }
    url = 'https://www.googleapis.com/customsearch/v1?' + urllib.parse.urlencode(params)

    # Make the search request.
    response = requests.get(url)

    # Check the response status code.
    if response.status_code != 200:
        raise Exception('Error searching Google: {}'.format(response.status_code))

    # Parse the response JSON.
    results = json.loads(response.content)

    # Extract and categorize the answers.
    categorized_answers = {
        'detailed': [],
        'brief': [],
        'one_line': []
    }
    for result in results['items']:
        answer = result['snippet']
        if len(answer.split()) >= 20:
            categorized_answers['detailed'].append(answer)
        elif len(answer.split()) >= 5:
            categorized_answers['brief'].append(answer)
        else:
            categorized_answers['one_line'].append(answer)

    return categorized_answers


# Example usage
question = "What is the capital of France?"
api_key = "AIzaSyAni60XCniMwfWiU3ZVHWX2TCFSgDf3N9M"
cx = "87e385fa22560468e"
answers = generate_answers(question)
print(answers)
# # Print the retrieved answers
# print("Detailed answers:")
# for answer in answers['detailed']:
#     print("- ", answer)

# print("\nBrief answers:")
# for answer in answers['brief']:
#     print("- ", answer)

# print("\nOne-line answers:")
# for answer in answers['one_line']:
#     print("- ", answer)

{'detailed': ["As the capital of France, Paris is the seat of France's national government. For the executive, the two chief officers each have their own official\xa0...", 'Paris is the capital of France, the largest country of Europe with 550 000 km2 (65 millions inhabitants). Paris has 2.234 million inhabitants end 2011.', "The capital and by far the most important city of France is Paris, one of the world's preeminent cultural and commercial centres.", 'List of capitals of France · Bordeaux (September 1914) The French government was relocated from Paris to Bordeaux very briefly during World War I, when it was\xa0...', 'Paris, city and capital of France, situated in the north-central part of the country. People were living on the site of the present-day city, located along\xa0...', "Oct 4, 2013 ... THIS YEAR MARSEILLE is a European Capital of Culture, so new museums have opened, the streets have been spruced up for tourists, and there's a\xa0...", 'Paris is the capital and most popul

In [1]:
from transformers import pipeline

# Load the question-answering model
nlp = pipeline("question-answering", model="distilbert-base-uncased-distilled-squad")

# Example questions
questions = [
    "What is supervised learning?",
    "What are decision trees and how are they used in machine learning?",
    "What is clustering and how is it used in machine learning?",
    "What is an operating system and what are its main functions?",
    "What is a training set and its importance in machine learning?",
    "What is a test set and its role in evaluating machine learning models?",
]

# Get answers for each question
answers = []
for question in questions:
    answer = nlp(question=question, context=text)
    answers.append(answer)

# Print the answers
for i, answer in enumerate(answers):
    print(f"Question: {questions[i]}")
    print(f"Answer: {answer['answer']}")
    print(f"Score: {answer['score']}")
    print()


  from .autonotebook import tqdm as notebook_tqdm


NameError: name 'text' is not defined

In [45]:
import pymongo

# Connect to MongoDB
client = pymongo.MongoClient("mongodb+srv://webinterview:12345@cluster0.unj3vql.mongodb.net/?retryWrites=true&w=majority")
db = client["main"]
collection = db["questions"]

# Read data from text file
# Read data from text file
with open(r"D:\Projects and codes\interview\resources\extinsion_interview\test\review.txt", "r") as file:
    lines = file.readlines()
# Process each line and insert into the database
# Process each question and answers and insert into the database
for i in range(0, len(lines), 6):
    question = lines[i].strip().strip('"')
    answer1 = lines[i+1].strip().split(": ")[1].strip('"')
    answer2 = lines[i+2].strip().split(": ")[1].strip('"')
    answer3 = lines[i+3].strip().split(": ")[1].strip('"')
    answer4 = lines[i+4].strip().split(": ")[1].strip('"')
    answer5 = lines[i+5].strip().split(": ")[1].strip('"')

    document = {
        "question": question,
        "answer1": answer1,
        "answer2": answer2,
        "answer3": answer3,
        "answer4": answer4,
        "answer5": answer5
    }

    # Insert the document into the database
    collection.insert_one(document)

# Close the database connection
client.close()
