<a href="https://colab.research.google.com/github/Saketkr06/NLP/blob/main/Practical_NLP_1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [61]:
documents=["Dog bites man.", "Man bites dog.", "Dog eats meat.", "Man eats food."]
processed_docs=[doc.lower().replace('.','') for doc in documents]
processed_docs

['dog bites man', 'man bites dog', 'dog eats meat', 'man eats food']

In [62]:
vocab={}
count=0
for doc in processed_docs:
  for word in doc.split():
    if word not in vocab:
      count+=1
      vocab[word]=count

print(vocab)

{'dog': 1, 'bites': 2, 'man': 3, 'eats': 4, 'meat': 5, 'food': 6}


In [63]:
def get_onehot_vector(somestring):
  onehot_encoded=[]
  for word in somestring.split():
    temp=[0]*len(vocab)
    if word in vocab:
      temp[vocab[word]-1]=1

    onehot_encoded.append(temp)

  return onehot_encoded

In [64]:
processed_docs[1]

'man bites dog'

In [65]:
get_onehot_vector(processed_docs[1])

[[0, 0, 1, 0, 0, 0], [0, 1, 0, 0, 0, 0], [1, 0, 0, 0, 0, 0]]

In [66]:
S1 = 'dog bites man'
S2 = 'man bites dog'
S3 = 'dog eats meat'
S4 = 'man eats food'

In [67]:
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
data = [S1.split(), S2.split(), S3.split(), S4.split()]
values = data[0]+data[1]+data[2]+data[3]
print("The data: ",values)

The data:  ['dog', 'bites', 'man', 'man', 'bites', 'dog', 'dog', 'eats', 'meat', 'man', 'eats', 'food']


In [68]:
label_encoder=LabelEncoder()
integer_encoded=label_encoder.fit_transform(values)
integer_encoded

array([1, 0, 4, 4, 0, 1, 1, 2, 5, 4, 2, 3])

In [69]:
#One-Hot Encoding
onehot_encoder = OneHotEncoder()
onehot_encoded = onehot_encoder.fit_transform(data).toarray()
print("Onehot Encoded Matrix:\n",onehot_encoded)

Onehot Encoded Matrix:
 [[1. 0. 1. 0. 0. 0. 1. 0.]
 [0. 1. 1. 0. 1. 0. 0. 0.]
 [1. 0. 0. 1. 0. 0. 0. 1.]
 [0. 1. 0. 1. 0. 1. 0. 0.]]


## Bag of Words

In [70]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect=CountVectorizer()
bow_rep=count_vect.fit_transform(processed_docs)

print('Our Vocabulary',count_vect.vocabulary_)
print( "BoW representation for 'dog bites man': ",bow_rep[0].toarray())
print("BoW representation for 'man bites dog: ",bow_rep[1].toarray())

#Get the representation using this vocabulary, for a new text
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Our Vocabulary {'dog': 1, 'bites': 0, 'man': 4, 'eats': 2, 'meat': 5, 'food': 3}
BoW representation for 'dog bites man':  [[1 1 0 0 1 0]]
BoW representation for 'man bites dog:  [[1 1 0 0 1 0]]
Bow representation for 'dog and dog are friends': [[0 2 0 0 0 0]]


In [71]:
#BoW with binary vectors
count_vect = CountVectorizer(binary=True)
count_vect.fit(processed_docs)
temp = count_vect.transform(["dog and dog are friends"])
print("Bow representation for 'dog and dog are friends':", temp.toarray())

Bow representation for 'dog and dog are friends': [[0 1 0 0 0 0]]


## BAG of N-Grams

In [72]:
from sklearn.feature_extraction.text import CountVectorizer

count_vect=CountVectorizer(ngram_range=(1,3))
#Build a BOW representation for the corpus
bow_rep = count_vect.fit_transform(processed_docs)

#Look at the vocabulary mapping
print("Our vocabulary: ", count_vect.vocabulary_)

#see the BOW rep for first 2 documents
print("BoW representation for 'dog bites man': ", bow_rep[0].toarray())
print("BoW representation for 'man bites dog: ",bow_rep[1].toarray())

#Get the representation using this vocabulary, for a new text
temp = count_vect.transform(["dog and dog are friends"])

print("Bow representation for 'dog and dog are friends':", temp.toarray())

Our vocabulary:  {'dog': 3, 'bites': 0, 'man': 12, 'dog bites': 4, 'bites man': 2, 'dog bites man': 5, 'man bites': 13, 'bites dog': 1, 'man bites dog': 14, 'eats': 8, 'meat': 17, 'dog eats': 6, 'eats meat': 10, 'dog eats meat': 7, 'food': 11, 'man eats': 15, 'eats food': 9, 'man eats food': 16}
BoW representation for 'dog bites man':  [[1 0 1 1 1 1 0 0 0 0 0 0 1 0 0 0 0 0]]
BoW representation for 'man bites dog:  [[1 1 0 1 0 0 0 0 0 0 0 0 1 1 1 0 0 0]]
Bow representation for 'dog and dog are friends': [[0 0 0 2 0 0 0 0 0 0 0 0 0 0 0 0 0 0]]


In [73]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer()
bow_rep_tfidf=tfidf.fit_transform(processed_docs)
#IDF for all words in the vocabulary
print("IDF for all words in the vocabulary",tfidf.idf_)
print("-"*10)
#All words in the vocabulary.
print("All words in the vocabulary",tfidf.get_feature_names_out)
print("-"*10)

#TFIDF representation for all documents in our corpus
print("TFIDF representation for all documents in our corpus\n",bow_rep_tfidf.toarray())
print("-"*10)

temp = tfidf.transform(["dog and man are friends"])
print("Tfidf representation for 'dog and man are friends':\n", temp.toarray())

IDF for all words in the vocabulary [1.51082562 1.22314355 1.51082562 1.91629073 1.22314355 1.91629073]
----------
All words in the vocabulary <bound method CountVectorizer.get_feature_names_out of TfidfVectorizer()>
----------
TFIDF representation for all documents in our corpus
 [[0.65782931 0.53256952 0.         0.         0.53256952 0.        ]
 [0.65782931 0.53256952 0.         0.         0.53256952 0.        ]
 [0.         0.44809973 0.55349232 0.         0.         0.70203482]
 [0.         0.         0.55349232 0.70203482 0.44809973 0.        ]]
----------
Tfidf representation for 'dog and man are friends':
 [[0.         0.70710678 0.         0.         0.70710678 0.        ]]


In [74]:
import requests

# Replace 'file_id' with the actual file ID from your Google Drive link
file_id = '0B7XkCwpI5KDYNlNUTTlSS21pQmM'

# URL to download the file
url = f'https://drive.google.com/uc?id={file_id}'

# Path to save the downloaded file
downloaded_file_path = '/content/word2vec.bin.gz'

# Download the file
response = requests.get(url)
with open(downloaded_file_path, 'wb') as f:
    f.write(response.content)

print("File downloaded successfully.")


File downloaded successfully.


In [75]:

# installing the Kaggle library
!pip install kaggle



In [76]:
# configuring the path of Kaggle.json file
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json


In [77]:
!kaggle datasets download -d leadbest/googlenewsvectorsnegative300

googlenewsvectorsnegative300.zip: Skipping, found more recently modified local copy (use --force to force download)


In [78]:
# extracting the compressed dataset
from zipfile import ZipFile

dataset = '/content/googlenewsvectorsnegative300.zip'

with ZipFile(dataset, 'r') as zip:
  zip.extractall()
  print('The dataset is extracted')

The dataset is extracted


In [79]:
import warnings #This module ignores the various types of warnings generated
warnings.filterwarnings("ignore")

import psutil #This module helps in retrieving information on running processes and system resource utilization
process = psutil.Process(os.getpid())
from psutil import virtual_memory
mem = virtual_memory()

import time #This module is used to calculate the time

In [None]:
from gensim.models import Word2Vec,KeyedVectors
gen_vec_path='/content/GoogleNews-vectors-negative300.bin'
pretrainedpath=gen_vec_path
pre=process.memory_info().rss

print("Memory used in GB before Loading the Model: %0.2f"%float(pre/(10**9))) #Check memory usage before loading the model
print('-'*10)

start_time = time.time() #Start the timer
ttl = mem.total #Toal memory available

w2v_model=KeyedVectors.load_word2vec_format(pretrainedpath,binary=True)
print("%0.2f seconds taken to load"%float(time.time() - start_time)) #Calculate the total time elapsed since starting the timer
print('-'*10)

print('Finished loading Word2Vec')
print('-'*10)

post = process.memory_info().rss
print("Memory used in GB after Loading the Model: {:.2f}".format(float(post/(10**9)))) #Calculate the memory used after loading the model
print('-'*10)

print("Percentage increase in memory usage: {:.2f}% ".format(float((post/pre)*100))) #Percentage increase in memory after loading the model
print('-'*10)

 #Number of words in the vocabulary

Memory used in GB before Loading the Model: 9.07
----------


In [None]:
print("Numver of words in vocablulary: ",len(w2v_model.key_to_index))

In [None]:
w2v_model.most_similar('beautiful')

In [None]:
#Let us try with another word!
w2v_model.most_similar('toronto')

In [None]:
#What is the vector representation for a word?
w2v_model['computer']

In [None]:
!python -m spacy download en_core_web_md

In [None]:
import spacy
nlp=spacy.load('en_core_web_md')
mydoc=nlp('India is most populated country')
print(mydoc.vector)

In [None]:
from gensim.models import Word2Vec
import warnings
warnings.filterwarnings('ignore')

In [None]:
# define training data
#Genism word2vec requires that a format of ‘list of lists’ be provided for training where every document contained in a list.
#Every list contains lists of tokens of that document.
corpus = [['dog','bites','man'], ["man", "bites" ,"dog"],["dog","eats","meat"],["man", "eats","food"]]

In [None]:
model_cbow=Word2Vec(corpus,min_count=1,sg=0)
model_skipgram=Word2Vec(corpus,min_count=1,sg=1)

In [None]:
#Summarize the loaded model
print(model_cbow)

#Summarize vocabulary
words = list(model_cbow.wv.index_to_key)
print(words)

#Acess vector for one word
print(model_cbow.wv['dog'])

In [None]:
#Compute similarity
print("Similarity between eats and bites:",model_cbow.similarity('eats', 'bites'))
print("Similarity between eats and man:",model_cbow.similarity('eats', 'man'))

In [None]:
#Most similarity
model_cbow.wv.most_similar('meat')

In [None]:
# save model
model_cbow.save('model_cbow.bin')

# load model
new_model_cbow = Word2Vec.load('model_cbow.bin')
print(new_model_cbow)

In [None]:
#Summarize the loaded model
print(model_skipgram)

#Summarize vocabulary
words = list(model_skipgram.wv.index_to_key)
print(words)

#Acess vector for one word
print(model_skipgram.wv['dog'])

In [None]:
import os
import requests

os.makedirs('data/en', exist_ok= True)
file_name = "data/en/enwiki-latest-pages-articles-multistream14.xml-p13159683p14324602.bz2"
file_id = "11804g0GcWnBIVDahjo5fQyc05nQLXGwF"

def download_file_from_google_drive(id, destination):
    URL = "https://docs.google.com/uc?export=download"

    session = requests.Session()

    response = session.get(URL, params = { 'id' : id }, stream = True)
    token = get_confirm_token(response)

    if token:
        params = { 'id' : id, 'confirm' : token }
        response = session.get(URL, params = params, stream = True)

    save_response_content(response, destination)

def get_confirm_token(response):
    for key, value in response.cookies.items():
        if key.startswith('download_warning'):
            return value

    return None

def save_response_content(response, destination):
    CHUNK_SIZE = 32768

    with open(destination, "wb") as f:
        for chunk in response.iter_content(CHUNK_SIZE):
            if chunk: # filter out keep-alive new chunks
                f.write(chunk)

if not os.path.exists(file_name):
    download_file_from_google_drive(file_id, file_name)
else:
    print("file already exists, skipping download")

print(f"File at: {file_name}")

In [None]:
from gensim.corpora.wikicorpus import WikiCorpus
from gensim.models.word2vec import Word2Vec
from gensim.models.fasttext import FastText
import time

In [60]:
wiki=WikiCorpus(file_name,dictionary={})
sentences=list(wiki.get_texts())

Process InputQueue-2:
Traceback (most recent call last):
  File "/usr/lib/python3.10/multiprocessing/process.py", line 314, in _bootstrap
    self.run()
  File "/usr/local/lib/python3.10/dist-packages/gensim/utils.py", line 1292, in run
    wrapped_chunk = [list(chunk)]
  File "/usr/local/lib/python3.10/dist-packages/gensim/corpora/wikicorpus.py", line 682, in <genexpr>
    texts = (
  File "/usr/local/lib/python3.10/dist-packages/gensim/corpora/wikicorpus.py", line 412, in extract_pages
    elem = next(elems)
  File "/usr/local/lib/python3.10/dist-packages/gensim/corpora/wikicorpus.py", line 406, in <genexpr>
    elems = (elem for _, elem in iterparse(f, events=("end",)))
  File "/usr/lib/python3.10/xml/etree/ElementTree.py", line 1255, in iterator
    data = source.read(16 * 1024)
  File "/usr/lib/python3.10/bz2.py", line 164, in read
    return self._buffer.read(size)
  File "/usr/lib/python3.10/_compression.py", line 68, in readinto
    data = self.read(len(byte_view))
  File "/usr