In [99]:
import nltk
import spacy
import gensim
from gensim.models import Word2Vec,FastText
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, ne_chunk
from collections import Counter
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


In [82]:
import nltk

# Directory that holds the NLTK resources. Kept in one place so the search
# path and the download target cannot drift apart.
NLTK_DATA_DIR = "C:/Users/DELL8/AppData/Roaming/nltk_data"

# Register the directory only once: re-running this cell must not keep
# appending duplicate entries to NLTK's search path.
if NLTK_DATA_DIR not in nltk.data.path:
    nltk.data.path.append(NLTK_DATA_DIR)

# Last expression of the cell, so the download status is displayed.
nltk.download('punkt', download_dir=NLTK_DATA_DIR)


[nltk_data] Downloading package punkt to
[nltk_data]     C:/Users/DELL8/AppData/Roaming/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [83]:
# NLTK resources fetched by this notebook, downloaded in this order.
nltk_packages = [
    'punkt',
    'stopwords',
    'averaged_perceptron_tagger',
    'wordnet',
    'maxent_ne_chunker',
    'words',
]

download_status = [nltk.download(package) for package in nltk_packages]

# Show the status of the final download as the cell result.
download_status[-1]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package maxent_ne_chunker to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\DELL8\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-

True

#### READ DATA

In [86]:
# Define file path.
# Raw string: the Windows path contains "\w" and "\e", which are invalid
# escape sequences in a normal string literal (warning today, error in
# future Python versions). The resulting path value is unchanged.
file_path = r"D:\wiki file\enwiki-latest-abstract3.xml"

# Upper bound on the number of lines read from the file.
MAX_LINES = 10000

# Read at most MAX_LINES lines, stripping surrounding whitespace from each.
# zip() stops as soon as either the counter or the file is exhausted, so a
# file shorter than MAX_LINES no longer pads the list with empty strings
# (which is what repeated readline() calls past EOF produced).
with open(file_path, "r", encoding="utf-8") as file:
    text_data = [line.strip() for _, line in zip(range(MAX_LINES), file)]


#### Cleaning

In [87]:
import re

# `raw_text` was not assigned in any cell of this notebook, so this cell
# failed with NameError on a fresh kernel. Build it here from the lines
# loaded into `text_data`.
# NOTE(review): assumes the original `raw_text` was the space-joined lines,
# which matches how `text` is built elsewhere in the notebook; confirm.
raw_text = " ".join(text_data)

# 1) Replace anything that looks like a markup tag (<...>) with a space.
clean_text = re.sub(r"<.*?>", " ", raw_text)
# 2) Replace http/https URLs with a space.
clean_text = re.sub(r"https?://\S+", " ", clean_text)
# 3) Replace every character that is not an ASCII letter or whitespace
#    (digits, punctuation, accented letters, ...) with a space.
clean_text = re.sub(r"[^a-zA-Z\s]", " ", clean_text)




In [88]:
# Concatenate all loaded lines into one string, separated by single spaces.
# NOTE(review): `text` is not read by any later cell visible in this notebook.
text = str.join(" ", text_data)

In [89]:


# Preview the first 500 characters. The label says "Raw Text", but the value
# shown is `clean_text`, i.e. the text after the regex clean-up.
clean_preview = clean_text[:500]
print("Raw Text (First 500 chars):", clean_preview)


Raw Text (First 500 chars):      Wikipedia  Diego Maradona stadium       Diego Maradona stadium can refer to       All article disambiguation pages        All disambiguation pages        Place name disambiguation pages        Short description is different from Wikidata             Wikipedia  White Stone       White Stone may refer to       See also             Wikipedia  Yes Tor         grid ref UK   SX            In popular culture        References        External links             Wikipedia  Watermelon Man  composition


#### Lemmatization

In [90]:

# Lemmatize the cleaned text token by token (tokens are split on whitespace)
# and re-join the results with single spaces.
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = map(lemmatizer.lemmatize, clean_text.split())
lemmatized_text = " ".join(lemmatized_tokens)

print("Lemmatized Text (First 500 chars):", lemmatized_text[:500])


Lemmatized Text (First 500 chars): Wikipedia Diego Maradona stadium Diego Maradona stadium can refer to All article disambiguation page All disambiguation page Place name disambiguation page Short description is different from Wikidata Wikipedia White Stone White Stone may refer to See also Wikipedia Yes Tor grid ref UK SX In popular culture References External link Wikipedia Watermelon Man composition length Herbie Hancock version Mongo Santamar a version Chart performance Herbie Hancock version Other version Samples Personnel R


#### Count vectorizer (bag of words)

In [125]:
# Bag-of-words counts over the lemmatized text, keeping at most 10000 terms.
# The whole text is passed as a single document, so the result has one row.
count_vectorizer = CountVectorizer(max_features=10000)
count_corpus = [lemmatized_text]
count_vectors = count_vectorizer.fit_transform(count_corpus)

count_feature_names = count_vectorizer.get_feature_names_out()
print("Count Vectorizer (First 10 features):", count_feature_names[:10])

Count Vectorizer (First 10 features): ['abort' 'about' 'access' 'accessdate' 'account' 'acid' 'act' 'active'
 'addison' 'additional']


#### TF-IDF vectorizer

In [127]:

# TF-IDF weights over the cleaned text, keeping at most 1000 terms.
# NOTE(review): the corpus is a single document, so every term gets the same
# IDF and the weights reduce to normalized term counts; confirm this is
# intended. Also note this cell uses `clean_text`, while the count vectorizer
# cell uses `lemmatized_text`.
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
tfidf_corpus = [clean_text]
tfidf_vectors = tfidf_vectorizer.fit_transform(tfidf_corpus)
print("TF-IDF Features:", tfidf_vectorizer.get_feature_names_out()[:10])


TF-IDF Features: ['abort' 'about' 'access' 'accessdate' 'account' 'acid' 'act' 'active'
 'addison' 'additional']


In [128]:
# Show the first 20 entries of the CBOW model's vocabulary.
# NOTE(review): `word2vec_cbow` is only assigned in a later cell of this
# notebook, so this cell fails on a fresh top-to-bottom run.
vocabulary_preview = list(word2vec_cbow.wv.index_to_key)[:20]
print("Sample words in vocabulary:", vocabulary_preview)


Sample words in vocabulary: ['the', 'wikipedia', 'of', 'references', 'in', 'and', 'link', 'external', 'a', 'also', 'is', 'see', 'to', 'disambiguation', 'page', 'career', 'history', 'with', 'life', 'all']


#### Word2Vec CBOW Model

In [129]:
# Train a Word2Vec model in CBOW mode (sg=0) and save it to disk.
# NOTE(review): `sentences` is not assigned in any cell visible in this
# notebook; it must already exist in the kernel. Confirm where it is built.
word2vec_cbow = Word2Vec(
    sentences,
    sg=0,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
word2vec_cbow.save("word2vec_cbow.model")

# Quick look at the learned vocabulary.
print("Vocabulary Size:", len(word2vec_cbow.wv))
print("Sample Words in Vocabulary:", list(word2vec_cbow.wv.index_to_key)[:10])

# Nearest neighbours of a probe word, if the model knows it.
word_to_check = "wikipedia"  # Choose a valid word from vocabulary
if word_to_check not in word2vec_cbow.wv:
    print(f"'{word_to_check}' not found in vocabulary. Try another word.")
else:
    print("Similar Words:", word2vec_cbow.wv.most_similar(word_to_check, topn=5))

Vocabulary Size: 1039
Sample Words in Vocabulary: ['the', 'wikipedia', 'of', 'references', 'in', 'and', 'link', 'external', 'a', 'also']
Similar Words: [('trials', 0.3639206290245056), ('space', 0.32055142521858215), ('mathematical', 0.25173982977867126), ('first', 0.25064992904663086), ('stadium', 0.24792534112930298)]


#### Word2Vec Skip-gram Model

In [130]:
# Train a Word2Vec model in skip-gram mode (sg=1) and save it to disk.
# NOTE(review): `sentences` is not assigned in any cell visible in this
# notebook; it must already exist in the kernel. Confirm where it is built.
word2vec_sg = Word2Vec(
    sentences,
    sg=1,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
word2vec_sg.save("word2vec_skipgram.model")

# Quick look at the learned vocabulary.
skipgram_vectors = word2vec_sg.wv
print("Skip-gram Vocabulary Size:", len(skipgram_vectors))
print("Skip-gram Sample Words:", list(skipgram_vectors.index_to_key)[:10])

# Nearest neighbours of "wikipedia"; nothing is printed if the word is unknown.
if "wikipedia" in skipgram_vectors:
    print("Skip-gram Similar Words:", skipgram_vectors.most_similar("wikipedia", topn=5))

Skip-gram Vocabulary Size: 1039
Skip-gram Sample Words: ['the', 'wikipedia', 'of', 'references', 'in', 'and', 'link', 'external', 'a', 'also']
Skip-gram Similar Words: [('the', 0.9531342387199402), ('and', 0.9510596990585327), ('references', 0.9473447799682617), ('a', 0.9407880902290344), ('in', 0.9359356164932251)]


#### FastText Model

In [115]:
# Train a FastText model on the same input as the Word2Vec cells and save it.
# NOTE(review): `sentences` is not assigned in any cell visible in this
# notebook; it must already exist in the kernel. Confirm where it is built.
fasttext_model = FastText(
    sentences,
    vector_size=100,
    window=5,
    min_count=1,
    workers=4,
)
fasttext_model.save("fasttext.model")

# Quick look at the learned vocabulary.
fasttext_vectors = fasttext_model.wv
print("FastText Vocabulary Size:", len(fasttext_vectors))
print("FastText Sample Words:", list(fasttext_vectors.index_to_key)[:10])

# Nearest neighbours of "wikipedia", printed only when the membership test passes.
if "wikipedia" in fasttext_vectors:
    print("FastText Similar Words:", fasttext_vectors.most_similar("wikipedia", topn=5))


FastText Vocabulary Size: 1039
FastText Sample Words: ['the', 'wikipedia', 'of', 'references', 'in', 'and', 'link', 'external', 'a', 'also']
FastText Similar Words: [('nation', 0.6517307162284851), ('national', 0.6498042941093445), ('station', 0.6383634209632874), ('relation', 0.6306263208389282), ('international', 0.6161552667617798)]
