In [44]:
import wikipediaapi
import nltk
import ssl
import re
import numpy as np
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans

try:
    _create_unverified_https_context = ssl._create_unverified_context
except AttributeError:
    pass
else:
    ssl._create_default_https_context = _create_unverified_https_context

# Now, try to download the data again
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

print("NLTK data downloaded successfully using the manual method.")

NLTK data downloaded successfully using the manual method.


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\pasin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\pasin\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\pasin\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [45]:
wiki_api = wikipediaapi.Wikipedia('test_pr/1.0', 'en')
page = wiki_api.page('Nirvana (band)')

if page.exists():
    print("Title:", page.title, "/n")
    print("Text:", page.text, "/n")

Title: Nirvana (band) /n
Text: Nirvana was an American rock band formed in Aberdeen, Washington, in 1987. Founded by lead singer and guitarist Kurt Cobain and bassist Krist Novoselic, the band went through a succession of drummers, most notably Chad Channing, before recruiting Dave Grohl in 1990. Nirvana's success popularized alternative rock, and they were often referenced as the figurehead band of Generation X. Their music maintains a popular following and continues to influence rock culture.
In the late 1980s, Nirvana established itself as part of the Seattle grunge scene, releasing its first album, Bleach, for the independent record label Sub Pop in 1989. They developed a sound that relied on dynamic contrasts, often between quiet verses and loud, heavy choruses. After signing to the major label DGC Records in 1990, Nirvana found unexpected mainstream success with "Smells Like Teen Spirit", the first single from its landmark second album, Nevermind (1991). A cultural phenomenon of 

In [46]:
article_titles = [
    "Galaxy", "Black Hole", "Supernova",
    "DNA", "Photosynthesis", "Evolution",
    "Machine learning", "Artificial Intelligence", "Computer Science"
]

wiki_api = wikipediaapi.Wikipedia('MyClusteringProject/1.0', 'en')

documents = []

for title in article_titles:
    page = wiki_api.page(title)
    if page.exists():
        documents.append(page.text)
    else:
        print("Such Document Doesn't Exist")
    
print("\nFinal Document List:", documents)





In [47]:
stop_words = set(stopwords.words('english'))
lemmeatizer = WordNetLemmatizer()

def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    words = text.split()
    processed_words = [lemmeatizer.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(processed_words)

processed_documents = [preprocess_text(doc) for doc in documents]
print(processed_documents)




In [48]:
vectorizer = TfidfVectorizer(max_features=1000)
tfidf_matrix = vectorizer.fit_transform(processed_documents)
print(tfidf_matrix)

print("TF-IDF maxtrix created successfully")
print(f"Shape of the matrix: {tfidf_matrix.shape}")


<Compressed Sparse Row sparse matrix of dtype 'float64'
	with 5238 stored elements and shape (9, 1000)>
  Coords	Values
  (0, 382)	0.8697331752334779
  (0, 903)	0.011600328464334625
  (0, 866)	0.28342742451698
  (0, 871)	0.02500830216326294
  (0, 776)	0.004910265858239363
  (0, 472)	0.027006462220316493
  (0, 385)	0.06295149987608824
  (0, 270)	0.029461595149436175
  (0, 215)	0.0392821268659149
  (0, 552)	0.03205636263284271
  (0, 109)	0.008763036744009368
  (0, 932)	0.011803406226766544
  (0, 402)	0.016672201442175294
  (0, 992)	0.004808454394926405
  (0, 572)	0.1361563117777649
  (0, 761)	0.002900082116083656
  (0, 982)	0.07105201184404958
  (0, 193)	0.005320825107854087
  (0, 848)	0.013144555116014053
  (0, 310)	0.009836171855638787
  (0, 573)	0.017630999448063487
  (0, 738)	0.004808454394926405
  (0, 839)	0.021639578082405332
  (0, 271)	0.055574004807250985
  (0, 510)	0.007250205290209141
  :	:
  (8, 492)	0.10921317898566449
  (8, 118)	0.023402824068356677
  (8, 869)	0.023402824068

In [49]:
k = 3
## The below line is functionality identical to the above line
# kmeans = KMeans(nclusters= k, init="k-means++, random_state=42, n_init=10")
kmeans = KMeans(n_clusters=k, random_state=42, n_init=10)
kmeans.fit(tfidf_matrix)

#Get the cluster assignments for each document
labels = kmeans.labels_
print(labels)

[2 2 2 0 0 0 1 1 1]


In [50]:
sets = "An algorithm is a set of well-defined instructions designed to perform a specific task or solve a computational problem. In computer science,"
" the study of algorithms is fundamental to creating efficient and scalable software. Data structures, such as arrays and hash tables, are used to organize"
" data in a way that allows these algorithms to access and manipulate it effectively."

processed_sets = preprocess_text(sets)
new_tfidf_vector = vectorizer.transform([processed_sets])
predicted_label = kmeans.predict(new_tfidf_vector)

print(f"\nThe new document belongs to cluster: {predicted_label[0]}")


The new document belongs to cluster: 1
