In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from NltkPreprocessingSteps import NltkPreprocessingSteps
import os
from sklearn.cluster import KMeans

In [2]:
# Load data from Excel sheet
os.chdir("E:\Code\OvGU\Information Retrieval\Search Result Clustering")
preprocessed_doc_file_path = 'pre_processed docs with id.xlsx'  
original_doc_file_path = "big dataset input docs.xlsx"
preprocessed_doc_path_exist = os.path.exists(preprocessed_doc_file_path)
original_doc_path_exits = os.path.exists(original_doc_file_path)

print(preprocessed_doc_path_exist, original_doc_path_exits)

True True


In [3]:
ppdoc_df = pd.read_excel(preprocessed_doc_file_path)
odoc_df = pd.read_excel(original_doc_file_path)
ppdoc_df

Unnamed: 0,doc_id,filenames,text
0,1,Ab Shirin.txt,ab shirin abshirin persian b shyryn may refer ...
1,2,Abdullah Al Shami.txt,abdullah alshami arabic bdllh lshmy bear march...
2,3,Abdullah Al Shemali.txt,abdullah al shemali bear june kuwait city kuwa...
3,4,Abdullah Al Yahya.txt,abdullah al yahya arabic bd llh lyhy bear febr...
4,5,Abdullah Alawi.txt,abdullah alawi arabicbd llh lwy bear august qa...
...,...,...,...
1954,1955,"Zwolle, Gelderland.txt",zwolle small village east netherlands locate m...
1955,1956,"Zwolle, Louisiana.txt",zwolle zawallee small town sabine parish louis...
1956,1957,Zwolle.txt,zwolle dutch zvol listen city municipality nor...
1957,1958,Zwollerkerspel.txt,zwollerkerspel low saxon zwollerkarspel former...


In [4]:
# Accept query string from the user
#query = input("Enter your query: ")
query = "World Cup"
query_dict = {
    "doc_id":-1,
    "filenames":"query",
    "text": query
}
query_df = pd.DataFrame(query_dict, index=[0])
query_df

Unnamed: 0,doc_id,filenames,text
0,-1,query,World Cup


In [5]:
txt_preproc = NltkPreprocessingSteps(query_df['text'])
processed_query = \
            txt_preproc \
            .remove_html_tags()\
            .replace_diacritics()\
            .expand_contractions()\
            .remove_numbers()\
            .remove_punctuations_except_periods()\
            .lemmatize()\
            .remove_double_spaces()\
            .remove_all_punctuations()\
            .remove_stopwords()\
            .get_processed_text()

resource corpora/wordnet not found. Downloading now...


[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\sinha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\sinha\AppData\Roaming\nltk_data...


resource corpora/omw-1.4 not found. Downloading now...


[nltk_data]   Package omw-1.4 is already up-to-date!


Executing remove_html_tags
Executing replace_diacritics
Executing expand_contractions
Executing remove_numbers
Executing remove_punctuations_except_periods
Executing lemmatize
Executing remove_double_spaces
Executing remove_all_punctuations
Executing remove_stopwords


In [6]:
print("Prev: ",ppdoc_df.shape)
ppdoc_df.loc[len(ppdoc_df.index)] = [-1, "query", processed_query[0]]
print("After: ",ppdoc_df.shape)
ppdoc_df

Prev:  (1959, 3)
After:  (1960, 3)


Unnamed: 0,doc_id,filenames,text
0,1,Ab Shirin.txt,ab shirin abshirin persian b shyryn may refer ...
1,2,Abdullah Al Shami.txt,abdullah alshami arabic bdllh lshmy bear march...
2,3,Abdullah Al Shemali.txt,abdullah al shemali bear june kuwait city kuwa...
3,4,Abdullah Al Yahya.txt,abdullah al yahya arabic bd llh lyhy bear febr...
4,5,Abdullah Alawi.txt,abdullah alawi arabicbd llh lwy bear august qa...
...,...,...,...
1955,1956,"Zwolle, Louisiana.txt",zwolle zawallee small town sabine parish louis...
1956,1957,Zwolle.txt,zwolle dutch zvol listen city municipality nor...
1957,1958,Zwollerkerspel.txt,zwollerkerspel low saxon zwollerkarspel former...
1958,1959,Zwolnieni z Teorii.txt,zwolnieni z teorii internet platform create zw...


In [7]:
# TF-IDF Vectorization
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(ppdoc_df["text"])

In [8]:
print(tfidf_matrix.shape)
print(tfidf_matrix[-1])

(1960, 20297)
  (0, 4297)	0.8136733299898108
  (0, 19918)	0.581322382214286


In [9]:
clustering_model = KMeans(n_clusters = 25, 
                          init = 'k-means++',
                          max_iter = 300, n_init = 10)
clustering_model.fit(tfidf_matrix)

In [10]:
sorted_centroids = clustering_model.cluster_centers_.argsort()[:, ::-1]
terms = tfidf_vectorizer.get_feature_names_out() 

for i in range(25):
        print("Cluster %d:" % i, end='')
        for ind in sorted_centroids[i, :10]:
            print(' %s' % terms[ind], end='')
        print()
        print()
 
print()

Cluster 0: gene encode enzyme human protein sulfotransferase sulfate succinylcoa cytosolic sulf

Cluster 1: film work series write direct novel star bear american book

Cluster 2: comic marvel book artist captain spiderman series issue drug sight

Cluster 3: may refer american bear january seidelmann miss serve michelle gas

Cluster 4: football play league player team american bear baseball basketball professional

Cluster 5: station railway line radio fm serve license train road rail

Cluster 6: university college professor science academic law school study bear oxford

Cluster 7: specie family genus moth find plant describe beetle endemic america

Cluster 8: oriental company building church bank hall service mottram locate day

Cluster 9: state district united municipality court region town peak elmenhorst village

Cluster 10: play footballer bear midfielder professional rugby august defender league former

Cluster 11: olympics summer compete event men medal win winter bear champions

In [11]:
res = []

for i in range(tfidf_matrix.shape[0]-1):
    sim = cosine_similarity(tfidf_matrix[i], tfidf_matrix[-1])
    if sim > 0:
        cluster_pred = clustering_model.predict(tfidf_matrix[i])
        res.append([sim, odoc_df.loc[i], cluster_pred])
    

In [12]:
res.sort(key=lambda r: r[0], reverse=True)
print("Query: ",query, end="\n\n")
cluster_dict = {}

display_res_limit = 10 if len(res) > 10 else len(res)

for x in range(display_res_limit):
    print("Rank",(x+1))
    print("Sim: ",res[x][0], "Cluster: ",res[x][2])
    print("File name: ",res[x][1]["filenames"])
    print("Doc: ", res[x][1]["text"], end="\n\n")

    # inc cluster count in dict
    cluster_dict[res[x][2][0]]  = cluster_dict.get(res[x][2][0], 0) + 1

print(cluster_dict)

Query:  World Cup

Rank 1
Sim:  [[0.3508111]] Cluster:  [4]
File name:  Bob Storey.txt
Doc:  Bob Storey is a former offensive and defensive back, and kick returner, who played four seasons in the Canadian Football League, winning 2 Grey Cups. He played 2 seasons and 28 games for the Hamilton Tiger-Cats, winning a Cup in 1967, and 2 seasons and 27 games for the Montreal Alouettes, winning another cup in 1970.
His father was Red Storey, famed Canadian football player (and Grey Cup champion) and sportsman.

Rank 2
Sim:  [[0.34551797]] Cluster:  [24]
File name:  Suranga Sampath.txt
Doc:  Suranga Sampath is a Sri Lankan blind cricketer. He was part of the Sri Lankan team during the 2017 ICC World T20 for the Blind. In the 2017 Blind T20 World Cup, he was the top scorer throughout the tournament with an aggregate of 733 runs with an average of excess 150. He also scored 5 centuries during the tournament, the most by any player in a single Blind T20 World Cup series; thus becoming the only ba