# Importing modules

In [6]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np

# Download NLTK resources

In [10]:
# Download NLTK.resources
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\SwarnadeepPramanik\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\SwarnadeepPramanik\AppData\Roaming\nltk_data.
[nltk_data]     ..
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\SwarnadeepPramanik\AppData\Roaming\nltk_data.
[nltk_data]     ..


True

In [11]:
df = pd.read_csv('ML_Data_Ready.csv')

In [12]:
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Text Processing

In [13]:
def preprocess_text(text):
    # Tokenization
    tokens = word_tokenize(text.lower())
    # Remove stopwords
    tokens = [token for token in tokens if token not in stop_words]
    
    # Lemmatization
    tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return ' '.join(tokens)


In [14]:
df.head()

Unnamed: 0,USER_ID,FULL_NAME,text
0,169,Maximo McCauley,CyberSphere Building a recommendation engine b...
1,169,Maximo McCauley,DataBlast Designing a decentralized social med...
2,169,Maximo McCauley,ByteGenius Designing a decentralized content d...
3,158,Kania Haggus,CodeCraft Building a blockchain-based decentra...
4,158,Kania Haggus,CodeCraft Building a blockchain-based decentra...


In [15]:
df['preprocessed_text'] = df['text'].apply(preprocess_text)

In [16]:
df

Unnamed: 0,USER_ID,FULL_NAME,text,preprocessed_text
0,169,Maximo McCauley,CyberSphere Building a recommendation engine b...,cybersphere building recommendation engine bas...
1,169,Maximo McCauley,DataBlast Designing a decentralized social med...,datablast designing decentralized social mediu...
2,169,Maximo McCauley,ByteGenius Designing a decentralized content d...,bytegenius designing decentralized content del...
3,158,Kania Haggus,CodeCraft Building a blockchain-based decentra...,codecraft building blockchain-based decentrali...
4,158,Kania Haggus,CodeCraft Building a blockchain-based decentra...,codecraft building blockchain-based decentrali...
...,...,...,...,...
735,424,Loydie Wych,HTML/CSS Intermediate,html/css intermediate
736,509,Karen Seckington,Ruby on Rails Intermediate,ruby rail intermediate
737,179,Swen Yarrington,CodeNest Building a collaborative project mana...,codenest building collaborative project manage...
738,254,Hinze Teresi,CodeCraft Designing a decentralized finance (D...,codecraft designing decentralized finance ( de...


# Vectorization & Model

In [18]:
# Vectorization
vectorizer = TfidfVectorizer()
text_vectors = vectorizer.fit_transform(df['preprocessed_text'])


In [19]:
def find_most_similar_indices_with_scores(string, vectorizer, text_vectors, k=5):
    # Vectorize the new string
    string_vector = vectorizer.transform([string])

    # Calculate cosine similarity between the new string vector and existing text vectors
    cosine_similarities = cosine_similarity(string_vector, text_vectors).flatten()

    # Find the indices of the top k most similar texts
    top_indices = cosine_similarities.argsort()[-k:][::-1]

    # Get the corresponding cosine similarity scores
    similarity_scores = cosine_similarities[top_indices]

    return top_indices, similarity_scores

# NLP Testing

In [20]:
text_input = "data governance"
find_most_similar_indices_with_scores(text_input,vectorizer,text_vectors)

(array([203, 366, 142, 207, 172], dtype=int64),
 array([0.47694247, 0.47499496, 0.47284904, 0.43332297, 0.38445572]))

In [21]:
df.iloc[203]['FULL_NAME']

'Rey Polkinghorne'