### 1. Input Preprocessing

Importing Necessary Libraries:

In [None]:
import nltk #The Natural Language Toolkit.
import spacy #Another NLP library for building pipelines.
import string #Used to access the "punctuation" constant within the "string" module --> "punctuation" contains all the punctuation characters defined in the ASCII standard.
import torch

#Input Processing Libraries:
nltk.download("punkt") #A module from NLTK for effective tokenization.
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
from nltk.corpus import stopwords
nltk.download('averaged_perceptron_tagger') #A module from NLTK for POS Tagging.
from nltk import pos_tag

!pip install transformers
from transformers import pipeline, BertTokenizer, BertModel #Need it to access the BERT transformer.

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


Collecting transformers
  Downloading transformers-4.31.0-py3-none-any.whl (7.4 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.4/7.4 MB[0m [31m60.7 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.14.1 (from transformers)
  Downloading huggingface_hub-0.16.4-py3-none-any.whl (268 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m268.8/268.8 kB[0m [31m35.1 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1 (from transformers)
  Downloading tokenizers-0.13.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (7.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.8/7.8 MB[0m [31m114.7 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting safetensors>=0.3.1 (from transformers)
  Downloading safetensors-0.3.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (1.3 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m81.7 MB/s[0m eta [36m0:00:

Loading BERT Transformer:

In [None]:
#Loading BERT model for feature extraction:
#NOTE: `model` holds the checkpoint name (a string), not a loaded model object.
model = "bert-large-uncased" #Using the large BERT model with uncased (lowercase) text.
# pipelines are a high-level interface from the Huggingface transformers library for interacting with the BERT model
bertNLP = pipeline("feature-extraction", model = model) #BERT-based pipeline object --> Initialized for feature extraction.

Downloading (…)lve/main/config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.34G [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/28.0 [00:00<?, ?B/s]

Downloading (…)solve/main/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)/main/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Input Processing from scratch

In [None]:
#This is a series of functions that are going to be used to break down the user input (input processing).
#The functions below (other than POS tagging) are largely no longer needed, as BERT already captures much
#of the relevant linguistic information in its encoded representations.

def tokenizeText(userInput):
  """Split the raw input string into word-level tokens using NLTK's word_tokenize."""
  return word_tokenize(userInput)


def stopwordRemoval(encodedText):
  """Return the tokens from encodedText that are not English stopwords.

  Every token is converted to a string and lowercased only for the membership
  check against NLTK's English stopword list; the original token objects are
  returned unchanged, in their original order, as a list.
  """
  englishStopwords = set(stopwords.words("english"))
  return [token for token in encodedText if str(token).lower() not in englishStopwords]


def ner(userInput): #Named Entity Recognition.
  """Run spaCy's "en_core_web_sm" model on userInput and collect its named entities.

  Returns a list of (entity text, entity label) tuples, one per entity in doc.ents.
  The spaCy model is loaded on every call.
  """
  englishModel = spacy.load("en_core_web_sm")
  parsedText = englishModel(userInput)

  #One (text, label) pair per entity found in the processed text.
  return [(span.text, span.label_) for span in parsedText.ents]


def dependencyParsing(userInput):
  """Run spaCy's "en_core_web_sm" model on userInput and describe each token's dependency.

  Returns a list with one (token text, dependency label, head token text) tuple
  per token of the processed text. The spaCy model is loaded on every call.
  """
  englishModel = spacy.load("en_core_web_sm")
  parsedText = englishModel(userInput)

  return [(word.text, word.dep_, word.head.text) for word in parsedText]


# POS tagging is the only function not fully handled by BERT
def posTagging(tokenizedWords): #Part-of-Speech tagging.
  """Tag every token with its part of speech using NLTK.

  The whole token sequence is handed to nltk.pos_tag in a single call so the
  tagger can use the neighbouring tokens as context. Tagging each token in its
  own one-element list (the previous behaviour) removes that context.

  Args:
    tokenizedWords: iterable of tokens; each one is converted with str().

  Returns:
    A list of (token string, POS tag) tuples, one per input token, in order.
  """
  tokenStrings = [str(token) for token in tokenizedWords]
  return nltk.pos_tag(tokenStrings)


Preprocessing Test Case:

In [None]:
#Sample project description used to exercise the "from scratch" preprocessing helpers.
projectDescription = "I'm building an app that automates employee onboarding in a company."

tokenizedWords = tokenizeText(projectDescription) #Word-level NLTK tokens.
filteredTokens = stopwordRemoval(tokenizedWords) #Same tokens with English stopwords removed.
taggedTokens = posTagging(filteredTokens) #POS tags are computed on the stopword-filtered tokens.
namedEntities = ner(projectDescription) #NER runs on the raw string, not on the token list.
dependencyParsed = dependencyParsing(projectDescription) #Dependency parsing also runs on the raw string.

#Printing the results:
print("Tokens:", tokenizedWords)
print("Filtered Tokens:", filteredTokens)
print("POS Tagging:", taggedTokens)
print("Named Entities:", namedEntities)
print("Dependency Parsing:", dependencyParsed)

Tokens: ['I', "'m", 'building', 'an', 'app', 'that', 'automates', 'employee', 'onboarding', 'in', 'a', 'company', '.']
Filtered Tokens: ["'m", 'building', 'app', 'automates', 'employee', 'onboarding', 'company', '.']
POS Tagging: [("'m", 'VBP'), ('building', 'NN'), ('app', 'NN'), ('automates', 'NNS'), ('employee', 'NN'), ('onboarding', 'VBG'), ('company', 'NN'), ('.', '.')]
Named Entities: []
Dependency Parsing: [('I', 'nsubj', 'building'), ("'m", 'aux', 'building'), ('building', 'ROOT', 'building'), ('an', 'det', 'app'), ('app', 'dobj', 'building'), ('that', 'nsubj', 'automates'), ('automates', 'relcl', 'app'), ('employee', 'compound', 'onboarding'), ('onboarding', 'dobj', 'automates'), ('in', 'prep', 'onboarding'), ('a', 'det', 'company'), ('company', 'pobj', 'in'), ('.', 'punct', 'building')]


In [None]:
"""
Function to preprocess text using BERT pipeline and NLTK (POS tagging)
"""
def preprocessText(userInput: str) -> "dict[str, tuple[torch.Tensor, str]]":
  """Map each BERT token of userInput to its embedding and its NLTK POS tag.

  The tokens are produced with the pipeline's own tokenizer, special tokens
  included, so that token i corresponds exactly to embedding row i. Special
  tokens ([CLS], [SEP], [PAD], [UNK]) are then dropped together with their
  embedding rows, which keeps the remaining tokens and tensors aligned.

  Args:
    userInput: the raw text to process.

  Returns:
    A dict whose keys are tokens and whose values are (embedding tensor, POS tag)
    tuples. If a token occurs more than once, the last occurrence wins.

  Raises:
    ValueError: if the number of tokens differs from the number of embedding rows
      returned by the pipeline.
  """
  specialTokens = {"[CLS]", "[SEP]", "[PAD]", "[UNK]"}

  #Reusing the tokenizer already loaded by the pipeline: no reload per call, and
  #the tokens are guaranteed to match what the model actually saw.
  tokenizer = bertNLP.tokenizer
  tokenIds = tokenizer(userInput)["input_ids"]
  allTokens = tokenizer.convert_ids_to_tokens(tokenIds) #Includes [CLS] ... [SEP].

  embeddings = bertNLP(userInput) #Nested list: one entry per input text, holding one vector per token.
  #Concatenating the per-text token matrices into a single 2D tensor (one row per token).
  flattenedTensors = torch.cat([torch.tensor(embedding) for embedding in embeddings], dim = 0)

  if len(allTokens) != flattenedTensors.shape[0]:
    raise ValueError(
      f"Token/embedding count mismatch: {len(allTokens)} tokens vs {flattenedTensors.shape[0]} embeddings."
    )

  #Filtering Tokens (and their embedding rows, so the two lists stay aligned):
  filteredTokens = []
  filteredTensors = []
  for token, tensor in zip(allTokens, flattenedTensors):
    if token not in specialTokens:
      filteredTokens.append(token)
      filteredTensors.append(tensor)
  print("Filtered Tokens: ", filteredTokens)

  #Performing POS tagging using NLTK (one tag per filtered token, in order).
  posTags = nltk.pos_tag(filteredTokens)

  #Mapping tokens to (tensor, POS tag) in a single pass, so a repeated token can
  #never end up with a nested ((tensor, tag), tag) value.
  tokenTensors = {} #Key = Token & Value = (Corresponding Tensor, POS tag)
  for (token, posTag), tensor in zip(posTags, filteredTensors):
    tokenTensors[token] = (tensor, posTag)

  return tokenTensors

# 2. BERT-only preprocessing

In [None]:
MAX_TOKEN_LENGTH = 50

def preprocess_input_text(input_text: str, max_tokens: "int | None" = None) -> str:
    """Normalise raw text before it is passed to the BERT pipeline.

    Args:
        input_text: the raw text to normalise.
        max_tokens: maximum number of word tokens to keep. When None (the default),
            the module-level MAX_TOKEN_LENGTH is used.

    Returns:
        A single string made of the lowercased, truncated tokens joined by spaces.
    """
    if max_tokens is None:
        max_tokens = MAX_TOKEN_LENGTH

    # 1. Tokenize the input text with NLTK's word-level tokenizer (not the BERT tokenizer).
    tokenizedWords = word_tokenize(input_text)

    # 2. Keep only the first max_tokens tokens and lowercase them.
    tokenized_text = [token.lower() for token in tokenizedWords[:max_tokens]]

    # 3. Convert the list of tokens back to a string.
    return " ".join(tokenized_text)


In [None]:
"""
Function to generate a sentence-level embedding (the [CLS] vector) using the BERT pipeline
"""
def generate_sentence_embeddings(sentence: str) -> torch.Tensor:
  """Return the [CLS] embedding of sentence as a 1-D tensor.

  The sentence is run through the module-level bertNLP feature-extraction
  pipeline, which tokenizes it and passes it through the BERT model.

  Args:
    sentence: the text to embed.

  Returns:
    A 1-D tensor holding the embedding at position 0 of the sequence ([CLS]),
    used here as the sentence-level representation.
  """
  embeddings = bertNLP(sentence)
  #embeddings is a nested list: outer list = input texts, inner list = one vector per token.

  # [CLS] is the first token of the sequence and represents sentence-level meaning.
  cls_index = 0
  # Only the [CLS] row is converted to a tensor; the other token vectors are never
  # used, so there is no need to build and stack a tensor for each of them.
  cls_embedding = torch.tensor(embeddings[0][cls_index])

  return cls_embedding

In [None]:
#Embedding a full project description.
line_1 = "I'm building an app that automates employee onboarding in a company."
processed_text1 = preprocess_input_text(line_1)
embedding_1 = generate_sentence_embeddings(processed_text1)
print(f'line 1 embedding: {embedding_1.shape}')

#Embedding a short related phrase to compare against.
line_2 = "workplace onboarding"
processed_text_2 = preprocess_input_text(line_2)
embedding_2 = generate_sentence_embeddings(processed_text_2)
print(f'line 2 embedding: {embedding_2.shape}')

similarity = torch.nn.functional.cosine_similarity(embedding_1, embedding_2, dim=0)
#Cosine similarity is being performed on the CLS embeddings of each sentence.
#The CLS embedding is used to represent the meaning of the phrase, so project descriptions are compared by how similar their CLS embeddings are.

print(similarity)

line 1 embedding: torch.Size([1024])
line 2 embedding: torch.Size([1024])
tensor(0.5932)


In [None]:
def test_cosine_similarity():
    """Embed ten paraphrases about sunrise and print every sentence pair ranked by cosine similarity.

    Prints the shape of the first embedding, then for each pair (highest score
    first) both sentences with their 1-based numbers and the similarity score.
    """
    sentences = [
        "The sun rises in the east.",
        "Daylight emerges from the eastern horizon.",
        "Mornings start with the appearance of the sun on the eastern side.",
        "Sunlight becomes visible as it ascends in the sky from the east.",
        "The break of dawn occurs when the sun comes up in the eastern direction.",
        "Early mornings are marked by the sun's presence in the east.",
        "The eastern horizon brightens as the sun begins to rise.",
        "Sunbeams gradually illuminate the eastern sky at daybreak.",
        "At dawn, the sun starts its journey from the east.",
        "As the day begins, the sun's rays become visible on the eastern side."
    ]

    # One sentence-level embedding per sentence, in the same order as `sentences`.
    embeddings = []
    for text in sentences:
        embeddings.append(generate_sentence_embeddings(preprocess_input_text(text)))
    print(embeddings[0].shape)

    # Score every unordered pair (first < second) exactly once.
    total = len(embeddings)
    pair_scores = [
        (torch.nn.functional.cosine_similarity(embeddings[first], embeddings[second], dim=0).item(), first, second)
        for first in range(total)
        for second in range(first + 1, total)
    ]

    # Highest similarity first.
    ranked_pairs = sorted(pair_scores, key=lambda entry: entry[0], reverse=True)

    # Report each pair with human-friendly 1-based sentence numbers.
    for score, first, second in ranked_pairs:
        print(f"Sentence {first+1}: {sentences[first]}")
        print(f"Sentence {second+1}: {sentences[second]}")
        print(f"Similarity score: {score}")
        print()

In [None]:
# Run the pairwise similarity check; it prints every sentence pair ordered by score.
test_cosine_similarity()

torch.Size([1024])
Sentence 7: The eastern horizon brightens as the sun begins to rise.
Sentence 9: At dawn, the sun starts its journey from the east.
Similarity score: 0.8732309341430664

Sentence 2: Daylight emerges from the eastern horizon.
Sentence 7: The eastern horizon brightens as the sun begins to rise.
Similarity score: 0.8656920194625854

Sentence 1: The sun rises in the east.
Sentence 9: At dawn, the sun starts its journey from the east.
Similarity score: 0.8619586229324341

Sentence 4: Sunlight becomes visible as it ascends in the sky from the east.
Sentence 10: As the day begins, the sun's rays become visible on the eastern side.
Similarity score: 0.8485180139541626

Sentence 4: Sunlight becomes visible as it ascends in the sky from the east.
Sentence 8: Sunbeams gradually illuminate the eastern sky at daybreak.
Similarity score: 0.8410884141921997

Sentence 5: The break of dawn occurs when the sun comes up in the eastern direction.
Sentence 6: Early mornings are marked by

# 3. Dataset Cleanup

In [None]:
import pandas as pd
from google.colab import drive
# Mount Google Drive so the dataset under /content/gdrive can be read and written (Colab only).
drive.mount('/content/gdrive')

import os
import sys  # NOTE(review): not used in the cells visible here — confirm it is needed.

Mounted at /content/gdrive


In [None]:
# Load the scraped Devpost dataset from the mounted Drive; later cells read its 'url', 'description' and 'technologies' columns.
devpost_df = pd.read_csv("/content/gdrive/MyDrive/SYDE-Project/Rec-Eng/Data/devpost_scraper_output_long_1.csv")

In [None]:
import re

def remove_special_characters(input_text):
  """Flatten line breaks and strip backslash escape sequences from input_text.

  Newline and carriage-return characters are each replaced by a single space,
  then every backslash together with the character that follows it is removed.
  A backslash that is the very last character has nothing after it and is kept.
  """
  # Map both line-break characters to a space in one pass.
  line_breaks_to_space = {ord('\n'): ' ', ord('\r'): ' '}
  single_line = input_text.translate(line_breaks_to_space)

  # Drop each backslash plus the one character that follows it.
  escape_sequence = re.compile(r'\\.')
  return escape_sequence.sub('', single_line)

"""
Method to apply any cleanup methods to the data
"""
def cleanup_data_text(text):
  """Apply the dataset cleanup steps to text; currently only remove_special_characters."""
  return remove_special_characters(text)

Cleaning up dataframe

In [None]:
# Distinct Python types found in 'description'. This is not the last expression of
# the cell, so its value is computed but not displayed.
devpost_df['description'].apply(type).unique()
# Rows whose description is a float (the captured output shows the one such row holds NaN).
float_rows = devpost_df[devpost_df['description'].apply(lambda x: isinstance(x, float))]
# NOTE: despite the wording of the message, float_rows holds rows, not columns.
print(f"Columns with data type 'float': {float_rows}")
# Row count before cleaning.
devpost_df.shape[0]

Columns with data type 'float':                                          url description        technologies
6180  https://devpost.com/software/carboncut         NaN  javascript, r, wix


10190

In [None]:
# Keep only the rows whose 'description' is a string (this removes the non-string rows).
string_rows = devpost_df[devpost_df['description'].apply(lambda x: isinstance(x, str))]

# dropna() additionally drops any remaining row that has a missing value in ANY column,
# not only in 'description'.
devpost_df_cleaned = string_rows.dropna()
# Row count after cleaning.
devpost_df_cleaned.shape[0]

10189

In [None]:
# Derive a cleaned copy of every description (line breaks and backslash escapes removed).
devpost_df_cleaned['cleaned_description'] = devpost_df_cleaned['description'].apply(cleanup_data_text)
devpost_df_cleaned.head()

Unnamed: 0,url,description,technologies,cleaned_description
0,https://devpost.com/software/empower-ai,Image Captioning and TexttoSpeech System\nThe ...,"cuda, git, python, pytorch, react",Image Captioning and TexttoSpeech System The I...
1,https://devpost.com/software/realideas,What it does This web serve as a platform to a...,"cloudinary, express.js, mongodb, node.js, post...",What it does This web serve as a platform to a...
2,https://devpost.com/software/multilingual-tran...,It summarizes any long article to a brief summ...,"css, javascript, machine-learning, natural-lan...",It summarizes any long article to a brief summ...
3,https://devpost.com/software/chatcaptain,As we acknowledge that there is room for impro...,"firebase, javascript, node.js, react",As we acknowledge that there is room for impro...
4,https://devpost.com/software/volunteerin-1vehon,VolunteerIn is an online platform where high s...,"css, html, javascript, react",VolunteerIn is an online platform where high s...


# 4. Dataset preprocessing by BERT (2)

In [None]:
# Tokenize, lowercase and truncate every cleaned description before embedding it.
devpost_df_cleaned['preprocessed_text'] = devpost_df_cleaned['cleaned_description'].apply(preprocess_input_text)
devpost_df_cleaned.head()

Unnamed: 0,url,description,technologies,cleaned_description,preprocessed_text
0,https://devpost.com/software/empower-ai,Image Captioning and TexttoSpeech System\nThe ...,"cuda, git, python, pytorch, react",Image Captioning and TexttoSpeech System The I...,image captioning and texttospeech system the i...
1,https://devpost.com/software/realideas,What it does This web serve as a platform to a...,"cloudinary, express.js, mongodb, node.js, post...",What it does This web serve as a platform to a...,what it does this web serve as a platform to a...
2,https://devpost.com/software/multilingual-tran...,It summarizes any long article to a brief summ...,"css, javascript, machine-learning, natural-lan...",It summarizes any long article to a brief summ...,it summarizes any long article to a brief summ...
3,https://devpost.com/software/chatcaptain,As we acknowledge that there is room for impro...,"firebase, javascript, node.js, react",As we acknowledge that there is room for impro...,as we acknowledge that there is room for impro...
4,https://devpost.com/software/volunteerin-1vehon,VolunteerIn is an online platform where high s...,"css, html, javascript, react",VolunteerIn is an online platform where high s...,volunteerin is an online platform where high s...


In [None]:
#devpost_df_cleaned = devpost_df_cleaned.head(100) # CHANGE THIS FOR TESTING TO TRY DIFFERENT SIZED DATASETS
# Compute one [CLS] sentence embedding per preprocessed description (one BERT forward pass per row).
devpost_df_cleaned['embeddings'] = devpost_df_cleaned['preprocessed_text'].apply(generate_sentence_embeddings)
devpost_df_cleaned.head()

Unnamed: 0,url,description,technologies,cleaned_description,preprocessed_text,embeddings
0,https://devpost.com/software/empower-ai,Image Captioning and TexttoSpeech System\nThe ...,"cuda, git, python, pytorch, react",Image Captioning and TexttoSpeech System The I...,image captioning and texttospeech system the i...,"[tensor(-0.2477), tensor(-0.1445), tensor(-0.7..."
1,https://devpost.com/software/realideas,What it does This web serve as a platform to a...,"cloudinary, express.js, mongodb, node.js, post...",What it does This web serve as a platform to a...,what it does this web serve as a platform to a...,"[tensor(-0.5837), tensor(-0.8866), tensor(-0.9..."
2,https://devpost.com/software/multilingual-tran...,It summarizes any long article to a brief summ...,"css, javascript, machine-learning, natural-lan...",It summarizes any long article to a brief summ...,it summarizes any long article to a brief summ...,"[tensor(-0.0907), tensor(-0.2703), tensor(-0.4..."
3,https://devpost.com/software/chatcaptain,As we acknowledge that there is room for impro...,"firebase, javascript, node.js, react",As we acknowledge that there is room for impro...,as we acknowledge that there is room for impro...,"[tensor(-0.0143), tensor(0.2096), tensor(-0.79..."
4,https://devpost.com/software/volunteerin-1vehon,VolunteerIn is an online platform where high s...,"css, html, javascript, react",VolunteerIn is an online platform where high s...,volunteerin is an online platform where high s...,"[tensor(-0.2154), tensor(-0.0140), tensor(-1.1..."


Saving select columns from df to csv

In [None]:
columns_to_save = ['url', 'description', 'embeddings']
devpost_df_to_save = devpost_df_cleaned.loc[:, columns_to_save]

#Specifying the complete file path for the csv file with embeddings to be stoed:
if not os.path.isdir("/content/gdrive/MyDrive/SYDE-Project/Rec-Eng/Data/Output"):
  os.makedirs("/content/gdrive/MyDrive/SYDE-Project/Rec-Eng/Data/Output")
%cd /content/gdrive/MyDrive/SYDE-Project/Rec-Eng/Data/Output
devpost_df_to_save.to_csv("devpost_data_with_embeddings.csv", index=False)

/content/gdrive/MyDrive/SYDE-Project/Rec-Eng/Data/Output


# 5. V0 Recommendations
using cosine similarity and BERT embeddings (no other augmentations)

Generate recommendations

In [None]:
def preprocess_input(input_description: str) -> torch.Tensor:
  """Turn a raw query description into its sentence embedding.

  The text is first normalised with preprocess_input_text and the result is
  embedded with generate_sentence_embeddings.
  """
  return generate_sentence_embeddings(preprocess_input_text(input_description))

def cosine_similarity_recommendations(input_dataset: pd.DataFrame, input_embeddings: torch.Tensor, count: int) -> pd.DataFrame:
  """Return the `count` rows of input_dataset most similar to input_embeddings.

  Args:
    input_dataset: DataFrame with an 'embeddings' column holding one tensor per row.
    input_embeddings: the query embedding, compared to each row along dim 0.
    count: maximum number of rows to return.

  Returns:
    A new DataFrame with an added 'cosine_similarity' column (plain floats),
    sorted from most to least similar and limited to `count` rows.
    input_dataset itself is left unmodified.
  """
  # .item() turns each 0-d result tensor into a float, giving a numeric column
  # that sorts natively instead of a column of tensor objects.
  cosine_similarities = input_dataset['embeddings'].apply(
    lambda emb: torch.nn.functional.cosine_similarity(input_embeddings, emb, dim=0).item()
  )

  # assign() returns a new frame, so the caller's DataFrame does not silently gain
  # (or have overwritten) a 'cosine_similarity' column on every query.
  scored_dataset = input_dataset.assign(cosine_similarity=cosine_similarities)

  # Sort by cosine similarity in descending order to get the recommendations.
  recommendations = scored_dataset.sort_values(by='cosine_similarity', ascending=False).head(count)

  return recommendations

Test Recommendations

In [None]:
# Embed a sample query and fetch the 10 most similar projects from the dataset.
query = "food donation app"
query_embedding = preprocess_input(query)
recommendations_df = cosine_similarity_recommendations(devpost_df_cleaned, query_embedding, 10)
# Only the first 5 of the 10 recommendations are displayed.
recommendations_df.head()

Unnamed: 0,url,description,technologies,cleaned_description,preprocessed_text,embeddings,cosine_similarity
688,https://devpost.com/software/stockprophet,stock price predictor,machine-learning,stock price predictor,stock price predictor,"[tensor(-0.6010), tensor(-0.1646), tensor(-0.0...",tensor(0.9087)
7372,https://devpost.com/software/local-community,built an app,android,built an app,built an app,"[tensor(-0.2970), tensor(-0.7017), tensor(-0.2...",tensor(0.9070)
7404,https://devpost.com/software/app-o0ref1,built an app,android,built an app,built an app,"[tensor(-0.2970), tensor(-0.7017), tensor(-0.2...",tensor(0.9070)
2734,https://devpost.com/software/potluck-pal-yl43d9,Automatically organizes potluck menus from ing...,javascript,Automatically organizes potluck menus from ing...,automatically organizes potluck menus from ing...,"[tensor(-0.5192), tensor(-0.4525), tensor(-0.3...",tensor(0.9057)
3149,https://devpost.com/software/ai-trivia,Asks you AI generated questions,"css3, flask, html5, javascript, python, websoc...",Asks you AI generated questions,asks you ai generated questions,"[tensor(-0.3389), tensor(-0.5953), tensor(-0.3...",tensor(0.9052)


In [None]:
# Make recommendation based on sorted dataframe
highest_similarity_row = recommendations_df.iloc[0]
technolgogies_to_recommend = highest_similarity_row['technologies']
print(f'{technolgogies_to_recommend}')

machine-learning
