In [20]:
sample_text = [
    """Description
    We're looking for an experienced NLP & Search research engineer to work within AppGallery Search & Recommendation projects.

    AppGallery is a package manager and application distribution platform, or marketplace 'app store', developed by Huawei for the Android operating system. AppGallery is used by 400+ million active users on 700+ million Huawei devices. Our team in Turkey is responsible for improving the AI driven search & recommendation capabilities of the application for the Asia, Africa, Latin America, Europe and Russia markets.

    What can you expect in this role?

    As a part of our research team you'll work with talented researchers and developers at every level to improve our search and recommendation solutions within AppGallery. This role allows you to directly affect user experience of millions of AppGallery users.

    You'll have a chance to thrive in a multinational environment where you can work closely with other colleagues from China, Singapore and Ireland teams. Within this role you can expect to carry out following tasks;

    Requirements
    Design, develop, and optimize retrieval models (RMs) for efficient and accurate search functionality in app store domain.
    Implement NLP techniques and train, evaluate (offline & online), optimize deep learning models to enhance semantic search capabilities.
    Design, build and deploy scalable ML services on Huawei's MLOps platform to address business requirements.
    Collaborate with cross-functional teams to analyze complex data requirements and design innovative software solutions.
    Conduct code reviews to ensure high-quality, maintainable, and efficient code following industry best practices.
    Mentor and guide junior engineers in the areas of search, information retrieval, deep learning, and NLP.
    Stay updated with the latest advancements in search, information retrieval, deep learning, and NLP, and integrate them into our products and services.
    Produce academic outputs in the form of papers, patents, and technical talks.

    What are the qualifications to fit the role?

    Minimum MS degree or PHD degree with focus on NLP, IR and DL, preferably in a computer science & engineering or related fields.
    Minimum 3 years of experience preferably in industry.
    Strong Python programming skills with proven experience crafting, prototyping, and delivering machine learning solutions into production.
    Experience with popular deep learning frameworks (TensorFlow, PyTorch) and NLP libraries.
    Publication records in journals or conferences related with NLP especially in the areas of vector search, dense retrieval, semantic search.
    Experience in integrating retrieval models (RMs) for large-scale search engines using these specialized techniques.
    Previous project experience in search or recommendation systems domain is a big plus.
    Java service development experience using Spring and related topics & technologies is a plus (RESTful services, Redis, ElasticSearch, RDBMS)
    Fluency in English is important, reading/writing skills in Russian and/or Arabic is a plus.

    Am i right for the team?

    If you're excellent in analysis, modeling and problem-solving, and can see the essence of problems from complex data, you can be perfect fit for the task at hand.
    If you're easy to communicate, open for suggestions and improvements, can work independently, pro-actively and well aligned then you can be perfect fit for our team culture."""
]



### TOKENIZATION

In [21]:
from nltk.tokenize import word_tokenize


In [22]:
import nltk
nltk.download('punkt')


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/tugbakayhan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [23]:
sentences = []
word_set = []

for sent in sample_text:
    words = [word.lower() for word in word_tokenize(sent) if word.isalpha()]
    sentences.append(words)
    for word in words:
        if word not in word_set:
            word_set.append(word)
# Set of words
word_set = set(word_set)


In [24]:
# total documents in our corpus
total_docs = len(sample_text)
print('Total documents: ', total_docs)
print('Total words: ', len(word_set))

Total documents:  1
Total words:  252


### İNDEXİNG

In [25]:
word_index = {}
for i, word in enumerate(word_set):
    word_index[word] = i

### Create a dictionary to keep the count of the number of documents containing the given word.



In [26]:
def count_dict(sentences):
    count_dict = {}
    for word in word_set:
        count_dict[word] = 0
    for sent in sentences:
        for word in sent:
            count_dict[word] += 1
    return count_dict
word_count = count_dict(sentences)
print(word_count)

{'ml': 1, 'store': 2, 'code': 2, 'reviews': 1, 'journals': 1, 'well': 1, 'to': 11, 'domain': 2, 'on': 3, 'popular': 1, 'open': 1, 'latin': 1, 'app': 1, 'every': 1, 'technical': 1, 'solutions': 3, 'china': 1, 'develop': 1, 'service': 1, 'mentor': 1, 'evaluate': 1, 'areas': 2, 'latest': 1, 'previous': 1, 'them': 1, 'other': 1, 'manager': 1, 'analyze': 1, 'europe': 1, 'package': 1, 'design': 3, 'phd': 1, 'programming': 1, 'millions': 1, 'publication': 1, 'information': 2, 'software': 1, 'degree': 2, 'responsible': 1, 'deploy': 1, 'easy': 1, 'this': 3, 'thrive': 1, 'android': 1, 'preferably': 2, 'rms': 2, 'pytorch': 1, 'ireland': 1, 'devices': 1, 'arabic': 1, 'focus': 1, 'following': 2, 'user': 1, 'capabilities': 2, 'distribution': 1, 'right': 1, 'multinational': 1, 'aligned': 1, 'advancements': 1, 'java': 1, 'especially': 1, 'am': 1, 'elasticsearch': 1, 'if': 2, 'machine': 1, 'crafting': 1, 'colleagues': 1, 'spring': 1, 'best': 1, 'marketplace': 1, 'where': 1, 'practices': 1, 'vector': 1,

### term frequenct calculation in the corpus- TF

In [27]:
def term_frequency(document, word):
    N = len(document)
    occurance = len([token for token in document if token == word])
    return occurance / N

### inverse document frequency in the corpus-IDF

In [28]:
def inverse_document_frequency(word):
    try:
        word_occurance = word_count[word] + 1
    except:
        word_occurance = 1
    return np.log(total_docs / word_occurance)

### combining tf-idf

In [29]:
def tf_idf(sentence):
    vec = np.zeros((len(word_set),))
    for word in sentence:
        tf = term_frequency(sentence, word)
        idf = inverse_document_frequency(word)
        vec[word_index[word]] = tf * idf
    return vec

In [30]:
import numpy as np


In [31]:
vectors = []
for sent in sentences:
    vectors.append(tf_idf(sent))

print(vectors)

[array([-0.00143806, -0.00455856, -0.00455856, -0.00143806, -0.00143806,
       -0.00143806, -0.05670949, -0.00455856, -0.00862839, -0.00143806,
       -0.00143806, -0.00143806, -0.00143806, -0.00143806, -0.00143806,
       -0.00862839, -0.00143806, -0.00143806, -0.00143806, -0.00143806,
       -0.00143806, -0.00455856, -0.00143806, -0.00143806, -0.00143806,
       -0.00143806, -0.00143806, -0.00143806, -0.00143806, -0.00143806,
       -0.00862839, -0.00143806, -0.00143806, -0.00143806, -0.00143806,
       -0.00455856, -0.00143806, -0.00455856, -0.00143806, -0.00143806,
       -0.00143806, -0.00862839, -0.00143806, -0.00143806, -0.00455856,
       -0.00455856, -0.00143806, -0.00143806, -0.00143806, -0.00143806,
       -0.00143806, -0.00455856, -0.00143806, -0.00455856, -0.00143806,
       -0.00143806, -0.00143806, -0.00143806, -0.00143806, -0.00143806,
       -0.00143806, -0.00143806, -0.00143806, -0.00455856, -0.00143806,
       -0.00143806, -0.00143806, -0.00143806, -0.00143806, -0.0

#### Bu çıktı, vectors listesinin içindeki her bir TF-IDF vektörünü içerir. Her vektör, cümle içindeki 
#### her kelimenin TF-IDF skorunu içeren bir liste olarak temsil edilir. TF-IDF skorları, belirli bir cümlenin
#### içindeki kelimelerin önem sırasını belirlemeye yardımcı olan istatistiksel bir ölçümdür.