# 0. Imports

In [217]:
import pandas as pd
import numpy as np
import pickle

from collections import defaultdict

from search import Document
from lemmatization import lemmatize_column
from lemmatization import lemmatize_sentence

""" Some libraries used in imported functions: """

    # import re

    # from tqdm import tqdm
    # from joblib import Parallel, delayed

    # import nltk

    # from nltk import pos_tag
    # from nltk.corpus import stopwords, wordnet
    # from nltk.stem import WordNetLemmatizer


""" You may also need: """

    # nltk.download('averaged_perceptron_tagger')
    # nltk.download('wordnet')
    # nltk.download('stopwords')
    # nltk.download('omw-1.4')

# 1. Data reading

Let's read our datasets and take a look at them. There are two of them, so we will need a few steps to combine them.

In [152]:
quotes1 = pd.read_csv('quotes1.csv')
quotes1

Unnamed: 0,quote,author,category
0,"I'm selfish, impatient and a little insecure. ...",Marilyn Monroe,"attributed-no-source, best, life, love, mistak..."
1,You've gotta dance like there's nobody watchin...,William W. Purkey,"dance, heaven, hurt, inspirational, life, love..."
2,You know you're in love when you can't fall as...,Dr. Seuss,"attributed-no-source, dreams, love, reality, s..."
3,A friend is someone who knows all about you an...,Elbert Hubbard,"friend, friendship, knowledge, love"
4,Darkness cannot drive out darkness: only light...,"Martin Luther King Jr., A Testament of Hope: T...","darkness, drive-out, hate, inspirational, ligh..."
...,...,...,...
499704,I do believe the most important thing I can do...,John C. Stennis,"Past, Believe, Help"
499705,I'd say I'm a bit antimadridista although I do...,Isco,"Team, Humility, Know"
499706,The future is now.,Nam June Paik,Now
499707,"In all my life and in the future, I will alway...",Norodom Sihamoni,"Life, My Life, Servant"


In [153]:
quotes2 = pd.read_csv('quotes2.csv')
quotes2

Unnamed: 0,QUOTE,AUTHOR,TITLE,LIKES,TAGS
0,“My life amounts to no more than one drop in a...,David Mitchell,Cloud Atlas,4589,
1,“A half-read book is a half-finished love affa...,David Mitchell,Cloud Atlas,3181,"['love', 'reading']"
2,“Our lives are not our own. We are bound to ot...,David Mitchell,Cloud Atlas,2534,
3,"“Travel far enough, you meet yourself.”",David Mitchell,Cloud Atlas,1547,['travel']
4,"“Books don't offer real escape, but they can s...",David Mitchell,Cloud Atlas,1490,"['escape', 'reading']"
...,...,...,...,...,...
348080,"“It's going to look pretty good, then, isn't i...",Terry Pratchett,Sourcery,174,[]
348081,“It was sad music. But it waved its sadness li...,Terry Pratchett,Soul Music,173,"['humor', 'music']"
348082,“The merest accident of microgeography had mea...,Terry Pratchett,Small Gods,173,['religion']
348083,"“Look, that's why there's rules, understand? S...",Terry Pratchett,Thief of Time,173,['rules']


# 2. Data preprocessing

It would be nice to have a per-quote score that correlates with popularity and can be used in ranking. The second dataset already has a suitable column, **'LIKES'**, so we will build a comparable score for the first dataset and then concatenate the two datasets.

## First dataset

In [154]:
quotes1.sample(5)

Unnamed: 0,quote,author,category
90801,How could anyone catch all of the beauty in th...,"Cintra Wilson, Colors Insulting To Nature","beauty, life, misery"
188014,Imagination can create crazy things in a place...,"Mina Marial Nicoli, The Magic of Avalon Eyrelin","avalon-eyrelin, imagination, magic"
46925,Knowledge and education are the key to this hu...,"Christina Engela, Fearotica: An Anthology of E...","bonfire, education, fueled, hate, human-traged..."
9480,Insanity does NOT run in my family. It strolls...,"Darynda Jones, Fourth Grave Beneath My Feet",humor
315937,It is difficult to exaggerate the adverse infl...,Hypatia Bradlaugh Bonner,"equality, happiness, influence, motherhood, op..."


In [155]:
quotes1.duplicated().sum()

2442

In [156]:
quotes1.drop_duplicates(inplace=True)

In [157]:
quotes1.isna().sum()

quote          1
author      1753
category      63
dtype: int64

Only one quote contains NaN so we can simply delete it.

In [158]:
# Keep only the rows that have a quote. `.copy()` makes the result an independent
# frame, so later column assignments on quotes1 do not raise SettingWithCopyWarning.
quotes1 = quotes1[quotes1['quote'].notna()].copy()

Replace NaNs in the **'author'** column with the special value ***'unknown author'***, and NaNs in the **'category'** column with an empty string.

In [None]:
# Fill missing values per column and assign the result back.
# Calling fillna(inplace=True) on a selected column is chained assignment: it
# warns, and under pandas Copy-on-Write it does not modify quotes1 at all.
quotes1 = quotes1.fillna({'author': 'unknown author', 'category': ''})

The **'author'** column may contain both the author's name and a title (book name, etc.) separated by a comma, so we split it into two columns.

In [160]:
quotes1.sample(5)

Unnamed: 0,quote,author,category
108458,To suffer destructive consequences is to obey ...,Sunday Adelaja,"consequence, human, law, nature, obey"
104278,"In 1914, Franz Ferdinand, the Austrian imperia...","Loren D. Estleman, Gas City","assassination, history, reform, sarajevo, worl..."
31687,Why is it we have so little choice? We live li...,"Banana Yoshimoto, Kitchen","death, life, love"
268256,I'd rather you shot at tin cans in the back ya...,Harper Lee,"mockingbird, music, sin, singing"
36879,i hope thatwhoever you arewherever you areand ...,Sanober Khan,"feelings, inspirational, keep-strong, life-quo..."


In [None]:
# Split "Author, Title" on the first comma only; rows without a comma get no title.
# `n` is passed by keyword because newer pandas versions no longer accept it
# positionally. reindex() guarantees a title column even if no row contains a comma.
author_title = (quotes1['author']
                .str.split(',', n=1, expand=True)
                .reindex(columns=[0, 1]))

quotes1['author'] = author_title[0].str.strip()
quotes1['title'] = author_title[1].fillna('').str.strip()

Some values in **'author'** column are equal to ***'Anonymous'*** so let's replace them with value ***'unknown author'***.

In [162]:
(quotes1['author'] == 'Anonymous').sum()

1581

In [None]:
quotes1.loc[quotes1['author'].str.lower() == 'anonymous', 'author'] = 'unknown author'

The assumption is that how often an author, title or category occurs in the dataset is correlated with popularity. This is rough and disputable, yet good enough for our purposes.

Let's check it out with second dataset.

As a popularity measure for an author or a title we take the maximum number of likes over all of their quotes, which is roughly what we need. The drawback of this approach is that we may overestimate the number of likes for unpopular quotes by frequently occurring authors and titles, but I believe that is acceptable since we only want an approximate value.

So we can calculate the correlation between the number of quotes per author (or title) and that maximum number of likes to check our assumption.

In [164]:
# Authors: number of quotes per author and the highest like count any of them reached.
authors_occurrences = quotes2['AUTHOR'].value_counts().to_dict()
likes_per_author = quotes2.groupby('AUTHOR')['LIKES'].max().reset_index()
likes_per_author['cnt'] = [authors_occurrences[name] for name in likes_per_author['AUTHOR']]

# Titles: the same two quantities per title.
titles_occurrences = quotes2['TITLE'].value_counts().to_dict()
likes_per_title = quotes2.groupby('TITLE')['LIKES'].max().reset_index()
likes_per_title['cnt'] = [titles_occurrences[name] for name in likes_per_title['TITLE']]

In [165]:
# Correlate only the numeric columns: the frame also holds the string column
# 'AUTHOR', which makes a bare .corr() raise in recent pandas versions.
likes_per_author[['LIKES', 'cnt']].corr()

Unnamed: 0,LIKES,cnt
LIKES,1.0,0.333332
cnt,0.333332,1.0


In [166]:
# Correlate only the numeric columns: the frame also holds the string column
# 'TITLE', which makes a bare .corr() raise in recent pandas versions.
likes_per_title[['LIKES', 'cnt']].corr()

Unnamed: 0,LIKES,cnt
LIKES,1.0,0.347557
cnt,0.347557,1.0


The correlation (about 0.33–0.35) is weak to moderate, so the assumption is only roughly supported, but that is good enough for an approximate index.

Let's add column **'popularity index'** to first dataset.

In [167]:
# Lower-cased copies of the author and title columns, used as case-insensitive lookup keys.
author_lower = quotes1['author'].map(str.lower)
title_lower = quotes1['title'].map(str.lower)

In [168]:
authors_occurrences = author_lower.value_counts().to_dict()
titles_occurrences = title_lower.value_counts().to_dict()

In [169]:
def count_unique_words(data):
    """Count how often each comma-separated item occurs across all strings in `data`.

    Every string is split on ',', and each piece is stripped of surrounding
    whitespace and lower-cased before being counted. An empty string therefore
    contributes one occurrence of the empty item ''.

    Returns a dict mapping the normalised item to its number of occurrences.
    """
    tally = {}

    for line in data:
        for piece in line.split(','):
            token = piece.strip().lower()
            tally[token] = tally.get(token, 0) + 1

    return tally

In [170]:
categories_occurrences = count_unique_words(quotes1['category'].values)
sorted(categories_occurrences.items(), key=lambda item: -item[1])[:10]

[('love', 42393),
 ('life', 40514),
 ('inspirational', 28996),
 ('philosophy', 15076),
 ('humor', 14096),
 ('god', 13903),
 ('truth', 12352),
 ('wisdom', 11125),
 ('happiness', 10543),
 ('people', 10302)]

In [171]:
def count_category_index(data):
    """Sum the dataset-wide occurrence counts of every category listed in `data`.

    `data` is a comma-separated category string. Each category is stripped and
    lower-cased, then looked up in the module-level `categories_occurrences`
    mapping. A category missing from that mapping raises KeyError.
    """
    total = 0

    for raw_category in data.split(','):
        total += categories_occurrences[raw_category.strip().lower()]

    return total

In [None]:
quotes1['author_index'] = author_lower.apply(lambda x: 0 if x == 'unknown author' else authors_occurrences[x])
quotes1['title_index'] = title_lower.apply(lambda x: 0 if x == '' else titles_occurrences[x])
quotes1['category_index'] = quotes1['category'].apply(count_category_index)

In [173]:
quotes1.sample(5)

Unnamed: 0,quote,author,category,title,author_index,title_index,category_index
247575,Tears flow and smiles fade to the same rhythm ...,Rémy de Gourmont,"ephemeral, life, memories, oblivion","Philosophic Nights in Paris,: Being Selections...",26,15,42264
266738,"When you’re here, an i in the Ocean, you’re no...",Jean-Pierre Weill,"health, perspective, selfhood, stories, well-b...",The Well of Being: A Children's Book for Adults,1,1,5041
427862,I hope I'm always learning something.,Kate Winslet,"Learning, Always, Something",,36,0,3743
449570,"Happiness isn't getting what you want, it's wa...",Garth Brooks,"Want, You, Wanting",,12,0,8554
270119,the rent here may be low but i believe we have...,Jane Austen,"humility, rent, sense, sensibility",,458,0,1836


In [174]:
quotes1[['author_index', 'title_index', 'category_index']].max()

author_index       10620
title_index         3059
category_index    463522
dtype: int64

In [None]:
quotes1['popularity_index'] = (quotes1['author_index'] / quotes1['author_index'].max() + \
                               quotes1['title_index'] / quotes1['title_index'].max() + \
                               quotes1['category_index'] / quotes1['category_index'].max())

min_index = quotes1['popularity_index'].min()
quotes1['popularity_index'] = (quotes1['popularity_index'] - min_index) / (quotes1['popularity_index'].max() - min_index) * 100

## Second dataset

In [176]:
# Lower-case the second dataset's column names so they match the first dataset's.
quotes2.columns = [str.lower(name) for name in quotes2.columns]

In [177]:
quotes2.sample(5)

Unnamed: 0,quote,author,title,likes,tags
26694,"“In any case, Cide Hamete Benengeli was a very...",Miguel de Cervantes Saavedra,Don Quixote,3,['writing']
313211,“I cannot pretend that the reading I have done...,Diane Setterfield,The Thirteenth Tale,0,
327257,"“Those who had easy answers, be they on the ri...",Harlan Coben,Missing You,3,
62931,"“It's not your fault,"" Carlisle comforted me w...",Stephenie Meyer,New Moon,21,
151776,"“In the throws of depression, one reaches a st...",Andrew Solomon,The Noonday Demon: An Atlas of Depression,75,


In [178]:
(quotes2['tags'] == 'None').sum()

247757

Since most values in the **'tags'** column are ***'None'***, we drop this column. For the same reason, the similar **'category'** column of the first dataset will not be carried over into the combined dataset.

In [179]:
quotes2 = quotes2.drop(['tags'], axis=1)

In [180]:
quotes2.duplicated().sum()

0

In [181]:
quotes2.isna().sum()

quote     1
author    0
title     0
likes     0
dtype: int64

In [182]:
# Keep only the rows that have a quote. `.copy()` makes the result an independent
# frame, so the later .loc and column assignments on quotes2 do not raise
# SettingWithCopyWarning.
quotes2 = quotes2[quotes2['quote'].notna()].copy()

In [183]:
print((quotes2['title'] == 'None').sum())
print((quotes2['author'] == 'anonymous').sum())

4481
3


In [184]:
quotes2.loc[quotes2['title'] == 'None', 'title'] = ''

In [185]:
quotes2.loc[quotes2['author'].str.lower() == 'anonymous', 'author'] = 'unknown author'

In [186]:
quotes2.describe()

Unnamed: 0,likes
count,348084.0
mean,52.315631
std,603.99974
min,-1.0
25%,0.0
50%,2.0
75%,13.0
max,95186.0


In [187]:
(quotes2['likes'] == -1).sum()

36

It looks like **'-1'** value in the **'likes'** column was used for an unknown number of likes so we can replace this value with **0**.

In [188]:
quotes2.loc[quotes2['likes'] == -1, 'likes'] = 0

Many values in the **'likes'** column are close to **0** so large values are like outliers.

In [189]:
(quotes2['likes'] > 5000).sum()

360

Only a few values are greater than 5000, so we can clip them to 5000; this makes min-max scaling and the resulting distribution more usable.

In [190]:
quotes2.loc[quotes2['likes'] > 5000, 'likes'] = 5000

In [191]:
quotes2['popularity_index'] = quotes2['likes'] / quotes2['likes'].max() * 100 # min value = 0

## Datasets  concatenation

In [192]:
quotes = pd.concat([quotes1[['quote', 'author', 'title', 'popularity_index']],
                    quotes2[['quote', 'author', 'title', 'popularity_index']]])

In [193]:
quotes[['quote', 'author', 'title']].duplicated().sum()

2848

In [194]:
quotes = quotes[~quotes[['quote', 'author', 'title']].duplicated()]

In [195]:
quotes = quotes.sort_values(by='popularity_index', ascending=False).reset_index(drop=True)

In [196]:
quotes

Unnamed: 0,quote,author,title,popularity_index
0,“I took a deep breath and listened to the old ...,Sylvia Plath,The Bell Jar,100.0
1,"“You haven't got a letter on yours,"" George ob...",J.K. Rowling,Harry Potter and the Sorcerer's Stone,100.0
2,“The fact that we live at the bottom of a deep...,Douglas Adams,The Salmon of Doubt: Hitchhiking the Galaxy On...,100.0
3,“There is a theory which states that if ever a...,Douglas Adams,The Restaurant at the End of the Universe,100.0
4,"“For instance, on the planet Earth, man had al...",Douglas Adams,The Hitchhiker's Guide to the Galaxy,100.0
...,...,...,...,...
842497,"“History is written by victors,” said Duko. “B...",Raymond E. Feist,Shards of a Broken Crown,0.0
842498,“Never offer an oath lightly. For you pledge n...,Raymond E. Feist,King of Foxes,0.0
842499,“It’s tempting to think of yourself as powerfu...,Raymond E. Feist,Shards of a Broken Crown,0.0
842500,"“A man makes choices,” Tal said.\n“True, but w...",Raymond E. Feist,King of Foxes,0.0


# 3. Lemmatization

In [198]:
quote_lemmatized = lemmatize_column(quotes, 'quote')
quotes['quote_lemmatized'] = quote_lemmatized

100%|██████████| 843/843 [06:42<00:00,  2.09it/s]


In [199]:
author_lemmatized = lemmatize_column(quotes, 'author')
quotes['author_lemmatized'] = author_lemmatized

100%|██████████| 843/843 [00:56<00:00, 14.81it/s]


In [200]:
title_lemmatized = lemmatize_column(quotes, 'title')
quotes['title_lemmatized'] = title_lemmatized

100%|██████████| 843/843 [00:51<00:00, 16.41it/s]


In [213]:
quotes.sample(5)

Unnamed: 0,quote,author,title,popularity_index,quote_lemmatized,author_lemmatized,title_lemmatized
250698,"You erased my famine, unpicked my angerYour en...",Jalaluddin Rumi,Words of Paradise: Selected Poems,1.426038,erase famine unpicked angeryour energy charge ...,jalaluddin rumi,word paradise select poem
586814,I'm not intimidated by anyone. Everyone is mad...,Josephine Baker,,0.12537,intimidate anyone everyone make two arm two le...,josephine baker,
777006,“A new Dragon’s Egg?” asked”,Melissa de la Cruz,Return to the Isle of the Lost,0.0,new dragon egg ask,melissa de la cruz,return isle lose
281824,Each day of our lives we make deposits in the ...,Charles R. Swindoll,,1.179047,day life make deposit memory bank child,charles r swindoll,
132190,What makes and experience a memory is when we ...,Jeremy Aldana,Boys Will Be Boys,4.027288,make experience memory share someone emotion felt,jeremy aldana,boy boys


# 4. Index building

In [215]:
# Forward index: one Document per row, positioned by the row's index label
# (quotes was re-indexed 0..N-1 after sorting by popularity).
# itertuples() is used instead of iterrows() because iterrows() builds a pandas
# Series for every row, which is very slow on a frame of this size.
index = [
    Document(row.Index,
             row.quote,
             row.author,
             row.title,
             row.quote_lemmatized,
             row.author_lemmatized,
             row.title_lemmatized)
    for row in quotes.itertuples()
]

In [218]:
with open('index.pickle', 'wb') as f:
    pickle.dump(index, f)

# 5. Inverted indices building

In [219]:
def build_inverted_index(data):
    """Build an inverted index from a column of whitespace-separated terms.

    `data` must expose `.values` (e.g. a pandas Series of strings). Each distinct
    term of an entry is mapped to the entry's position within `data.values`, so a
    term that is repeated inside one entry is recorded only once for it.

    Returns a defaultdict(list) mapping each term to its ascending list of positions.
    """
    postings = defaultdict(list)

    for position, text in enumerate(data.values):
        distinct_terms = set(text.split())
        for term in distinct_terms:
            postings[term].append(position)

    return postings

In [220]:
quotes_inv_index = build_inverted_index(quotes['quote_lemmatized'])

In [221]:
titles_inv_index = build_inverted_index(quotes['title_lemmatized'])

In [223]:
with open('quotes_inv_index.pickle', 'wb') as f:
    pickle.dump(quotes_inv_index, f)

In [224]:
with open('titles_inv_index.pickle', 'wb') as f:
    pickle.dump(titles_inv_index, f)

# 6. **Score** func estimation

Let's implement prototypes of the functions that will be used in the search engine and simulate its work in order to select and evaluate the **score** function.

The **score** function is based on the simplifying assumption that all terms in the query are independent, so their scores are additive:

$score(q, doc) = \sum\limits_{t \in q} score(t, doc)$, where $q$ is the query and $t$ is a term.

The first version of **score** calculates the **tf_idf** value of the lemmatized query terms against the quote text only.

In [226]:
DOCS_NUMBER = quotes.shape[0]

In [228]:
def retrieve(query):
    """Return up to 300 candidate documents that match every term of `query`.

    The query is lemmatized and split into terms. A document is a candidate when
    all terms occur in its lemmatized quote, or all terms occur in its lemmatized
    title. Returns a list of entries from the module-level `index`; an empty list
    when the lemmatized query has no terms.
    """
    keywords = lemmatize_sentence(query).split()

    if not keywords:
        return []

    def docs_matching_all(inv_index):
        # .get() instead of [] so that unknown terms neither insert empty posting
        # lists into a defaultdict index nor raise KeyError on a plain dict.
        docs = set(inv_index.get(keywords[0], ()))
        for word in keywords[1:]:
            docs = docs.intersection(set(inv_index.get(word, ())))
        return docs

    docs_by_quotes = docs_matching_all(quotes_inv_index)
    docs_by_titles = docs_matching_all(titles_inv_index)

    # Truncate the ids first so that at most 300 documents are looked up.
    # NOTE(review): set iteration order is arbitrary, so which 300 candidates
    # survive the cut is not ranked in any way.
    candidate_ids = list(docs_by_titles.union(docs_by_quotes))[:300]

    return [index[i] for i in candidate_ids]

In [732]:
def score(query, document):
    """Score `document` against `query` with tf-idf over the quote text only.

    The query is lemmatized and split into terms; the result is the sum over the
    terms of tf * log(DOCS_NUMBER / df), where tf is the term's count in
    `document.quote_lemmatized` and df is the length of the term's posting list
    in the module-level `quotes_inv_index`.
    """
    doc_quote_words = document.quote_lemmatized.split()

    def score_by_term(term):
        tf = doc_quote_words.count(term)
        # .get() avoids inserting empty posting lists into the defaultdict index.
        df = len(quotes_inv_index.get(term, ()))

        # A term that is absent from this quote, or from every quote, contributes
        # nothing. The df check also prevents a ZeroDivisionError for documents
        # that were retrieved through their title only.
        if tf == 0 or df == 0:
            return 0.0

        return tf * np.log(DOCS_NUMBER / df)

    terms = lemmatize_sentence(query).split()

    return sum(score_by_term(term) for term in terms)

In [230]:
def search(query):
    """Retrieve the candidate documents for `query` and rank them by score.

    Returns a list of (document, score) pairs ordered from the highest score to
    the lowest; documents with equal scores keep their retrieval order.
    """
    candidates = retrieve(query)
    scored = [(candidate, score(query, candidate)) for candidate in candidates]

    return sorted(scored, key=lambda pair: pair[1], reverse=True)

We will use mean reciprocal rank ([**MRR**](https://en.wikipedia.org/wiki/Mean_reciprocal_rank "see more on wiki")) to evaluate the **score** function. For this I chose some quotes and, for each one, wrote a query that this quote should answer (in my humble opinion). Some of the queries contain words from the title. The **mrr** function calculates the reciprocal rank for each query and returns the mean value over all queries.

In [716]:
def mrr(queries):
    """Mean reciprocal rank of the expected documents over `queries`.

    `queries` is a sequence of (query, doc_id) pairs. For each pair the query is
    run through `search`, and the reciprocal of the 1-based position of the
    document whose `id` equals `doc_id` is added; a query whose expected document
    is not in the results adds 0.

    Returns the mean over all queries, or 0.0 when `queries` is empty.
    """
    if not queries:
        # Avoid dividing by zero when there is nothing to evaluate.
        return 0.0

    total = 0.0

    for query, doc_id in queries:
        # A single pass over the ranking finds the first matching position, if any.
        rank = next(
            (position
             for position, (doc, _) in enumerate(search(query), start=1)
             if doc.id == doc_id),
            None,
        )

        if rank is not None:
            total += 1 / rank

    return total / len(queries)

In [613]:
quotes.iloc[[i for i in range(0, int(DOCS_NUMBER / 4), int(DOCS_NUMBER / 100))]]

Unnamed: 0,quote,author,title,popularity_index,quote_lemmatized,author_lemmatized,title_lemmatized
0,“I took a deep breath and listened to the old ...,Sylvia Plath,The Bell Jar,100.0,take deep breath listen old brag heart,sylvia plath,bell jar
8425,Let the moment be memorable.,Lailah Gifty Akita,,44.053126,let moment memorable,lailah gifty akita,
16850,Christ wants to give you hope for the future. ...,Billy Graham,Billy Graham in Quotes,33.840238,christ want give hope future want learn mean w...,billy graham,billy graham quote
25275,"“When her body first hit the net, all I regist...",Veronica Roth,Allegiant,24.58,body first hit net register gray blur pull acr...,veronica roth,allegiant
33700,Cemeteries are full of unfulfilled dreams... c...,Steve Maraboli,Unapologetically You: Reflections on Life and ...,13.993505,cemetery full unfulfilled dream countless echo...,steve maraboli,unapologetically reflections life human experi...
42125,"“I was glad I wasn't in love, that I wasn't ha...",Charles Bukowski,Women,10.3,glad love happy world like odds everything peo...,charles bukowski,woman
50550,"Endeavor to live a simple life, but filled wit...",Auliq-Ice,,8.707614,endeavor live simple life fill complex love,auliq ice,
58975,"​You can't Believe in Yourself, till People Be...",Vineet Raj Kapoor,,7.570749,believe till people believe,vineet raj kapoor,
67400,Do you not see how necessary a world of pains ...,John Keats,Letters of John Keats,6.595342,see necessary world pain troubles school intel...,john keats,letter john keats
75825,Do you wonder why we wander?” Cal had asked.It...,David Levithan,Are We There Yet?,5.91686,wonder wander cal ask night first snow could h...,david levithan,yet


In [717]:
queries = [ ('brag of my heart', 0),
            ('Let the moment be memorable.', 8425),
            ('Christ and eternal life begins', 16850),
            ('first jump', 25275),
            ('cemeteries echoes', 33700),
            ('people in love are dangerous', 42125),
            ('simple life but complex love', 50550),
            ('believe in yourself till people believe in you', 58975),
            ('a world of pains and troubles is to school an intelligence', 67400),
            ('Do you wonder why we wander?', 75825),
            ('know what is about overall', 84250),
            ('enmity', 92675),
            ('love Prozac', 101100),
            ('team of one', 109525),
            ('fall into melancholies', 117950),
            ('A Memoir of Promiscuity', 126375),
            ('I am not everybody, I am single', 134800),
            ('more treasure in people than in mines', 143225),
            ('The Art of Racing in the Rain', 151650),
            ('successes and setbacks', 160075),
            ('beat generation', 168500),
            ('A Farewell to Arms', 176925),
            ('The Virtue of Selfishness', 185350),
            ('Sloane about sickly creatures', 193775),
            ('promotion of knowledge', 202200)
            ]

In [733]:
mrr(queries)

0.2419157402179967

Now let's modify **score** by adding the **tf_idf** of the title and slightly changing the formula (the document frequency is now smoothed by adding 1).
Here I picked the best weights for **tf_idf_quote** and **tf_idf_title**.

In [844]:
def score(query, document, quote_weight=0.3, title_weight=0.7):
    """Score `document` against `query` with a weighted sum of quote and title tf-idf.

    For every lemmatized query term, tf * log(DOCS_NUMBER / (1 + df)) is computed
    separately for the quote and for the title, using the module-level
    `quotes_inv_index` and `titles_inv_index` for df, and the two values are
    combined as quote_weight * quote + title_weight * title.

    Parameters:
        query: raw query string; it is lemmatized here.
        document: object exposing `quote_lemmatized` and `title_lemmatized` strings.
        quote_weight: weight of the quote tf-idf (default 0.3).
        title_weight: weight of the title tf-idf (default 0.7).

    Returns the sum of the per-term scores.
    """
    doc_quote_words = document.quote_lemmatized.split()
    doc_title_words = document.title_lemmatized.split()

    def tf_idf(tf, df):
        return tf * np.log(DOCS_NUMBER / (1 + df))

    def score_by_term(term):
        # .get() avoids inserting empty posting lists into the defaultdict indices.
        df_quote = len(quotes_inv_index.get(term, ()))
        df_title = len(titles_inv_index.get(term, ()))

        tf_idf_quote = tf_idf(doc_quote_words.count(term), df_quote)
        tf_idf_title = tf_idf(doc_title_words.count(term), df_title)

        return tf_idf_quote * quote_weight + tf_idf_title * title_weight

    terms = lemmatize_sentence(query).split()

    return sum(score_by_term(term) for term in terms)

In [843]:
mrr(queries)

0.27007086167800454

We have slightly improved the result.

Now let's modify our formula again. We can add smoothing (saturation) of the term frequency and start taking the length of documents into account. This idea is taken from the [**BM25**](https://en.wikipedia.org/wiki/Okapi_BM25 "see more on wiki") function.

I picked the best values for the parameters **k** and **b**, as well as for the weights of **quote_score** and **title_score**.

In [903]:
avg_words_quote = quotes['quote_lemmatized'].apply(lambda x: len(x.split())).mean()
avg_words_title = quotes['title_lemmatized'].apply(lambda x: len(x.split())).mean()

In [932]:
def score(query, document, k=5, b=0.7, quote_weight=0.4, title_weight=0.6):
    """Score `document` against `query` with a BM25-style formula over quote and title.

    For every lemmatized query term and for each field (quote, title) the value

        tf * (k + 1) / (k * (1 - b + b * field_len / avg_field_len) + tf)
            * log(DOCS_NUMBER / (1 + df))

    is computed, and the two field values are combined as
    quote_weight * quote + title_weight * title. Average field lengths come from
    the module-level `avg_words_quote` / `avg_words_title`, and df from
    `quotes_inv_index` / `titles_inv_index`.

    Parameters:
        query: raw query string; it is lemmatized here.
        document: object exposing `quote_lemmatized` and `title_lemmatized` strings.
        k: term-frequency saturation parameter (default 5).
        b: strength of the length normalisation (default 0.7).
        quote_weight: weight of the quote score (default 0.4).
        title_weight: weight of the title score (default 0.6).

    Returns the sum of the per-term scores.
    """
    doc_quote_words = document.quote_lemmatized.split()
    doc_title_words = document.title_lemmatized.split()

    # The length normalisation depends only on the document, so compute it once.
    quote_norm = k * (1 - b + b * len(doc_quote_words) / avg_words_quote)
    title_norm = k * (1 - b + b * len(doc_title_words) / avg_words_title)

    def field_score(tf, norm, df):
        if tf == 0:
            # An absent term contributes nothing; this also avoids 0 / 0 when the
            # normaliser is zero (e.g. empty title with b == 1, or k == 0).
            return 0.0
        return ((tf * (k + 1)) / (norm + tf)) * np.log(DOCS_NUMBER / (1 + df))

    def score_by_term(term):
        # .get() avoids inserting empty posting lists into the defaultdict indices.
        df_quote = len(quotes_inv_index.get(term, ()))
        df_title = len(titles_inv_index.get(term, ()))

        quote_score = field_score(doc_quote_words.count(term), quote_norm, df_quote)
        title_score = field_score(doc_title_words.count(term), title_norm, df_title)

        return quote_score * quote_weight + title_score * title_weight

    terms = lemmatize_sentence(query).split()

    return sum(score_by_term(term) for term in terms)

In [933]:
mrr(queries)

0.6198160841732271

Wow! Not a bad improvement. We will use this implementation of function **score** in our pipeline.