# BM25 : 以 IMDB dataset 為例
- Author: Lynn
- Created: 2020/11/2
- Dataset: 
    - IMDB data from 2006 to 2016
    - A data set of 1,000 popular movies on IMDB in the last 10 years
    - https://www.kaggle.com/PromptCloudHQ/imdb-data
    

### 載入資料集

In [None]:
import pandas as pd

# Google Drive has to be connected first: the CSV below is read from a path
# underneath the mounted drive.
csv = '/content/drive/MyDrive/shared_folder/dataset/imdb/IMDB-Movie-Data.csv'
df = pd.read_csv(filepath_or_buffer=csv)
# Preview the first five rows of the loaded table.
df.head(n=5)

Unnamed: 0,Rank,Title,Genre,Description,Director,Actors,Year,Runtime (Minutes),Rating,Votes,Revenue (Millions),Metascore
0,1,Guardians of the Galaxy,"Action,Adventure,Sci-Fi",A group of intergalactic criminals are forced ...,James Gunn,"Chris Pratt, Vin Diesel, Bradley Cooper, Zoe S...",2014,121,8.1,757074,333.13,76.0
1,2,Prometheus,"Adventure,Mystery,Sci-Fi","Following clues to the origin of mankind, a te...",Ridley Scott,"Noomi Rapace, Logan Marshall-Green, Michael Fa...",2012,124,7.0,485820,126.46,65.0
2,3,Split,"Horror,Thriller",Three girls are kidnapped by a man with a diag...,M. Night Shyamalan,"James McAvoy, Anya Taylor-Joy, Haley Lu Richar...",2016,117,7.3,157606,138.12,62.0
3,4,Sing,"Animation,Comedy,Family","In a city of humanoid animals, a hustling thea...",Christophe Lourdelet,"Matthew McConaughey,Reese Witherspoon, Seth Ma...",2016,108,7.2,60545,270.32,59.0
4,5,Suicide Squad,"Action,Adventure,Fantasy",A secret government agency recruits some of th...,David Ayer,"Will Smith, Jared Leto, Margot Robbie, Viola D...",2016,123,6.2,393727,325.02,40.0


### 前處理

In [None]:
import pandas as pd
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from gensim import corpora
from gensim.summarization import bm25

In [None]:
# Print the names of all columns in the loaded DataFrame, to see which
# fields (e.g. 'Title', 'Description') are available for indexing below.
print(df.columns)

Index(['Rank', 'Title', 'Genre', 'Description', 'Director', 'Actors', 'Year',
       'Runtime (Minutes)', 'Rating', 'Votes', 'Revenue (Millions)',
       'Metascore'],
      dtype='object')


In [None]:
# Pull the plot descriptions and the movie titles out as two parallel lists;
# position i in `docs` belongs to position i in `titles`.
docs, titles = (df[column].tolist() for column in ('Description', 'Title'))

print(len(docs))

# Baseline for comparison with BM25: count the descriptions that contain the
# raw query string as a (case-sensitive) substring.
query = 'robot'
print(sum(1 for description in docs if query in description))

1000
5


In [None]:
# Tokenise every description with gensim's simple_preprocess and drop the
# tokens that appear in gensim's STOPWORDS set.
processed_docs = [
    [word for word in simple_preprocess(description) if word not in STOPWORDS]
    for description in docs
]
print(len(processed_docs))

# Spot-check one document: its title followed by its remaining tokens.
print(titles[10], processed_docs[10], sep='\n')


1000
Fantastic Beasts and Where to Find Them
['adventures', 'writer', 'newt', 'scamander', 'new', 'york', 'secret', 'community', 'witches', 'wizards', 'seventy', 'years', 'harry', 'potter', 'reads', 'book', 'school']


### 建立模型

In [None]:
# Build the BM25 index over the tokenised descriptions.
model = bm25.BM25(processed_docs)

# Mean IDF over every term in the model's idf table; it is passed to
# get_scores() further down.
average_idf = sum(float(idf_value) for idf_value in model.idf.values()) / len(model.idf)
print(average_idf)

6.08925130050923


In [None]:
# Explicit-loop form of applying the lambda to every term:
#   map(lambda k: float(model.idf[k]), model.idf.keys())
x = []
for k, idf_value in model.idf.items():
    x.append(float(idf_value))

# Average of the collected IDF values (one entry per term in model.idf).
average_idf = sum(x) / len(x)
print(average_idf)

6.08925130050923


### Top 1 of ranking

In [None]:
# The query is given as a list of tokens, the same shape as one entry of
# processed_docs.
query = ['robot']

# One BM25 score per document, in the same order as `titles`.
# NOTE(review): the two-argument get_scores(document, average_idf) call and
# the gensim.summarization module itself depend on the installed gensim
# release -- confirm the pinned version before re-running.
scores = model.get_scores(query, average_idf)

# Position of the highest score; list.index returns the first one on ties.
idx = scores.index(max(scores))
# The value printed below is the movie's title (titles[idx]), not its id,
# so the label says "title".
print('movie title of top1 : ')
print(titles[idx])

[0, 0, 0, 0, 0]
movie id of top1 : 
Real Steel


### Top 10 of ranking

In [None]:
# Pair every title with its BM25 score, one row per movie, in dataset order.
df2 = pd.DataFrame(
    data=[(title, score) for title, score in zip(titles, scores)],
    columns=['Title', 'Score'],
)
df2

Unnamed: 0,Title,Score
0,Guardians of the Galaxy,0.0
1,Prometheus,0.0
2,Split,0.0
3,Sing,0.0
4,Suicide Squad,0.0
...,...,...
995,Secret in Their Eyes,0.0
996,Hostel: Part II,0.0
997,Step Up 2: The Streets,0.0
998,Search Party,0.0


In [None]:
# Rank the movies from highest to lowest BM25 score; the original row labels
# are kept, so they still identify each movie's position in the dataset.
df3 = df2.sort_values('Score', ascending=False)
df3

Unnamed: 0,Title,Score
892,Real Steel,8.114812
634,WALL·E,5.350594
517,Chappie,4.760426
448,Big Hero 6,4.632680
904,RoboCop,4.396710
...,...,...
338,The Fault in Our Stars,0.000000
339,Blended,0.000000
340,Fast & Furious,0.000000
341,Looper,0.000000


In [None]:
# First ten rows of the score-sorted frame, i.e. the ten best-ranked movies.
df3.iloc[:10]

Unnamed: 0,Title,Score
0,Guardians of the Galaxy,0
671,Child 44,0
658,Eastern Promises,0
659,The Daughter,0
660,Pineapple Express,0
661,The First Time,0
662,Gone Baby Gone,0
663,The Heat,0
664,L'avenir,0
665,Anna Karenina,0
