#### 案例1

提前安装：python -m pip install elasticsearch

官网Doc链接：https://elasticsearch-py.readthedocs.io/en/master/index.html

In [1]:
# Official Python client for Elasticsearch.
from elasticsearch import Elasticsearch

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): presumably aimed at the TLS warnings triggered by connecting with
# verify_certs=False, but this blanket filter also hides deprecation warnings —
# consider a narrower filter.
warnings.filterwarnings("ignore")

In [2]:
# 1. Connect to Elasticsearch
#
# Connection settings come from the environment so that no credential is stored
# in the notebook:
#   ES_URL       cluster URL (defaults to the local HTTPS endpoint)
#   ES_USER      user name (defaults to "elastic")
#   ES_PASSWORD  password; prompted for interactively when unset
#   ES_CA_CERTS  optional path to the cluster's CA certificate
import os
from getpass import getpass

ES_URL = os.environ.get("ES_URL", "https://localhost:9200")
ES_USER = os.environ.get("ES_USER", "elastic")
ES_PASSWORD = os.environ.get("ES_PASSWORD") or getpass("Elasticsearch password: ")
ES_CA_CERTS = os.environ.get("ES_CA_CERTS")

# With a CA bundle the server certificate is verified; without one we fall back
# to the unverified connection, which is only acceptable for a local test node.
tls_options = {"ca_certs": ES_CA_CERTS} if ES_CA_CERTS else {"verify_certs": False}

es = Elasticsearch(ES_URL, basic_auth=(ES_USER, ES_PASSWORD), **tls_options)

In [3]:
# Sanity-check the connection by requesting basic cluster information.
es.info()

ObjectApiResponse({'name': 'tangguoliangdeMacBook-Pro.local', 'cluster_name': 'elasticsearch', 'cluster_uuid': 'POMjtjsMQ92qPnb_m-Uv5w', 'version': {'number': '8.4.1', 'build_flavor': 'default', 'build_type': 'tar', 'build_hash': '2bd229c8e56650b42e40992322a76e7914258f0c', 'build_date': '2022-08-26T12:11:43.232597118Z', 'build_snapshot': False, 'lucene_version': '9.3.0', 'minimum_wire_compatibility_version': '7.17.0', 'minimum_index_compatibility_version': '7.0.0'}, 'tagline': 'You Know, for Search'})

In [4]:
# 2. Load the data
import pandas as pd

# Read the movie-plots CSV; the path is relative to the notebook's working directory.
df = pd.read_csv("wiki_movie_plots_deduped.csv")

In [5]:
# (rows, columns) of the dataset as loaded.
df.shape

(34886, 8)

In [6]:
# Peek at five randomly chosen rows (no seed is given, so the rows differ between runs).
df.sample(5)

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
9474,1978,The Driver,American,Walter Hill,"Ryan O'Neal, Bruce Dern, Isabelle Adjani",crime drama,https://en.wikipedia.org/wiki/The_Driver,The Driver (Ryan O'Neal) - real name unknown -...
14599,2005,The Ballad of Jack and Rose,American,Rebecca Miller,"Daniel Day-Lewis, Camilla Belle",drama,https://en.wikipedia.org/wiki/The_Ballad_of_Ja...,"Jack Slavin, a Scottish farmer with a heart ai..."
19998,1969,Some Girls Do,British,Ralph Thomas,"Richard Johnson, Daliah Lavi",spy,https://en.wikipedia.org/wiki/Some_Girls_Do,A series of inexplicable accidents befall the ...
9244,1976,Gator,American,Burt Reynolds,"Burt Reynolds, Lauren Hutton, Jack Weston",action,https://en.wikipedia.org/wiki/Gator_(film),"Following the events of White Lightning, Gator..."
21331,2013,The Selfish Giant,British,Director: Clio Barnard,Director: Clio Barnard\r\nCast: Conner Chapman...,unknown,https://en.wikipedia.org/wiki/The_Selfish_Gian...,The film follows the lives of Arbor and Swifty...


In [7]:
# Drop every row that contains a null value; per the original author's note,
# writing such rows to Elasticsearch raises an error.
# Rebinding the returned frame replaces the inplace=True mutation.
df = df.dropna()

In [8]:
# (rows, columns) after the rows with null values were dropped.
df.shape

(33464, 8)

In [9]:
# Randomly pick 100 rows for testing; the fixed random_state makes the sample reproducible.

df = df.sample(100, random_state=666)

In [10]:
# First rows of the 100-row sample.
df.head()

Unnamed: 0,Release Year,Title,Origin/Ethnicity,Director,Cast,Genre,Wiki Page,Plot
3446,1942,This Above All,American,Anatole Litvak,"Tyrone Power, Joan Fontaine",romance,https://en.wikipedia.org/wiki/This_Above_All_(...,On the day that France surrenders to Nazi Germ...
2868,1940,Lucky Partners,American,Lewis Milestone,"Ronald Colman, Ginger Rogers, Jack Carson",romantic comedy,https://en.wikipedia.org/wiki/Lucky_Partners,Portrait painter and caricaturist David Grant ...
23514,2004,2046,Hong Kong,Wong Kar-wai,"Tony Leung Chiu Wai, Zhang Ziyi, Faye Wong",sci-fi romance,https://en.wikipedia.org/wiki/2046_(film),The film is the third chapter of a shared stor...
17938,2017,Ali's Wedding,Australian,Jeffrey Walker,"Osamah Sami, Don Hany, Helana Sawires",drama,https://en.wikipedia.org/wiki/Ali%27s_Wedding,After a reckless lie sets off a catastrophic c...
14023,2002,Chicago,American,Rob Marshall,"Renée Zellweger, Catherine Zeta-Jones, Richard...",musical,https://en.wikipedia.org/wiki/Chicago_(2002_film),"In 1924, Roxie Hart sees star Velma Kelly perf..."


In [11]:
# 2. Create the index

# (1) Define the mappings: six full-text fields, followed by the integer year
#     and the exact-value (keyword) wiki_page URL.
text_fields = ("title", "ethnicity", "director", "cast", "genre", "plot")
mappings = {
    "properties": {
        **{name: {"type": "text"} for name in text_fields},
        "year": {"type": "integer"},
        "wiki_page": {"type": "keyword"},
    }
}

# (2) Drop any existing "movies" index so the cell can be re-run, then create it.
if es.indices.exists(index="movies"):
    es.indices.delete(index="movies")

es.indices.create(index="movies", mappings=mappings)

ObjectApiResponse({'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'})

In [12]:
# 3. Insert data
import time

# Method 1: .index() writes one document per request.
# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; the wall clock returned by time.time() can jump.
start = time.perf_counter()

for i, row in df.iterrows():
    # Map the CSV column names onto the field names declared in the index mappings.
    doc = {
        "title": row["Title"],
        "ethnicity": row["Origin/Ethnicity"],
        "director": row["Director"],
        "cast": row["Cast"],
        "genre": row["Genre"],
        "plot": row["Plot"],
        "year": row["Release Year"],
        "wiki_page": row["Wiki Page"]
    }

    # The DataFrame index label is used as the document id.
    es.index(index="movies", id=i, document=doc)

print("总耗时：", time.perf_counter() - start)

总耗时： 7.354357719421387


In [13]:
# Method 2: helpers.bulk() writes many documents in one call

# Empty the index first so the data is inserted from scratch.
if es.indices.exists(index="movies"):
    es.indices.delete(index="movies")
es.indices.create(index="movies", mappings=mappings)

from elasticsearch.helpers import bulk

# perf_counter() is a monotonic, high-resolution clock intended for measuring
# elapsed time; that matters for a run this short.
start = time.perf_counter()

# One action per row: target index, document id (the DataFrame index label)
# and the document itself under "_source".
bulk_data = [
    {
        "_index": "movies",
        "_id": i,
        "_source": {
            "title": row["Title"],
            "ethnicity": row["Origin/Ethnicity"],
            "director": row["Director"],
            "cast": row["Cast"],
            "genre": row["Genre"],
            "plot": row["Plot"],
            "year": row["Release Year"],
            "wiki_page": row["Wiki Page"]
        }
    }
    for i, row in df.iterrows()
]

# Write the whole batch to Elasticsearch.
bulk(es, bulk_data)

print("总耗时：", time.perf_counter() - start)

总耗时： 0.09152603149414062


In [14]:
# 4. Check how many documents the index holds

es.indices.refresh(index="movies") # refresh the index before counting

es.cat.count(index="movies", format="json") # document count, returned as JSON

ListApiResponse([{'epoch': '1665388213', 'timestamp': '07:50:13', 'count': '100'}])

In [15]:
# 5. match_all: query that matches every document

body = {
    "match_all": {}
}

result = es.search(index="movies", query=body)

print(result)

{'took': 3, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 100, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '3446', '_score': 1.0, '_source': {'title': 'This Above All', 'ethnicity': 'American', 'director': 'Anatole Litvak', 'cast': 'Tyrone Power, Joan Fontaine', 'genre': 'romance', 'plot': 'On the day that France surrenders to Nazi Germany in 1940, Prudence "Pru" Cathaway (Joan Fontaine) a strong-willed young woman from the upper class, joins the Women\'s Auxiliary Air Force (WAAF), a military organisation linked to the Royal Air Force, to her family\'s surprise. Her aunt Iris and uncle Wilbur disapprove since she has chosen to serve as a private rather than as an officer. However, family butler Parsons privately expresses his support.\r\nShe goes off to training camp, where she makes friends with fellow WAAF Violet Worthing. As a favor to Violet, Prudence agrees to go on a double date 

In [16]:
# 5. term query

# term is meant for exact-value matching: numbers, dates, booleans, or strings that
# are not analyzed (not split into tokens) — here the keyword-typed wiki_page field.

body = {
    "term": {
        "wiki_page": 'https://en.wikipedia.org/wiki/Legally_Blonde_2:_Red,_White_%26_Blonde'
    }
}

result = es.search(index="movies", query=body)

print(result)

{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 4.2096553, 'hits': [{'_index': 'movies', '_id': '14299', '_score': 4.2096553, '_source': {'title': 'Legally Blonde 2: Red, White & Blonde', 'ethnicity': 'American', 'director': 'Charles Herman-Wurmfeld', 'cast': 'Reese Witherspoon, Sally Field, Luke Wilson, Bob Newhart', 'genre': 'comedy', 'plot': 'Elle Woods wants her Chihuahua, Bruiser, to reunite with his mother, because she would like Bruiser\'s mom to attend Elle and Emmett\'s wedding. Elle hires a detective to find Bruiser\'s mother, only to discover that the owner of her dog\'s mother is C\'est Magnifique, a cosmetics company that uses Bruiser\'s mother for "testing". She finds out that her law firm represents the C\'est Magnifique Corporation.\r\nElle decides to leave Boston, where she and Bruiser have settled with her fiancé Emmett, and go to Washington, D.C., to wo

In [17]:
# 6. terms query

# terms is like term but accepts a list of values; a document matches when its
# field equals any one of them (here: year is 2004 or 2002).

body = {
    "terms": {
        "year": [2004, 2002]
    }
}

result = es.search(index="movies", query=body)

print(result)

{'took': 3, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '23514', '_score': 1.0, '_source': {'title': '2046', 'ethnicity': 'Hong Kong', 'director': 'Wong Kar-wai', 'cast': 'Tony Leung Chiu Wai, Zhang Ziyi, Faye Wong', 'genre': 'sci-fi romance', 'plot': 'The film is the third chapter of a shared story that began with Days of Being Wild and continued with In the Mood for Love.\r\nThere are four main story arcs to the film. Three are about the relations of Chow with women that he meets after losing Su Li-zhen. The first concerns Chow and Wang Jing-wen, the second is about Chow and Bai Ling, and the third is about Chow and a different woman who is also named Su Li-zhen. The fourth takes place in Chow\'s mysterious world of 2046 and concerns a Japanese passenger falling in love with a gynoid. Typical of Wong Kar-wai films, the arcs are presente

In [18]:
# 7. range
#
# Select the documents whose field value lies inside a given range. Operators:
#   gt  : greater than
#   gte : greater than or equal to
#   lt  : less than
#   lte : less than or equal to
body = {"range": {"year": {"gt": 2000}}}

result = es.search(index="movies", query=body)

print(result)

{'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 35, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '23514', '_score': 1.0, '_source': {'title': '2046', 'ethnicity': 'Hong Kong', 'director': 'Wong Kar-wai', 'cast': 'Tony Leung Chiu Wai, Zhang Ziyi, Faye Wong', 'genre': 'sci-fi romance', 'plot': 'The film is the third chapter of a shared story that began with Days of Being Wild and continued with In the Mood for Love.\r\nThere are four main story arcs to the film. Three are about the relations of Chow with women that he meets after losing Su Li-zhen. The first concerns Chow and Wang Jing-wen, the second is about Chow and Bai Ling, and the third is about Chow and a different woman who is also named Su Li-zhen. The fourth takes place in Chow\'s mysterious world of 2046 and concerns a Japanese passenger falling in love with a gynoid. Typical of Wong Kar-wai films, the arcs are present

In [19]:
# 8. exists

# Matches documents that have an indexed value for the given field — the counterpart
# of SQL's IS NOT NULL. For the opposite (IS NULL), wrap exists in a bool query's
# must_not clause; the standalone "missing" query of old Elasticsearch releases is not
# available on the 8.x cluster used here.

body = {
    "exists": {
        "field": "cast"
    }
}

result = es.search(index="movies", query=body)

print(result)

{'took': 4, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 100, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '3446', '_score': 1.0, '_source': {'title': 'This Above All', 'ethnicity': 'American', 'director': 'Anatole Litvak', 'cast': 'Tyrone Power, Joan Fontaine', 'genre': 'romance', 'plot': 'On the day that France surrenders to Nazi Germany in 1940, Prudence "Pru" Cathaway (Joan Fontaine) a strong-willed young woman from the upper class, joins the Women\'s Auxiliary Air Force (WAAF), a military organisation linked to the Royal Air Force, to her family\'s surprise. Her aunt Iris and uncle Wilbur disapprove since she has chosen to serve as a private rather than as an officer. However, family butler Parsons privately expresses his support.\r\nShe goes off to training camp, where she makes friends with fellow WAAF Violet Worthing. As a favor to Violet, Prudence agrees to go on a double date 

In [20]:
# 9. bool query
#
# Combines the results of several clauses with boolean logic:
#   must     : every clause has to match (AND)
#   must_not : no clause may match (NOT)
#   should   : at least one clause matches (OR)

# Example 1: year is one of the listed values, narrowed by a genre match
# placed in the "filter" clause.
body = {
    "bool": {
        "must": {"terms": {"year": [2004, 2002, 1984, 2017, 2005]}},
        "filter": {"match": {"genre": "romantic"}},
    }
}

result = es.search(index="movies", query=body)

print(result)

{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 1, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '22502', '_score': 1.0, '_source': {'title': 'Springtime in a Small Town', 'ethnicity': 'Chinese', 'director': 'Tian Zhuangzhuang', 'cast': 'Hu Jingfan,\r\nWu Jun,\r\nXin Baiqing', 'genre': 'romantic drama', 'plot': "The film follows Fei Mu's original fairly closely. Zhang Zhichen (Xin Baiqing), a city doctor, comes to visit his old friend from school Dai Liyan (Wu Jun) shortly after the war against the Japanese has ended. Dai is sickly although Zhang suspects it to be mainly a case of hypochondria. While visiting, he meets Liyan's wife, Yuwen (Hu Jingfan) and Liyan's young teenage sister Dai Xiu (Lu Sisi).\r\nZhang and Yuwen has had a passionate love affair ten years earlier before she had been engaged to marry her husband. Due to her husband's sickness however, the couple has cease

In [21]:
# 10. multi_match

# Builds on the match query: the same query text is searched in several fields at once.

body = {
    "multi_match": {
        "query": "romantic",
        "fields": ["genre","plot"]
    }
}

result = es.search(index="movies", query=body)

print(result)



In [22]:
# 11. regexp query

# The value is a regular expression, not a wildcard pattern, and it has to match
# a whole indexed term. "john.*" means "john" followed by any characters; a bare
# "john*" would only let the final "n" repeat (joh, john, johnn, ...).
# NOTE(review): the pattern is lowercase because director is an analyzed text
# field — presumably the default analyzer lowercases its tokens; confirm.

body = {
    "regexp": {
        "director": "john.*"
    }
}

result = es.search(index="movies", query=body)

print(result)

{'took': 1, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '1194', '_score': 1.0, '_source': {'title': 'Scandal Sheet', 'ethnicity': 'American', 'director': 'John Cromwell', 'cast': 'George Bancroft, Kay Francis, Clive Brook', 'genre': 'drama', 'plot': "Newspaper editor Mark Flint cares about only two things, reporting a big story, no matter whose life it adversely affects, and Edith, his wife. He is unaware that Edith, bored by him, has been having a romantic affair with Noel Adams, a banker.\r\nAdams gives a 24-hour deadline to Edith to leave her husband or end the affair. He books passage on a steamship and packs his bags. But after a crisis develops that could ruin his bank, Flint finds out, confronts Adams and, seeing his luggage, mistakenly believes Adams is fleeing the country. He prints the story without giving Adams a chance to mana

In [23]:
# 12. prefix query

# Matches values that start with the given characters.

body = {
    "prefix": {
        "director": "john"
    }
}

result = es.search(index="movies", query=body)

print(result)

{'took': 2, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 5, 'relation': 'eq'}, 'max_score': 1.0, 'hits': [{'_index': 'movies', '_id': '1194', '_score': 1.0, '_source': {'title': 'Scandal Sheet', 'ethnicity': 'American', 'director': 'John Cromwell', 'cast': 'George Bancroft, Kay Francis, Clive Brook', 'genre': 'drama', 'plot': "Newspaper editor Mark Flint cares about only two things, reporting a big story, no matter whose life it adversely affects, and Edith, his wife. He is unaware that Edith, bored by him, has been having a romantic affair with Noel Adams, a banker.\r\nAdams gives a 24-hour deadline to Edith to leave her husband or end the affair. He books passage on a steamship and packs his bags. But after a crisis develops that could ruin his bank, Flint finds out, confronts Adams and, seeing his luggage, mistakenly believes Adams is fleeing the country. He prints the story without giving Adams a chance to mana

In [24]:
# 15. match_phrase: phrase matching

# Finds documents in which the given words appear next to each other.
body = {
    "match_phrase": {
        "plot": "In particular"
    }
}

result = es.search(index="movies", query=body)

print(result)

{'took': 8, 'timed_out': False, '_shards': {'total': 1, 'successful': 1, 'skipped': 0, 'failed': 0}, 'hits': {'total': {'value': 2, 'relation': 'eq'}, 'max_score': 5.699081, 'hits': [{'_index': 'movies', '_id': '1491', '_score': 5.699081, '_source': {'title': 'Diplomaniacs', 'ethnicity': 'American', 'director': 'William A. Seiter', 'cast': 'Bert Wheeler, Robert Woolsey, Marjorie White', 'genre': 'comedy', 'plot': 'The film concerns itself with the adventures of two men who have set up a failing business as barbers on an Indian reservation. When they are sent by the tribe as representatives to a peace conference in Europe, unbeknownst to them they face constant threats from other attendees. In particular, a group of armaments manufacturers want to ensure that the peace conference is a failure, and do everything they can to sabotage it.', 'year': 1933, 'wiki_page': 'https://en.wikipedia.org/wiki/Diplomaniacs'}}, {'_index': 'movies', '_id': '11991', '_score': 4.2469983, '_source': {'title

In [25]:
# 16. Delete a single document by its id

es.delete(index="movies", id='1491')

ObjectApiResponse({'_index': 'movies', '_id': '1491', '_version': 2, 'result': 'deleted', '_shards': {'total': 2, 'successful': 1, 'failed': 0}, '_seq_no': 100, '_primary_term': 1})

In [26]:
# 17. delete_by_query

# Deletes every document that matches the query; the query must be valid Query DSL.
# The query is passed through the `query` keyword, like the es.search calls in this
# notebook, instead of being wrapped in a request `body`.

body = {
    "match": {
        "genre": "crime"
    }
}

result = es.delete_by_query(index="movies", query=body)

print(result)

{'took': 638, 'timed_out': False, 'total': 5, 'deleted': 5, 'batches': 1, 'version_conflicts': 0, 'noops': 0, 'retries': {'bulk': 0, 'search': 0}, 'throttled_millis': 0, 'requests_per_second': -1.0, 'throttled_until_millis': 0, 'failures': []}
