# 9-elasticsearch

In [44]:
import pandas as pd
import requests
import json

url = 'http://localhost:9200'
index = '/movies'

### We have chosen IMDB dataset from kaggle
https://www.kaggle.com/datasets/ashpalsingh1525/imdb-movies-dataset?resource=download

In [19]:
df = pd.read_csv('imdb_movies.csv').dropna().rename(columns={'names': 'name', 'budget_x': 'budget', 'date_x': 'date'})
df['date'] = pd.to_datetime(df['date'])
df.head()

Unnamed: 0,name,date,score,genre,overview,crew,orig_title,status,orig_lang,budget,revenue,country
0,Creed III,2023-03-02,73.0,"Drama, Action","After dominating the boxing world, Adonis Cree...","Michael B. Jordan, Adonis Creed, Tessa Thompso...",Creed III,Released,English,75000000.0,271616700.0,AU
1,Avatar: The Way of Water,2022-12-15,78.0,"Science Fiction, Adventure, Action",Set more than a decade after the events of the...,"Sam Worthington, Jake Sully, Zoe Saldaña, Neyt...",Avatar: The Way of Water,Released,English,460000000.0,2316795000.0,AU
2,The Super Mario Bros. Movie,2023-04-05,76.0,"Animation, Adventure, Family, Fantasy, Comedy","While working underground to fix a water main,...","Chris Pratt, Mario (voice), Anya Taylor-Joy, P...",The Super Mario Bros. Movie,Released,English,100000000.0,724459000.0,AU
3,Mummies,2023-01-05,70.0,"Animation, Comedy, Family, Adventure, Fantasy","Through a series of unfortunate events, three ...","Óscar Barberán, Thut (voice), Ana Esther Albor...",Momias,Released,"Spanish, Castilian",12300000.0,34200000.0,AU
4,Supercell,2023-03-17,61.0,Action,Good-hearted teenager William always lived in ...,"Skeet Ulrich, Roy Cameron, Anne Heche, Dr Quin...",Supercell,Released,English,77000000.0,340942000.0,US


In [20]:
requests.get(url).json()

{'name': '851839bcfcfa',
 'cluster_name': 'docker-cluster',
 'cluster_uuid': 'QOutGYOYQzGrXMGQnFU6SQ',
 'version': {'number': '7.17.9',
  'build_flavor': 'default',
  'build_type': 'docker',
  'build_hash': 'ef48222227ee6b9e70e502f0f0daa52435ee634d',
  'build_date': '2023-01-31T05:34:43.305517834Z',
  'build_snapshot': False,
  'lucene_version': '8.11.1',
  'minimum_wire_compatibility_version': '6.8.0',
  'minimum_index_compatibility_version': '6.0.0-beta1'},
 'tagline': 'You Know, for Search'}

In [21]:
requests.put(url+index).json()

{'acknowledged': True, 'shards_acknowledged': True, 'index': 'movies'}

In [22]:
requests.get(url+'/_cat/indices').text.split('\n')

['green  open .geoip_databases 9FMko0h1Sw6HdTamDGpbug 1 0 43 0 41mb 41mb',
 'yellow open movies           ZhFuZdvhR96upLKj1I-8NQ 1 1  0 0 226b 226b',
 '']

## Create mapping
we will import all the fields from the dataset.

In [23]:
mapping = {
    'properties': {
        'name': {
            'type': 'text'
        },
        'date': {
            'type': 'date'
        },
        'genre': {
            'type': 'text'
        },
        'overview': {
            'type': 'text'
        },
        'crew': {
            'type': 'text'
        },
        'orig_title': {
            'type': 'text'
        },
        'status': {
            'type': 'text'
        },
        'orig_lang': {
            'type': 'keyword'
        },
        'budget': {
            'type': 'long'
        },
        'revenue': {
            'type': 'double'
        },
        'country': {
            'type': 'keyword'
        }
    }
}

requests.put(url+index+'/_mapping', json=mapping).json()

{'acknowledged': True}

## Bulk import to elk
From the [documentation](https://www.elastic.co/guide/en/elasticsearch/reference/7.17/docs-bulk.html), we can see that we need to parse the json to ndjson and before each doc, we need to add what action should be execuded. We will index the following data.

In [24]:
data = []

for doc in df.to_dict('records'):
    data.append({'index': {'_index': index[1:], '_type':'_doc'}})
    doc['date'] = doc['date'].strftime('%Y-%m-%d')
    data.append(doc)

bulk_data = '\n'.join([json.dumps(doc) for doc in data]) + '\n'

requests.post(url+index+'/_bulk', data=bulk_data, headers={'Content-Type': 'application/x-ndjson'}).json()

{'took': 1016,
 'errors': False,
 'items': [{'index': {'_index': 'movies',
    '_type': '_doc',
    '_id': 'coWkFogBRKR1tV-toFx3',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 0,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'movies',
    '_type': '_doc',
    '_id': 'c4WkFogBRKR1tV-toFx4',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 1,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'movies',
    '_type': '_doc',
    '_id': 'dIWkFogBRKR1tV-toFx4',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
    '_seq_no': 2,
    '_primary_term': 1,
    'status': 201}},
  {'index': {'_index': 'movies',
    '_type': '_doc',
    '_id': 'dYWkFogBRKR1tV-toFx4',
    '_version': 1,
    'result': 'created',
    '_shards': {'total': 2, 'successful': 1, 'failed': 0},
 

In [25]:
query = {
    'size': 100,
    'from': 0,
    'query': {
        'bool': {
            'must': {
                'match': {
                'crew': 'Chris Pratt'
                }
            }
        }
    },
}

results = requests.get(url+index+'/_search', json=query).json()

for hit in results['hits']['hits']:
    print(hit['_source']['name'], hit['_source']['date'], hit['_score'])

Marvel Studios Assembled: The Making of Thor: Love and Thunder 2022-09-08 9.92476
Delivery Man 2014-12-18 8.985202
Jurassic Greatest Moments: Jurassic Park to Jurassic World 2022-06-04 8.869982
Thor: Love and Thunder 2022-07-06 8.767334
Bride Wars 2009-01-09 8.75768
Castle Freak 2020-12-04 8.75768
Deep in the Valley 2009-09-30 8.437209
The Guardians of the Galaxy Holiday Special 2022-11-25 8.236282
Passengers 2016-12-29 8.236282
Her 2014-01-16 8.139365
Zero Dark Thirty 2013-01-31 8.139365
Take Me Home Tonight 2011-03-04 8.139365
Guardians of the Galaxy Vol. 2 2017-04-25 8.044703
Moneyball 2011-09-23 8.044703
The Tomorrow War 2021-07-02 7.952216
Guardians of the Galaxy 2014-08-07 7.952216
The Magnificent Seven 2016-09-29 7.952216
Jennifer's Body 2010-03-31 7.952216
The Five-Year Engagement 2012-05-03 7.952216
Jurassic World 2015-06-11 7.8618326
Guardians of the Galaxy Volume 3 2023-05-04 7.7734795
Jurassic World: Fallen Kingdom 2018-06-21 7.7734795
Jurassic World Dominion 2022-06-09 7.6

In [9]:
query = {
    'size': 100,
    'from': 0,
    'sort': [
        {'date': {'order': 'desc'}},
        '_score',
    ],
    'query': {
        'match': {
        'crew': 'Chris Pratt'
        }
    },
}

results = requests.get(url+index+'/_search', json=query).json()

for hit in results['hits']['hits']:
    print(hit['_source']['name'], hit['_source']['date'], hit['_score'])

Extraction 2 2023-06-16 2.991301
Renfield 2023-05-25 2.8505056
Guardians of the Galaxy Volume 3 2023-05-04 7.7734795
Ghosted 2023-04-20 3.0287
The Super Mario Bros. Movie 2023-04-05 7.519949
Air 2023-04-05 3.8936238
Spinning Gold 2023-03-31 2.7849636
Dungeons & Dragons: Honor Among Thieves 2023-03-30 2.7849636
The Portable Door 2023-03-23 2.7849636
Boston Strangler 2023-03-17 2.7849636
Living 2023-03-16 2.991301
Chris Rock: Selective Outrage 2023-03-04 4.5721245
High Heat 2023-03-01 3.2306602
Winnie the Pooh: Blood and Honey 2023-02-14 2.991301
Unwelcome 2023-01-27 2.8505056
Teen Wolf: The Movie 2023-01-27 2.8173535
What's Love Got to Do with It? 2023-01-26 2.991301
Play Dead 2023-01-05 4.233883
The Mummy Resurrection 2023-01-02 3.511657
Mindcage 2022-12-16 2.8505056
Avatar: The Deep Dive - A Special Edition of 20/20 2022-12-13 3.146728
Diary of a Wimpy Kid: Rodrick Rules 2022-12-02 2.5775347
The Guardians of the Galaxy Holiday Special 2022-11-25 8.236282
Slumberland 2022-11-18 2.85050

### We can add aggregation of Action genre match on language and country of origin.

In [31]:
query = {
    'size': 100,
    'from': 0,
    'query': {
        'match': {
        'genre': 'Action'
        }
    },
    'aggs': {
        'group_by_orig_lang': {
            'terms': {
                'field': 'orig_lang'
            }
        },
        'group_by_coutry': {
            'terms': {
                'field': 'country'
            }
        }
    }
}

results = requests.get(url+index+'/_search', json=query).json()

for hit in results['hits']['hits']:
    print(hit['_source']['name'], hit['_source']['date'], hit['_score'])

Supercell 2023-03-17 1.7533091
Lord of the Streets 2022-04-22 1.7533091
Overdose 2022-11-04 1.7533091
The princess 2022-08-05 1.7533091
Vendetta 2022-05-17 1.7533091
One Shot 2022-01-01 1.7533091
Crazy Fist 2021-05-06 1.7533091
Narco Sub 2021-02-07 1.7533091
The Vault 2021-07-06 1.7533091
Redemption Day 2021-03-01 1.7533091
Never Back Down: Revolt 2021-11-16 1.7533091
Kate 2021-09-10 1.7533091
Sniper: Ultimate Kill 2017-10-03 1.7533091
The Protector 2 2013-10-23 1.7533091
Outcast 2015-02-27 1.7533091
Born to Race: Fast Track 2014-03-20 1.7533091
Hard Target 1993-08-20 1.7533091
The Debt Collector 2018-06-05 1.7533091
Lionheart 1990-11-29 1.7533091
The Marine 3: Homefront 2013-03-05 1.7533091
Tracers 2015-09-02 1.7533091
American Badger 2021-06-15 1.7533091
Enter the Dragon 1973-11-01 1.7533091
The Marine 2006-10-13 1.7533091
Jesse Rodriguez vs. Cristian Gonzalez 2023-04-08 1.7533091
Torque 2004-01-22 1.7533091
Bastille Day 2016-05-12 1.7533091
The Marine 5: Battleground 2017-03-29 1.75

In [32]:
results['aggregations']

{'group_by_coutry': {'doc_count_error_upper_bound': 0,
  'sum_other_doc_count': 90,
  'buckets': [{'key': 'AU', 'doc_count': 1588},
   {'key': 'US', 'doc_count': 583},
   {'key': 'JP', 'doc_count': 215},
   {'key': 'KR', 'doc_count': 65},
   {'key': 'HK', 'doc_count': 62},
   {'key': 'CN', 'doc_count': 47},
   {'key': 'FR', 'doc_count': 41},
   {'key': 'GB', 'doc_count': 24},
   {'key': 'IN', 'doc_count': 23},
   {'key': 'RU', 'doc_count': 12}]},
 'group_by_orig_lang': {'doc_count_error_upper_bound': 0,
  'sum_other_doc_count': 74,
  'buckets': [{'key': ' English', 'doc_count': 2025},
   {'key': ' Japanese', 'doc_count': 294},
   {'key': ' Korean', 'doc_count': 83},
   {'key': ' Cantonese', 'doc_count': 82},
   {'key': ' Chinese', 'doc_count': 81},
   {'key': ' French', 'doc_count': 45},
   {'key': ' Spanish, Castilian', 'doc_count': 20},
   {'key': ' Russian', 'doc_count': 18},
   {'key': ' Hindi', 'doc_count': 15},
   {'key': ' Italian', 'doc_count': 13}]}}

### **Japanese action!**

In [43]:
query = {
    'size': 100,
    'from': 0,
    'query': {
        'bool': {
            'must': [
                {'match': {'genre': 'Action'}},
                {'match': {'orig_lang': ' Japanese'}}
            ]
        }
    },
    'sort': [
        {'date': {'order': 'desc'}},
    ],
}

results = requests.get(url+index+'/_search', json=query).json()

for hit in results['hits']['hits']:
    print(hit['_source']['name'], hit['_source']['date'])

Black Clover: Sword of the Wizard King 2023-06-16
Detective Conan: Black Iron Submarine 2023-04-14
Princess Principal Crown Handler: Chapter 3 2023-04-07
Sword Art Online the Movie -Progressive- Scherzo of Deep Night 2023-02-02
Lupin The 3rd vs. Cat’s Eye 2023-01-27
The Seven Deadly Sins: Grudge of Edinburgh Part 1 2022-12-20
One Piece Film Red 2022-11-03
Evangelion: 3.0+1.0 Thrice Upon a Time 2022-10-13
Naruto 20th Anniversary - Road of Naruto 2022-10-02
High & Low The Worst X 2022-09-09
Dragon Ball Super: Super Hero 2022-08-18
Fullmetal Alchemist: The Final Alchemy 2022-06-24
Fullmetal Alchemist: The Revenge of Scar 2022-05-20
Shin Ultraman 2022-05-13
Jujutsu Kaisen 0 2022-03-17
Sword Art Online the Movie -Progressive- Aria of a Starless Night 2021-12-09
Lupin III: The First 2021-11-01
My Hero Academia: World Heroes' Mission 2021-10-28
Bright: Samurai Soul 2021-10-12
Princess Principal Crown Handler: Chapter 2 2021-09-23
Fate/kaleid liner Prisma☆Illya: Licht Nameless Girl 2021-08-27
