In [1]:
import os
from elasticsearch import Elasticsearch, helpers, NotFoundError
import json

In [2]:
# Connection settings are read from the environment so that real credentials
# do not have to live in the notebook. The fallbacks are the local development
# values this notebook was originally written against.
# NOTE(review): rotate the fallback password if this notebook is ever shared.
es_url = os.environ.get("ES_URL", "http://localhost:9200")
es_user = os.environ.get("ES_USER", "elastic")
es_password = os.environ.get("ES_PASSWORD", "1fVO6RuC")

client = Elasticsearch(es_url, basic_auth=(es_user, es_password))

In [3]:
#### Q1: Write code that connects to the Elasticsearch client and then retrieves detailed health information. Save the first five key-value pairs from the resulting JSON output in q1.json.

In [3]:
#q1
# Fetch the cluster health report and keep only its first five key/value
# pairs, in the order the response lists them.
health_info = client.cluster.health()

q1 = {}
for position, (field, value) in enumerate(health_info.items()):
    if position >= 5:
        break
    q1[field] = value

with open('answers/q1.json', 'w') as out_file:
    out_file.write(json.dumps(q1, indent=4))

In [5]:
#### Q2: Create an index madmap and bulk upload all provided JSON files except places.json. After the upload, retrieve the dynamic mapping configuration from Elasticsearch. Save the resulting output in q2.json.

In [4]:
#q2
madmap_index = "madmap"

# Drop any index left over from a previous run so that re-executing this cell
# starts from an empty index instead of duplicating documents.
try:
    client.indices.delete(index=madmap_index)
except NotFoundError:
    print("Index doesn't exist!")

client.indices.create(index=madmap_index)

json_data_dir = "data/jsons"
# Only real *.json files (not e.g. "x.json.bak"), excluding places.json.
# Sorted so the upload order does not depend on the arbitrary order
# os.listdir happens to return.
json_files = sorted(
    name for name in os.listdir(json_data_dir)
    if name.endswith(".json") and name != 'places.json'
)

for json_file in json_files:
    with open(os.path.join(json_data_dir, json_file), 'r', encoding='utf-8') as f:
        data = json.load(f)

    # Each file is expected to be an object whose first key holds the list of
    # documents to index.
    if not isinstance(data, dict) or not data:
        print(f"Skipping {json_file}: expected a non-empty JSON object")
        continue

    key = next(iter(data))
    documents = data[key]
    if not isinstance(documents, list):
        # Without this guard the bulk call below would either fail with a
        # NameError or silently re-upload the previous file's actions.
        print(f"Skipping {json_file}: '{key}' does not hold a list of documents")
        continue

    actions = [{"_index": madmap_index, "_source": doc} for doc in documents]
    helpers.bulk(client, actions)

# Make the freshly indexed documents visible to the searches in later cells.
client.indices.refresh(index=madmap_index)

q2 = client.indices.get_mapping(index=madmap_index)

with open('answers/q2.json', 'w') as f:
    json.dump(dict(q2), f, indent=4)

In [7]:
#### Q3: Add a new, index-able field named wiki (type: text) to the mapping from Q2. Bulk upload all the text files. Save this new dynamic mapping result in q3.json.

In [5]:
#q3
# Explicitly add `wiki` as a text field to the existing mapping before any
# document containing it is indexed.
mapping_update = {
    "properties": {
        "wiki": {"type": "text"}
    }
}

client.indices.put_mapping(index=madmap_index, body=mapping_update)

text_data_dir = "data/text"
# Only real *.txt files (not e.g. "notes.txt.bak"), in a stable order.
text_files = sorted(name for name in os.listdir(text_data_dir) if name.endswith(".txt"))

# One document per text file, with the whole file content stored under `wiki`.
actions = []
for text_file in text_files:
    # assumes the text files are UTF-8 encoded -- TODO confirm
    with open(os.path.join(text_data_dir, text_file), 'r', encoding='utf-8') as f:
        actions.append({"_index": madmap_index, "_source": {"wiki": f.read()}})

if actions:
    helpers.bulk(client, actions)

# Make the uploaded documents visible to the search cells that follow.
client.indices.refresh(index=madmap_index)

q3 = client.indices.get_mapping(index=madmap_index)

with open('answers/q3.json', 'w') as f:
    json.dump(dict(q3), f, indent=4)

In [9]:
#### Q4: Find all locations that contain "University".

In [6]:
#q4
# Full-text match for "University" on the address field, requesting up to
# 1000 hits in a single response.
address_match = {"match": {"formattedAddress": "University"}}
query = {"query": address_match, "size": 1000}

q4 = client.search(index=madmap_index, body=query)

with open('answers/q4.json', 'w') as out_file:
    out_file.write(json.dumps(dict(q4), indent=4))

In [11]:
#### Q5: Find all titles of news articles that contain something similar to "Madson" (Fuzzy search).

In [7]:
#q5
# Fuzzy match on the title so near-spellings of "Madson" are also found;
# only the title is returned for each hit.
fuzzy_title = {"query": "Madson", "fuzziness": "AUTO"}
query = {
    "size": 1000,
    "query": {"match": {"title": fuzzy_title}},
    "_source": ["title"],
}

q5 = client.search(index=madmap_index, body=query)

with open('answers/q5.json', 'w') as out_file:
    json.dump(dict(q5), out_file, indent=4)

In [13]:
#### Q6: Find all news articles whose title or description or content contains the phrase "Wisconsin Badgers".

In [8]:
#q6
# A document qualifies when the exact phrase appears in any one of the three
# text fields, hence one `match_phrase` clause per field under `should`.
phrase = "Wisconsin Badgers"
searched_fields = ["title", "description", "content"]
phrase_clauses = [{"match_phrase": {field: phrase}} for field in searched_fields]

query = {
    "size": 1000,
    "query": {"bool": {"should": phrase_clauses}},
}

q6 = client.search(index=madmap_index, body=query)

with open('answers/q6.json', 'w') as out_file:
    out_file.write(json.dumps(dict(q6), indent=4))

In [15]:
#### Q7: Find all locations that do not have "Madison" in their address.

In [9]:
#q7
# Keep only documents that actually have an address, then exclude those whose
# address matches "Madison". Only name and address are returned per hit.
has_address = {"exists": {"field": "formattedAddress"}}
mentions_madison = {"match": {"formattedAddress": "Madison"}}

query = {
    "size": 1000,
    "query": {
        "bool": {
            "must": [has_address],
            "must_not": [mentions_madison],
        }
    },
    "_source": ["name", "formattedAddress"],
}

q7 = client.search(index=madmap_index, body=query)

with open('answers/q7.json', 'w') as out_file:
    json.dump(dict(q7), out_file, indent=4)

In [17]:
#### Q8: What are the biggest football rivalries of Wisconsin Badgers (Boosting)?

In [10]:
#q8
# Boosted search over the wiki text: "rivalry" carries the largest boost (^5),
# "football" a smaller one (^3) and "badgers" none.
boosted_terms = {
    "query": "rivalry^5 football^3 badgers",
    "fields": ["wiki"],
}
query = {
    "size": 1000,
    "query": {"simple_query_string": boosted_terms},
    "_source": ["wiki"],
}

q8 = client.search(index=madmap_index, body=query)

with open('answers/q8.json', 'w') as out_file:
    out_file.write(json.dumps(dict(q8), indent=4))

In [19]:
#### Q9: Highlight rivalries. 

In [11]:
#q9
# Ask for the single best wiki hit for "rivalry" together with highlighted
# fragments of the `wiki` field, and keep only that hit's highlight section.
rivalry_phrase = {"match_phrase": {"wiki": "rivalry"}}
highlight_spec = {"fields": {"wiki": {}}}

query = {
    "size": 1,
    "query": rivalry_phrase,
    "_source": ["wiki"],
    "highlight": highlight_spec,
}

response = client.search(index=madmap_index, body=query)
top_hit = response["hits"]["hits"][0]
q9 = top_hit["highlight"]

with open('answers/q9.json', 'w') as out_file:
    json.dump(dict(q9), out_file, indent=4)

In [None]:
#### Q10: Write an Elasticsearch query that retrieves all articles in news_madison.json where the source's "name" field exactly matches "Nasa". Save the output in q10.json.

In [12]:
#q10
# The task asks for an *exact* match on the source name. A `match_phrase` on
# the analyzed text field is case-insensitive and also matches longer names
# that merely contain the token, so use a `term` query on the un-analyzed
# keyword sub-field instead (the same `source.name.keyword` sub-field the
# top-sources aggregation in q12 relies on).
query = {
  "size": 1000,
  "query": {
    "term": {
      "source.name.keyword": "Nasa"
    }
  },
  "_source": ["title", "source.name", "publishedAt"]
}

q10 = client.search(index=madmap_index, body=query)

with open('answers/q10.json', 'w') as f:
    json.dump(dict(q10), f, indent=4)

In [None]:
#### Q11: How many people were arrested at the State Street Halloween Party from 2001 to 2019? Save the output in q11.json.

In [13]:
#q11
# Restrict to documents whose `year` lies in 2001..2019 (inclusive on both
# ends) and sum their `arrests` values; only the aggregated total is kept.
year_window = {"range": {"year": {"gte": "2001", "lte": "2019"}}}
arrest_total = {"sum": {"field": "arrests"}}

query = {
    "query": {"bool": {"must": [year_window]}},
    "aggs": {"total_arrests": arrest_total},
}

response = client.search(index=madmap_index, body=query)
q11 = response["aggregations"]["total_arrests"]["value"]

with open('answers/q11.json', 'w') as out_file:
    out_file.write(json.dumps(q11, indent=4))

In [None]:
#### Q12: What are the top 10 sources that published the most news articles?

In [14]:
#q12
# Bucket documents by exact source name and keep the ten largest buckets.
source_buckets = {"terms": {"field": "source.name.keyword", "size": 10}}
query = {"aggs": {"top_sources": source_buckets}}

response = client.search(index=madmap_index, body=query)
q12 = response["aggregations"]["top_sources"]["buckets"]

with open('answers/q12.json', 'w') as out_file:
    json.dump(q12, out_file, indent=4)

In [None]:
#### Q13: How many names are listed in the location dataset?

In [15]:
#q13
# Count the `name.keyword` values across all documents that have a `name`.
has_name = {"exists": {"field": "name"}}
name_counter = {"value_count": {"field": "name.keyword"}}

query = {
    "query": has_name,
    "aggs": {"location_name_count": name_counter},
}

response = client.search(index=madmap_index, body=query)
q13 = response["aggregations"]["location_name_count"]["value"]

with open('answers/q13.json', 'w') as out_file:
    out_file.write(json.dumps(q13, indent=4))

In [None]:
#### Q14: How many distinct authors contributed to the news articles?

In [16]:
#q14
# Cardinality aggregation over the exact author strings of documents that
# have an `author` field.
has_author = {"exists": {"field": "author"}}
distinct_authors = {"cardinality": {"field": "author.keyword"}}

query = {
    "query": has_author,
    "aggs": {"unique_authors": distinct_authors},
}

response = client.search(index=madmap_index, body=query)
q14 = response["aggregations"]["unique_authors"]["value"]

with open('answers/q14.json', 'w') as out_file:
    json.dump(q14, out_file, indent=4)

In [None]:
#### Q15: How many people on average attended the State Street Halloween Party from 2001 to 2019?

In [17]:
#q15
# Average attendance over the years the question asks about. The original
# query averaged every document with an `attended` value regardless of year;
# the 2001..2019 window below is the same `year` range q11 uses for the
# arrests total.
query = {
  # Only the aggregation result is read, so no hits need to be returned.
  "size": 0,
  "query": {
    "bool": {
      "filter": [
        { "exists": { "field": "attended" } },
        { "range": { "year": { "gte": "2001", "lte": "2019" } } }
      ]
    }
  },
  "aggs": {
    "avg_attended": {
      "avg": {
        "field": "attended"
      }
    }
  }
}

q15 = client.search(index=madmap_index, body=query)["aggregations"]["avg_attended"]["value"]
with open('answers/q15.json', 'w') as f:
    json.dump(q15, f, indent=4)

In [None]:
#### Q16: Load places.json into Kibana Maps

In [None]:
#q16

In [None]:
#### Q17: Identify Areas in Madison with the Highest Business Density

In [None]:
#q17

In [None]:
#### Q18: Distribution of Price Levels Across Madison

In [None]:
#q18

In [None]:
#### Q19: Find the Most Expensive Place in Madison

In [18]:
#q19

# Hand-recorded answer: a single place document written out verbatim.
# Every field value is wrapped in a one-element list, as in the original.
place_location = {
    "coordinates": [-89.38210219999999, 43.0754933],
    "type": "Point",
}

q19 = {
    "coordinates": [place_location],
    "formattedAddress": ["1 S Pinckney St, Madison, WI 53703, USA"],
    "name": ["L'Etoile Restaurant"],
    "place_id": ["ChIJvw6XEsNTBogRN99XXAXk-pI"],
    "place_type": ["restaurants"],
    "priceLevel": ["PRICE_LEVEL_VERY_EXPENSIVE"],
    "_id": "2wh7lpUBrIXkCu8zrtVP",
    "_index": "places_madison",
    "_score": 0,
}

with open('answers/q19.json', 'w') as out_file:
    out_file.write(json.dumps(q19, indent=4))

In [None]:
#### Q20: Find the Nearest Cafe from the Computer Sciences Department

In [None]:
#q20