In [1]:
import json
import os
import warnings
from urllib.request import urlopen

import pandas as pd
from elasticsearch import Elasticsearch, RequestsHttpConnection
from newsapi import NewsApiClient

In [2]:
def Elasticsearch_connection(host_link, user_auth, verify_certs=False, suppress_warnings=True):
    """Build an Elasticsearch client for the given hosts.

    Args:
        host_link: Host URL, or list of host URLs, passed to the client as ``hosts``.
        user_auth: Credentials passed to the client as ``http_auth``
            (the notebook supplies a ``"user:password"`` string).
        verify_certs: Whether the client should verify the servers' TLS
            certificates. Defaults to ``False`` to keep the original behaviour;
            pass ``True`` whenever the cluster presents a trusted certificate.
        suppress_warnings: When ``True`` (the default, matching the original
            behaviour) a blanket ``"ignore"`` warnings filter is installed.
            This silences *every* Python warning for the rest of the process,
            not only the ones raised by this connection; pass ``False`` to
            leave the warnings configuration untouched.

    Returns:
        The constructed ``Elasticsearch`` client instance.

    Note:
        The confirmation message is printed as soon as the client object has
        been constructed; this function does not itself issue a request to
        check that the cluster is reachable.
    """
    if suppress_warnings:
        # Process-wide side effect: hides all warnings, including the ones
        # triggered by running with certificate verification disabled.
        warnings.filterwarnings("ignore")

    es = Elasticsearch(
        hosts=host_link,
        http_auth=user_auth,
        verify_certs=verify_certs,
        connection_class=RequestsHttpConnection,
    )
    print("ElasticSearch connection has been established and this connection instance is stored in es variable")
    return es

In [3]:
# Cluster nodes the client may talk to.
ES_HOSTS = [
    'https://tux-es1.cci.drexel.edu:9200/',
    'https://tux-es2.cci.drexel.edu:9200/',
    'https://tux-es3.cci.drexel.edu:9200/',
]

# Credentials come from the ES_HTTP_AUTH environment variable ("user:password")
# so that real secrets never have to be written into the notebook. The original
# literal is kept as the fallback so the cell behaves as before when the
# variable is not set.
es_http_auth = os.environ.get("ES_HTTP_AUTH", 'id:password')

# `es` is the shared client used by every index/reindex cell below.
es = Elasticsearch_connection(ES_HOSTS, es_http_auth)


ElasticSearch connection has been established and this connection instance is stored in es variable


In [4]:
# Create the first experiment index: `title` is scored with a custom DFR
# similarity and `description` with a custom BM25 similarity.
index_name = 'ms4976_info624_201904_newsproject1'

# Similarity modules registered on the index; the field mappings refer to them by name.
similarity_modules = {
    "custom_bm25": {"type": "BM25", "k1": 2.0, "b": 1.0},
    "custom_dfr": {
        "type": "DFR",
        "basic_model": "g",
        "after_effect": "l",
        "normalization": "h2",
        "normalization.h2.c": "3.0",
    },
}

# Field mappings: analyzer and similarity chosen per field.
field_properties = {
    "source": {"type": "text", "analyzer": "standard"},
    "author": {"type": "text", "analyzer": "standard", "similarity": "boolean"},
    "title": {"type": "text", "analyzer": "english", "similarity": "custom_dfr"},
    "description": {"type": "text", "analyzer": "english", "similarity": "custom_bm25"},
    "url": {"type": "text"},
    "publishedAt": {"type": "date"},
    "timestamp": {"type": "rank_feature", "positive_score_impact": True},
}

request_body = {
    "settings": {"index": {"similarity": similarity_modules}},
    "mappings": {"properties": field_properties},
}

# Last expression of the cell, so the create-index response is displayed.
es.indices.create(index=index_name, body=request_body)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ms4976_info624_201904_newsproject1'}

In [5]:
# Copy the documents of the base index 'ms4976_info624_201904_newsproject' into
# the index named by `index_name`, waiting for the reindex to complete
# (client request timeout: 300).
reindex_request = {
    "source": {"index": 'ms4976_info624_201904_newsproject'},
    "dest": {"index": index_name},
}
result = es.reindex(body=reindex_request, wait_for_completion=True, request_timeout=300)
result

{'took': 6629,
 'timed_out': False,
 'total': 3561,
 'updated': 0,
 'created': 3561,
 'deleted': 0,
 'batches': 4,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

In [6]:
# Create the second experiment index: `title` is scored with a custom DFR
# similarity and `description` with a custom BM25 similarity.
index_name = 'ms4976_info624_201904_newsproject2'

# Similarity modules registered on the index; the field mappings refer to them by name.
similarity_modules = {
    "custom_bm25": {"type": "BM25", "k1": 1.5, "b": 1.0},
    "custom_dfr": {
        "type": "DFR",
        "basic_model": "if",
        "after_effect": "b",
        "normalization": "h3",
        # NOTE: this block previously also set "normalization.h2.c": "3.0".
        # That key parameterizes the *h2* normalization only, so with "h3"
        # selected it had no effect on scoring and has been removed. The h3
        # normalization therefore keeps running with Elasticsearch's default
        # parameter; to tune it, set the h3-specific key (expected to be
        # "normalization.h3.c" - confirm against the similarity module docs).
    },
}

# Field mappings: analyzer and similarity chosen per field.
field_properties = {
    "source": {"type": "text", "analyzer": "standard"},
    "author": {"type": "text", "analyzer": "standard", "similarity": "boolean"},
    "title": {"type": "text", "analyzer": "english", "similarity": "custom_dfr"},
    "description": {"type": "text", "analyzer": "english", "similarity": "custom_bm25"},
    "url": {"type": "text"},
    "publishedAt": {"type": "date"},
    "timestamp": {"type": "rank_feature", "positive_score_impact": True},
}

request_body = {
    "settings": {"index": {"similarity": similarity_modules}},
    "mappings": {"properties": field_properties},
}

# Last expression of the cell, so the create-index response is displayed.
es.indices.create(index=index_name, body=request_body)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ms4976_info624_201904_newsproject2'}

In [7]:
# Copy the documents of the base index 'ms4976_info624_201904_newsproject' into
# the index named by `index_name`, waiting for the reindex to complete
# (client request timeout: 300).
reindex_request = {
    "source": {"index": 'ms4976_info624_201904_newsproject'},
    "dest": {"index": index_name},
}
result = es.reindex(body=reindex_request, wait_for_completion=True, request_timeout=300)
result

{'took': 2717,
 'timed_out': False,
 'total': 3561,
 'updated': 0,
 'created': 3561,
 'deleted': 0,
 'batches': 4,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

In [8]:
# Create the third experiment index: both `title` and `description` are scored
# with the same custom BM25 similarity.
index_name = 'ms4976_info624_201904_newsproject3'

# Similarity modules registered on the index; the field mappings refer to them by name.
similarity_modules = {
    "custom_bm25": {"type": "BM25", "k1": 1.0, "b": 0.9},
}

# Field mappings: analyzer and similarity chosen per field.
field_properties = {
    "source": {"type": "text", "analyzer": "standard"},
    "author": {"type": "text", "analyzer": "standard", "similarity": "boolean"},
    "title": {"type": "text", "analyzer": "english", "similarity": "custom_bm25"},
    "description": {"type": "text", "analyzer": "english", "similarity": "custom_bm25"},
    "url": {"type": "text"},
    "publishedAt": {"type": "date"},
    "timestamp": {"type": "rank_feature", "positive_score_impact": True},
}

request_body = {
    "settings": {"index": {"similarity": similarity_modules}},
    "mappings": {"properties": field_properties},
}

# Last expression of the cell, so the create-index response is displayed.
es.indices.create(index=index_name, body=request_body)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ms4976_info624_201904_newsproject3'}

In [9]:
# Copy the documents of the base index 'ms4976_info624_201904_newsproject' into
# the index named by `index_name`, waiting for the reindex to complete
# (client request timeout: 300).
reindex_request = {
    "source": {"index": 'ms4976_info624_201904_newsproject'},
    "dest": {"index": index_name},
}
result = es.reindex(body=reindex_request, wait_for_completion=True, request_timeout=300)
result

{'took': 1807,
 'timed_out': False,
 'total': 3561,
 'updated': 0,
 'created': 3561,
 'deleted': 0,
 'batches': 4,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

In [11]:
# Create the fourth experiment index: both `title` and `description` are scored
# with the same custom DFR similarity.
index_name = 'ms4976_info624_201904_newsproject4'

# Similarity modules registered on the index; the field mappings refer to them by name.
similarity_modules = {
    "custom_dfr": {
        "type": "DFR",
        "basic_model": "ine",
        "after_effect": "b",
        "normalization": "z",
        # NOTE: this block previously also set "normalization.h2.c": "3.0".
        # That key parameterizes the *h2* normalization only, so with "z"
        # selected it had no effect on scoring and has been removed. The z
        # normalization therefore keeps running with Elasticsearch's default
        # parameter; to tune it, set the z-specific key (expected to be
        # "normalization.z.z" - confirm name and valid range against the
        # similarity module docs).
    },
}

# Field mappings: analyzer and similarity chosen per field.
field_properties = {
    "source": {"type": "text", "analyzer": "standard"},
    "author": {"type": "text", "analyzer": "standard", "similarity": "boolean"},
    "title": {"type": "text", "analyzer": "english", "similarity": "custom_dfr"},
    "description": {"type": "text", "analyzer": "english", "similarity": "custom_dfr"},
    "url": {"type": "text"},
    "publishedAt": {"type": "date"},
    "timestamp": {"type": "rank_feature", "positive_score_impact": True},
}

request_body = {
    "settings": {"index": {"similarity": similarity_modules}},
    "mappings": {"properties": field_properties},
}

# Last expression of the cell, so the create-index response is displayed.
es.indices.create(index=index_name, body=request_body)

{'acknowledged': True,
 'shards_acknowledged': True,
 'index': 'ms4976_info624_201904_newsproject4'}

In [12]:
# Copy the documents of the base index 'ms4976_info624_201904_newsproject' into
# the index named by `index_name`, waiting for the reindex to complete
# (client request timeout: 300).
reindex_request = {
    "source": {"index": 'ms4976_info624_201904_newsproject'},
    "dest": {"index": index_name},
}
result = es.reindex(body=reindex_request, wait_for_completion=True, request_timeout=300)
result

{'took': 4479,
 'timed_out': False,
 'total': 3561,
 'updated': 0,
 'created': 3561,
 'deleted': 0,
 'batches': 4,
 'version_conflicts': 0,
 'noops': 0,
 'retries': {'bulk': 0, 'search': 0},
 'throttled_millis': 0,
 'requests_per_second': -1.0,
 'throttled_until_millis': 0,
 'failures': []}

#### NOTE: The next part, the evaluation, is carried out in a separate Jupyter notebook named `Evaluation`.