In [2]:
pip install elasticsearch

Note: you may need to restart the kernel to use updated packages.



[notice] A new release of pip available: 22.2.2 -> 23.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip


In [3]:
import os
from getpass import getpass

import pandas as pd

from elasticsearch import Elasticsearch



In [4]:
# Load the corpus CSV (expected next to the notebook) into a DataFrame.
df = pd.read_csv(filepath_or_buffer='Mini Project Corpus - 190599B.csv')

In [5]:
# Number of non-missing values in each column (counted down the rows).
df.count(axis="index")

Poetic Opening         179
Poem                   179
Author                 177
Year                   177
Lines                  177
Count of \nMetaphor    177
Metaphorical Name      101
Meaning                100
Extra Notes             46
dtype: int64

In [6]:

# Keep only the rows whose "Count of \nMetaphor" value is exactly 1.
df = df.loc[df["Count of \nMetaphor"].eq(1)]


In [12]:
def _build_mapping():
    """Assemble the index-creation body: index settings, analysis chain and field mappings.

    Returns:
        dict with two top-level keys, ``"settings"`` and ``"mappings"``.
    """

    def classic_analyzer(*token_filters):
        # Custom analyzer: `classic` tokenizer, then the given token filters in order.
        return {
            "type": "custom",
            "tokenizer": "classic",
            "filter": list(token_filters),
        }

    def front_edge_ngram(min_gram):
        # Edge n-gram token filter; only the minimum gram length varies between variants.
        return {
            "type": "edge_ngram",
            "min_gram": min_gram,
            "max_gram": 20,
            "side": "front",
        }

    def text_with_keyword(analyzer_name):
        # Analysed text field with a `keyword` sub-field (ignore_above 256).
        return {
            "type": "text",
            "analyzer": analyzer_name,
            "fields": {
                "keyword": {
                    "type": "keyword",
                    "ignore_above": 256,
                }
            },
        }

    # In the tamil_analyzer_* names the suffixes list the filters in the order
    # they are applied: sw = custom_stopword, st = custom_stemmer,
    # syn = custom_synonym.
    analyzers = {
        "tamil_ngram_analyzer": classic_analyzer("ngram_filter"),
        "custom_analyzer": {
            "type": "custom",
            "tokenizer": "standard",
            "filter": ["lowercase"],
        },
        "tamil_ngram_analyzer_1": classic_analyzer("ngram_filter_1"),
        "tamil_ngram_analyzer_2": classic_analyzer("ngram_filter_2"),
        "tamil_analyzer_sw": classic_analyzer("custom_stopword"),
        "tamil_analyzer_st": classic_analyzer("custom_stemmer"),
        "tamil_analyzer_st_sw": classic_analyzer("custom_stemmer", "custom_stopword"),
        "tamil_analyzer_sw_st": classic_analyzer("custom_stopword", "custom_stemmer"),
        "tamil_analyzer_syn": classic_analyzer("custom_synonym"),
        "tamil_analyzer_syn_sw": classic_analyzer("custom_synonym", "custom_stopword"),
        "tamil_analyzer_syn_st": classic_analyzer("custom_synonym", "custom_stemmer"),
        "tamil_analyzer_syn_sw_st": classic_analyzer(
            "custom_synonym", "custom_stopword", "custom_stemmer"
        ),
    }

    # The stopword, stemmer and synonym filters read their rules from the
    # relative paths given below.
    token_filters = {
        "ngram_filter": front_edge_ngram(2),
        "ngram_filter_1": front_edge_ngram(1),
        "ngram_filter_2": front_edge_ngram(4),
        "custom_stopword": {
            "type": "stop",
            "stopwords_path": "analyze/stopwords.txt",
        },
        "custom_stemmer": {
            "type": "stemmer_override",
            "rules_path": "analyze/stem.txt",
        },
        "custom_synonym": {
            "type": "synonym",
            "synonyms_path": "analyze/synonyms.txt",
        },
    }

    # (field name, analyzer) pairs, in the order the fields appear in the mapping.
    field_analyzers = (
        ("Author", "custom_analyzer"),
        ("Meaning", "tamil_analyzer_syn_sw_st"),
        ("Lines", "tamil_analyzer_syn_sw_st"),
        ("Metaphorical Name", "custom_analyzer"),
        ("Poem", "tamil_analyzer_syn_sw_st"),
        ("Poetic Opening", "tamil_analyzer_syn_sw_st"),
        ("Extra Notes", "tamil_analyzer_syn_sw_st"),
    )
    properties = {
        field: text_with_keyword(analyzer_name)
        for field, analyzer_name in field_analyzers
    }
    properties["Year"] = {"type": "float"}

    return {
        "settings": {
            "index": {
                "number_of_shards": 1,
                "number_of_replicas": 1,
            },
            "analysis": {
                "analyzer": analyzers,
                "filter": token_filters,
            },
        },
        "mappings": {
            "properties": properties,
        },
    }


# Body passed to the index-creation call: settings plus field mappings.
mapping = _build_mapping()


In [13]:
from elasticsearch import Elasticsearch

# Connection settings are read from the environment so that no secret is stored
# in the notebook. When ELASTIC_PASSWORD is unset the password is requested
# interactively instead.
# NOTE(review): a password was previously committed in this cell -- treat it as
# leaked and rotate it.
ES_URL = os.environ.get("ELASTICSEARCH_URL", "http://localhost:9200")
ES_USER = os.environ.get("ELASTIC_USER", "elastic")
ES_PASSWORD = os.environ.get("ELASTIC_PASSWORD") or getpass("Elasticsearch password: ")

# verify_certs=False turns off TLS certificate verification; presumably it only
# has an effect if ES_URL is switched to an https:// endpoint -- confirm before
# pointing this at a non-local cluster.
es = Elasticsearch(ES_URL, verify_certs=False, basic_auth=(ES_USER, ES_PASSWORD))

# Convert the pandas DataFrame to JSON records,
# then bulk-upload them into an index created with `mapping`.


def upload_to_elastic(df, index_name='poems_9'):
    """Create the target index if it is missing, then bulk-index every row of ``df``.

    Uses the module-level ``es`` client and the module-level ``mapping`` as the
    index-creation body.

    Args:
        df: pandas DataFrame; each row is sent as one document whose fields are
            the column names.
        index_name: Name of the index to create and fill. Defaults to
            ``'poems_9'``.

    Prints the value returned by ``helpers.bulk``.
    """
    import json

    from elasticsearch import helpers

    # Creating an index that already exists is rejected by Elasticsearch, so the
    # index is only created when absent; this keeps the cell re-runnable.
    if not es.indices.exists(index=index_name):
        es.indices.create(index=index_name, body=mapping)

    # Serialise the frame to JSON text and parse it back to obtain a list of
    # plain dicts, one per row (per the pandas docs, missing values are written
    # as JSON null and therefore come back as None).
    records = json.loads(df.to_json(orient='records'))

    # NOTE(review): the documents carry no explicit id, so re-running this
    # function presumably indexes the same rows again as new documents --
    # confirm whether that is acceptable.
    res = helpers.bulk(es, records, index=index_name)

    print(res)


upload_to_elastic(df)


(100, [])
