In [None]:
%%pom
dependencies:
- org.apache.lucene:lucene-core:9.7.0
- org.apache.lucene:lucene-analysis-common:9.7.0
- org.apache.lucene:lucene-queryparser:9.7.0

In [None]:
import java.io.BufferedReader;
import java.io.FileReader;
import java.io.IOException;
import java.io.StringReader;
import java.nio.file.Files;
import java.nio.file.Paths;

import java.util.Arrays;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.Map;
import java.util.List;

In [None]:
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.FilteringTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.en.EnglishAnalyzer;
import org.apache.lucene.analysis.en.EnglishPossessiveFilter;
import org.apache.lucene.analysis.en.KStemFilter;
import org.apache.lucene.analysis.en.PorterStemFilter;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.standard.StandardTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.Field.Store;
import org.apache.lucene.document.TextField;
import org.apache.lucene.document.IntField;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.FloatField;
import org.apache.lucene.document.FloatPoint;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.IndexOptions;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexWriterConfig;
import org.apache.lucene.index.LogDocMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.NoMergePolicy;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.StoredFields;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TermVectors;
import org.apache.lucene.search.BooleanQuery;
import org.apache.lucene.search.BooleanClause;
import org.apache.lucene.search.BooleanClause.Occur;
import org.apache.lucene.search.BoostQuery;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TermRangeQuery;
import org.apache.lucene.search.TermQuery;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.queryparser.classic.MultiFieldQueryParser;
import org.apache.lucene.queryparser.classic.QueryParser;
import org.apache.lucene.queryparser.classic.ParseException;
import org.apache.lucene.util.BytesRef;

# A) Import the CSV data into a Lucene index

In [None]:
// Input CSV holding the movie dataset that gets imported.
String fileImdbDataset = "imdb_top_1000.csv";
// Name of the index location (NOTE(review): presumably consumed by getDirectory(), which is defined elsewhere).
String pathIndex = "index";

In [None]:
/**
 * Reads a CSV file and converts every data row into a map of normalised field names.
 *
 * <p>The first line is treated as the header. Only the columns below are kept; all other
 * columns are ignored:
 * <ul>
 *   <li>{@code Series_Title} -> {@code title}</li>
 *   <li>{@code Released_Year} -> {@code year}</li>
 *   <li>{@code Runtime} -> {@code runtime} (the {@code " min"} suffix is removed)</li>
 *   <li>{@code Genre} -> {@code genre} (commas are removed)</li>
 *   <li>{@code IMDB_Rating} -> {@code rating}</li>
 *   <li>{@code Overview} -> {@code summary} (double quotes are removed)</li>
 *   <li>{@code Star1} .. {@code Star4} -> {@code actors} (joined with single spaces)</li>
 * </ul>
 *
 * <p>The file is processed line by line, so a quoted field that contains a line break is
 * not supported. Blank lines are skipped, and rows with fewer columns than the header
 * simply lack the missing fields.
 *
 * @param name path of the CSV file to read
 * @return one map per data row, in file order; empty if the file has no data rows
 * @throws IOException if the file cannot be opened or read
 */
List<Map<String, String>> readCollection(String name) throws IOException {
    List<Map<String, String>> docs = new ArrayList<>();
    // Split on commas that are followed by an even number of quote characters,
    // i.e. commas that are not inside a quoted field.
    String splitter = ",(?=(?:[^\"]*\"[^\"]*\")*[^\"]*$)";

    // try-with-resources closes the reader even when reading or parsing fails.
    try (BufferedReader reader = new BufferedReader(new FileReader(name))) {
        String header = reader.readLine();
        // An empty file has no header line; treat it as "no columns" instead of failing.
        String[] keys = (header == null) ? new String[0] : header.split(splitter, -1);

        String line;
        while ((line = reader.readLine()) != null) {
            if (line.isBlank()) {
                continue;
            }
            // Limit -1 keeps trailing empty columns so positions stay aligned with the header.
            String[] values = line.split(splitter, -1);
            Map<String, String> dataMap = new HashMap<>();

            // Never index past the end of a short row.
            int columns = Math.min(keys.length, values.length);
            for (int i = 0; i < columns; i++) {
                String value = values[i];
                switch (keys[i]) {
                    case "Series_Title":
                        dataMap.put("title", value);
                        break;
                    case "Released_Year":
                        dataMap.put("year", value);
                        break;
                    case "Runtime":
                        dataMap.put("runtime", value.replace(" min", ""));
                        break;
                    case "Genre":
                        dataMap.put("genre", value.replace(",", ""));
                        break;
                    case "IMDB_Rating":
                        dataMap.put("rating", value);
                        break;
                    case "Overview":
                        dataMap.put("summary", value.replace("\"", ""));
                        break;
                    case "Star1":
                    case "Star2":
                    case "Star3":
                    case "Star4":
                        // merge() starts the list with the first star column that is present,
                        // so a missing Star1 no longer yields the literal text "null".
                        dataMap.merge("actors", value, (joined, next) -> joined + " " + next);
                        break;
                    default:
                        // Column is not part of the indexed schema.
                        break;
                }
            }
            docs.add(dataMap);
        }
    }

    // print summary
    System.out.println("Read " + docs.size() + " documents from " + name);
    return docs;
}

// Load the dataset and show the first parsed row as a quick sanity check.
var collection = readCollection(fileImdbDataset);
System.out.println("\nfirst document:");
if (collection.isEmpty()) {
    // A CSV without data rows would otherwise fail with IndexOutOfBoundsException on get(0).
    System.out.println("(no documents)");
} else {
    collection.get(0).forEach((key, value) -> System.out.printf("%10s: %s%n", key, value));
}

# B) - G)

In [None]:
/**
 * Builds a searcher on top of a freshly opened reader for the directory returned by
 * {@code getDirectory()}.
 *
 * <p>Every call opens a new reader and nothing in this method closes it.
 *
 * @return a new searcher over the index
 * @throws IOException if the index cannot be opened
 */
IndexSearcher getIndexSearcher() throws IOException {
    var directory = getDirectory();
    var reader = DirectoryReader.open(directory);
    return new IndexSearcher(reader);
}

/**
 * Creates a parser for queries that span several fields: terms without an explicit
 * field prefix are looked up in title, summary, genre and actors.
 *
 * @return a multi-field query parser using the analyzer from {@code getAnalyzer()}
 * @throws IOException declared for callers; NOTE(review): presumably raised by getAnalyzer() - confirm
 */
QueryParser getQueryParser() throws IOException {
    String[] searchFields = {"title", "summary", "genre", "actors"};
    var analyzer = getAnalyzer();
    return new MultiFieldQueryParser(searchFields, analyzer);
}

/**
 * Creates a parser whose default field is the given one.
 *
 * @param field field searched by terms that carry no explicit field prefix
 * @return a query parser using the analyzer from {@code getAnalyzer()}
 * @throws IOException declared for callers; NOTE(review): presumably raised by getAnalyzer() - confirm
 */
QueryParser getQueryParser(String field) throws IOException {
    var analyzer = getAnalyzer();
    QueryParser singleFieldParser = new QueryParser(field, analyzer);
    return singleFieldParser;
}

In [None]:
/**
 * Prints a ranked table of search hits (rank, document id, score, year, runtime, rating, title).
 *
 * <p>The stored fields of all hits are loaded through a single index reader that is closed
 * before the method returns, instead of opening a new, never-closed reader for every hit.
 * The document ids in {@code results} are resolved against a reader opened here, so the
 * index must not have changed since the search was executed.
 *
 * @param query   textual form of the query, printed as the table caption
 * @param results hits to print, in the order given by {@code results.scoreDocs}
 * @throws IOException if the index cannot be opened or a stored document cannot be read
 */
void printResults(String query, TopDocs results) throws IOException {
    System.out.println("Query: " + query);
    System.out.printf("%3s %5s %6s %6s %7s %6s   %s\n", "#", "id", "Score", "Year", "Runtime", "Rating", "Title" );

    try (IndexReader reader = getIndexSearcher().getIndexReader()) {
        // StoredFields replaces the deprecated IndexSearcher.doc(int) lookup.
        StoredFields storedFields = reader.storedFields();
        int rank = 1;
        for (ScoreDoc hit : results.scoreDocs) {
            Document document = storedFields.document(hit.doc);
            System.out.printf("%3d %5d %6.2f %6s %7s %6s   %s\n", rank++, hit.doc, hit.score,
                document.get("year"), document.get("runtime"), document.get("rating"), document.get("title") );
        }
    }
    System.out.println();
}

/**
 * Parses each query string, runs it against the index and prints the top 10 hits.
 *
 * <p>All queries share one searcher; its reader is closed when the method finishes,
 * including when a query fails to parse.
 *
 * @param parser  parser used to turn each string into a query
 * @param queries query strings in the parser's syntax
 * @throws IOException    if the index cannot be opened or searched
 * @throws ParseException if one of the query strings is not valid for {@code parser}
 */
void searchExamples(QueryParser parser, String[] queries) throws IOException, ParseException {
    IndexSearcher searcher = getIndexSearcher();

    // The resource variable exists only so that the reader is closed on every exit path.
    try (IndexReader reader = searcher.getIndexReader()) {
        for (String query : queries) {
            printResults(query, searcher.search(parser.parse(query), 10));
            System.out.println();
        }
    }
}

/**
 * Runs an already built query against the index and prints the top 10 hits.
 *
 * <p>The reader behind the searcher is closed before the method returns.
 *
 * @param query query to execute; its {@code toString()} form is used as the caption
 * @throws IOException if the index cannot be opened or searched
 */
void searchQuery(Query query) throws IOException {
    IndexSearcher searcher = getIndexSearcher();

    // The resource variable exists only so that the reader is closed on every exit path.
    try (IndexReader reader = searcher.getIndexReader()) {
        TopDocs results = searcher.search(query, 10);
        printResults(query.toString(), results);
    }
}

In [None]:
searchExamples(getQueryParser(), new String[]{
    "star wars", 
    "drama morgan freeman", 
    "comedy"
});

In [None]:
// E Faceted search
searchExamples(getQueryParser(), new String[]{
    "genre:drama actors:morgan actors:freeman", 
    "genre:comedy",
});

In [None]:
// D-F Spell - check
searchExamples(getQueryParser(), new String[]{
    "title:leon",
    "title:l?on",
    "title:leon~0.6",
});

In [None]:
// G Pagination
searchExamples(getQueryParser(), new String[]{
    "title:{a TO b}",
    "year:{1990 TO 2000}",
    "year:1994",
    "runtime:142"
});

In [None]:
// Runtime must be exactly 142; a hit on actor "morgan" is optional.
// The runtime clause is added as FILTER here; swap in Occur.MUST or Occur.SHOULD
// for that clause to compare the resulting rankings.
var runtimeIs142 = IntField.newExactQuery("runtime", 142);
var actorMorgan = new TermQuery(new Term("actors", "morgan"));

var query = new BooleanQuery.Builder()
        .add(runtimeIs142, Occur.FILTER)
        .add(actorMorgan, Occur.SHOULD)
        .build();
searchQuery(query);

In [None]:
// Two filters (runtime between 120 and 180, year from "1990" up to but excluding "2000",
// compared as strings) plus an optional clause on actor "morgan".
var rangeBuilder = new BooleanQuery.Builder();
rangeBuilder.add(IntField.newRangeQuery("runtime", 120, 180), Occur.FILTER);
rangeBuilder.add(TermRangeQuery.newStringRange("year", "1990", "2000", true, false), Occur.FILTER);
rangeBuilder.add(new TermQuery(new Term("actors", "morgan")), Occur.SHOULD);

var query = rangeBuilder.build();
searchQuery(query);

In [None]:
// "star" is required in the title; genre "action" is an optional clause.
var titleHasStar = new TermQuery(new Term("title", "star"));
var genreIsAction = new TermQuery(new Term("genre", "action"));

var query = new BooleanQuery.Builder()
        .add(titleHasStar, Occur.MUST)
        .add(genreIsAction, Occur.SHOULD)
        .build();
searchQuery(query);

In [None]:
// Combined query: runtime 120..180 as a filter, "shawshank" required in the title,
// the year range and a boosted (x1.5) match on actor "morgan" as optional clauses.
var query = new BooleanQuery.Builder()
        .add(IntField.newRangeQuery("runtime", 120, 180), Occur.FILTER)
        .add(TermRangeQuery.newStringRange("year", "1990", "2000", true, false), Occur.SHOULD)
        .add(new TermQuery(new Term("title", "shawshank")), Occur.MUST)
        .add(new BoostQuery(new TermQuery(new Term("actors", "morgan")), 1.5f), Occur.SHOULD)
        .build();

// The searcher is kept in a notebook variable because explain() needs the same instance below.
IndexSearcher searcher = getIndexSearcher();
TopDocs results = searcher.search(query, 10);
printResults(query.toString(), results);

// Explain how the score of the best hit was computed; with no hits, scoreDocs[0] would throw.
if (results.scoreDocs.length > 0) {
    System.out.println(searcher.explain(query, results.scoreDocs[0].doc));
} else {
    System.out.println("No hits to explain.");
}