Skip to content

Commit

Permalink
write comments
Browse files Browse the repository at this point in the history
  • Loading branch information
ShangtongZhang committed Jun 21, 2014
1 parent 34d6fe0 commit a491785
Show file tree
Hide file tree
Showing 2 changed files with 97 additions and 3 deletions.
27 changes: 25 additions & 2 deletions xapian-core/backends/brass/brass_postlist.cc
Original file line number Diff line number Diff line change
Expand Up @@ -673,10 +673,12 @@ FixedWidthChunk::FixedWidthChunk( map<Xapian::docid,Xapian::termcount>::const_it
buildVector();
}

//To apply fixed width format, we first build a vector based on map<did,len>
bool FixedWidthChunk::buildVector( )
{
if ( pl_start == pl_end )
{
//the map is empty.
LOGLINE( DB, "Desired postlist is empty!" );
return false;
}
Expand All @@ -687,7 +689,18 @@ bool FixedWidthChunk::buildVector( )
{
unsigned length_contiguous = 1;
Xapian::docid last_docid = it->first, cur_docid = 0;

//number of bytes to encode a length.
unsigned max_bytes = get_max_bytes(it->second);

/* Different lengths need different numbers of bytes, and every length
* in this continuous block is encoded with the max number of bytes
* needed by any of them, so some bytes may be wasted.
* @used_bytes: total number of bytes used.
* @good_bytes: number of those bytes that are not wasted.
* We require the ratio good_bytes/used_bytes to be bigger than a certain
* value, so that not too much space is wasted. */
unsigned used_bytes = 0;
unsigned good_bytes = 0;

Expand Down Expand Up @@ -744,6 +757,7 @@ bool FixedWidthChunk::buildVector( )
return true;
}

//Encode the vector generated by @buildVector.
bool FixedWidthChunk::encode( string& chunk ) const
{
LOGCALL(DB, bool, "FixedWidthChunk::encode", chunk.size() );
Expand Down Expand Up @@ -918,6 +932,7 @@ bool DoclenChunkWriter::get_new_doclen( )
const char* pos = chunk_from.data();
const char* end = pos+chunk_from.size();

//deal with the header of the chunk
if ( is_first_chunk )
{
read_start_of_first_chunk( &pos, end, NULL, NULL );
Expand All @@ -926,6 +941,7 @@ bool DoclenChunkWriter::get_new_doclen( )

if ( pos == end )
{
//original chunk is empty
LOGLINE( DB, "empty chunk!" );
map<Xapian::docid,Xapian::termcount>::const_iterator it = changes_start;
for ( ; it!=changes_end ; ++it )
Expand All @@ -944,6 +960,7 @@ bool DoclenChunkWriter::get_new_doclen( )
}
else
{
//read old map of <docid,length> from original chunk
Xapian::docid cur_did = 0, inc_did = 0;
Xapian::termcount doc_len = 0;
cur_did = first_did_in_chunk;
Expand All @@ -955,7 +972,6 @@ bool DoclenChunkWriter::get_new_doclen( )
cur_did += inc_did;
unpack_uint( &pos, end, &doc_len );
new_doclen.insert( new_doclen.end(), make_pair<Xapian::docid,Xapian::termcount>(cur_did,doc_len) );
//new_doclen[cur_did] = doc_len;
continue;
}
else
Expand All @@ -969,7 +985,6 @@ bool DoclenChunkWriter::get_new_doclen( )
{
unpack_uint_in_bytes( &pos, bytes, &doc_len );
new_doclen.insert( new_doclen.end(), make_pair<Xapian::docid,Xapian::termcount>(cur_did,doc_len) );
//new_doclen[cur_did] = doc_len;
cur_did++;
}
cur_did--;
Expand All @@ -979,6 +994,7 @@ bool DoclenChunkWriter::get_new_doclen( )

LOGVALUE( DB, new_doclen.size() );

//merge old map with @changes, get new map of <docid,length>
map<Xapian::docid,Xapian::termcount>::const_iterator chg_it = changes_start;
map<Xapian::docid,Xapian::termcount>::iterator ori_it = new_doclen.begin();

Expand Down Expand Up @@ -1029,14 +1045,21 @@ bool DoclenChunkWriter::get_new_doclen( )

bool DoclenChunkWriter::merge_doclen_changes( )
{
//get new map of <docid,length>
get_new_doclen( );

//build new chunk from new doclen map.
map<Xapian::docid,Xapian::termcount>::const_iterator start_pos, end_pos;
start_pos = end_pos = new_doclen.begin();
if ( new_doclen.size() == 0 )
{
return true;
}


//If the number of entries in the new doclen map does not exceed
//MAX_ENTRIES_IN_CHUNK, one chunk is enough.
//Otherwise we need to split it into several chunks.
if ( new_doclen.size() <= MAX_ENTRIES_IN_CHUNK )
{
string cur_chunk;
Expand Down
73 changes: 72 additions & 1 deletion xapian-core/backends/brass/brass_postlist.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
* Copyright 2002 Ananova Ltd
* Copyright 2002,2003,2004,2005,2007,2008,2009,2011,2013,2014 Olly Betts
* Copyright 2007,2009 Lemur Consulting Ltd
* Copyright 2014 Shangtong Zhang
*
* This program is free software; you can redistribute it and/or
* modify it under the terms of the GNU General Public License as
Expand Down Expand Up @@ -40,9 +41,21 @@
#include <string>
#include <vector>

//The following four macros are for the fixed width doclen chunk.

//An indicator used in the fixed width doclen chunk.
//NOTE(review): "SEPERATOR" is a misspelling of "separator"; the name is
//kept as-is here because renaming it would affect every user of the macro.
#define SEPERATOR ((unsigned)-1)

/* If the length of a continuous block is more than this value,
* the fixed width format will be applied. */
#define DOCLEN_CHUNK_MIN_CONTIGUOUS_LENGTH 5

/* The ratio is good bytes / used bytes in a block of fixed width format.
* The good bytes ratio must be bigger than this value
* for the fixed width format to be applied. */
#define DOCLEN_CHUNK_MIN_GOOD_BYTES_RATIO 0.8

/* A fixed width doclen chunk has at most this many entries. */
#define MAX_ENTRIES_IN_CHUNK 2000

using namespace std;
Expand All @@ -55,7 +68,23 @@ namespace Brass {
class PostlistChunkWriter;
}


/* The Format Of Fixed Width Doclen Chunk
*
* In the normal format of a postlist chunk for doc lengths,
* we first encode the increment of the did, then the doc length.
* But when doc ids are continuous, there is a more efficient way to encode them.
* In the following, we refer to a run of continuous ids as a continuous block.
* When encoding doc lengths, if we discover a continuous block,
* we first encode -1 as an indicator, then the increment of the did,
* the number of entries in this block, and the number of bytes used to
* encode a length, followed by the lengths themselves.
* For example, if we have the tuples < did, len >
* <1,7>,<2,5>,<3,6>,<4,15>,<5,257>,
* then the chunk will be
* -1 1 5 2 7 5 6 15 257 */


//This class is used to encode a map<did,len> to a chunk.
//It doesn't deal with the header of the chunk.
class FixedWidthChunk
{
private:
Expand All @@ -65,9 +94,13 @@ class FixedWidthChunk
public:
FixedWidthChunk( map<Xapian::docid,Xapian::termcount>::const_iterator pl_start_,
map<Xapian::docid,Xapian::termcount>::const_iterator pl_end_);

//The encoded map will be appended to @chunk.
bool encode( string& chunk ) const;
};

//This class is used to read fixed width format doclen chunk.
//It doesn't deal with the header of the chunk.
class FixedWidthChunkReader
{
private:
Expand All @@ -79,14 +112,24 @@ class FixedWidthChunkReader
Xapian::docid cur_did;
Xapian::termcount cur_length;
bool is_at_end;

//indicating whether we are in a continuous block.
bool is_in_block;

//If we are in a continuous block,
//the following two variables hold the basic info for this continuous block.
unsigned len_info;
unsigned bytes_info;

Xapian::docid did_before_block;

//first doc id in this chunk
Xapian::docid first_did_in_chunk;


public:
//@pos_ : a pointer to the end of the header of the chunk.
//@end_ : a pointer to the end of the chunk.
FixedWidthChunkReader( const char* pos_, const char* end_, Xapian::docid first_did_in_chunk_ )
: ori_pos(pos_), pos(pos_), pos_of_block(NULL), end(end_), cur_did(0), cur_length(0),
is_at_end(false),is_in_block(false),len_info(0),bytes_info(0),
Expand All @@ -106,37 +149,60 @@ class FixedWidthChunkReader
LOGVALUE(DB, cur_length );
};

// Jump to @desired_did.
// If that fails, the reader ends up at the did just after @desired_did.
bool jump_to( Xapian::docid desired_did );

// Move to the next did in the chunk.
// If there are no more dids, is_at_end is set to true.
bool next();

// Return the current docid (cur_did).
// NOTE(review): this accessor only reads a member; consider marking it const.
Xapian::docid get_docid()
{
return cur_did;
}

// Return the length of the doc with the current docid (cur_length).
// NOTE(review): this accessor only reads a member; consider marking it const.
Xapian::termcount get_doclength()
{
return cur_length;
}

// Return the is_at_end flag.
// NOTE(review): this accessor only reads a member; consider marking it const.
bool at_end()
{
return is_at_end;
}
};


//This class is used to update fixed width doclen chunk.
class DoclenChunkWriter
{
private:

//the original chunk
const string& chunk_from;

//the changes of doc length
map<Xapian::docid,Xapian::termcount>::const_iterator changes_start, changes_end;

BrassPostListTable* postlist_table;

bool is_first_chunk;
bool is_last_chunk;
Xapian::docid first_did_in_chunk;

//new map of doc length
map<Xapian::docid,Xapian::termcount> new_doclen;

//merge old map and new map
bool get_new_doclen( );
public:

//@chunk_from_ : original chunk
//@changes_start_ @changes_end_ : iterator of map of changes
DoclenChunkWriter( const string& chunk_from_,
map<Xapian::docid,Xapian::termcount>::const_iterator& changes_start_,
map<Xapian::docid,Xapian::termcount>::const_iterator& changes_end_,
Expand All @@ -149,9 +215,14 @@ class DoclenChunkWriter
LOGCALL_CTOR(DB, "DoclenChunkWriter", is_first_chunk_ | first_did_in_chunk_ );
is_last_chunk = true;
}

//Builds and inserts the new chunk.
//Make sure the old chunk is deleted before calling this function.
bool merge_doclen_changes( );
};

//This class is just a wrapper around FixedWidthChunkReader;
//it only deals with the header of the chunk.
class DoclenChunkReader
{
private:
Expand Down

0 comments on commit a491785

Please sign in to comment.