Skip to content

Commit

Permalink
last checkpoint: include cities in the nested set model calculation
Browse files Browse the repository at this point in the history
Previously, the contents of the country files were read and inserted into the database in chunks of 1000.
The Nested Set Model was then built by reading the hierarchy.txt file and updating the records in the database (slow).

At this point, the Nested Set Model is built first, by reading the hierarchy.txt file and combining it with the contents
of the admin2Codes.txt file, which contains the hierarchy for cities (even though geonames.org has up to ADM5 divisions,
it cuts off at ADM2 for cities). After the nestedSet is built, the contents of the country files are mapped into Geonames
with the nestedSet properties (_lft, _rgt, depth and parent_id) and inserted into the database.
  • Loading branch information
Parables committed May 30, 2023
1 parent 7e55142 commit 1cd7374
Show file tree
Hide file tree
Showing 8 changed files with 347 additions and 197 deletions.
11 changes: 6 additions & 5 deletions src/Actions/LoadGeonamesAction.php
Original file line number Diff line number Diff line change
Expand Up @@ -23,10 +23,11 @@ public function execute(LazyCollection $geonamesCollection, int $chunkSize = 100
DB::table('geonames')->truncate();
}

return $geonamesCollection
->chunk($chunkSize)
->each(function (LazyCollection $collection) {
DB::table('geonames')->insertOrIgnore($collection->all());
});
$chunks = $geonamesCollection->chunk($chunkSize);

return $chunks->each(function (LazyCollection $collection, int $index) use ($chunks) {
$this->toastable->toast("Inserting next batch... " . ($index + 1) . "/" . $chunks->count());
DB::table('geonames')->insertOrIgnore($collection->all());
});
}
}
2 changes: 1 addition & 1 deletion src/Actions/ReadFileAction.php
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@ public function execute(string $fileName): LazyCollection
return LazyCollection::empty();
}

$this->toastable->toast('Reading file: ' . $fileName);
// $this->toastable->toast('Reading file: ' . $fileName);

$collection = LazyCollection::make(function () use ($fileName) {
$fileStream = fopen($fileName, 'r');
Expand Down
28 changes: 28 additions & 0 deletions src/Actions/ReadFilesAction.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,28 @@
<?php

declare(strict_types=1);

namespace Parables\Geo\Actions;

use Illuminate\Support\LazyCollection;
use Parables\Geo\Actions\Concerns\HasToastable;

class ReadFilesAction
{
    use HasToastable;

    /**
     * Build a lazy sequence that hands each given file name to a ReadFileAction.
     *
     * A single ReadFileAction, configured with this action's toastable, is
     * reused for every file. Its execute() is only invoked from inside the
     * generator, i.e. while the returned collection is being iterated.
     *
     * @param array<int,string> $fileNames
     * @return LazyCollection<int, LazyCollection>
     */
    public function execute(array $fileNames): LazyCollection
    {
        $reader = (new ReadFileAction)->toastable($this->toastable);

        // One yielded item per file name, in the order the names were given.
        $readEach = function () use ($reader, $fileNames) {
            foreach ($fileNames as $path) {
                yield $reader->execute($path);
            }
        };

        return LazyCollection::make($readEach);
    }
}
21 changes: 21 additions & 0 deletions src/Actions/ReadFilesActionTest.php
Original file line number Diff line number Diff line change
@@ -0,0 +1,21 @@
<?php

declare(strict_types=1);

namespace Parables\Geo\Actions;

use Illuminate\Support\Arr;
use Illuminate\Support\LazyCollection;
use Parables\Geo\Actions\Concerns\HasToastable;
use Parables\Geo\Actions\Fixtures\Toastable;

it('can read a list of files into a LazyCollection of file contents', function () {
    // The file names are derived from the keys of the cached countries.json map.
    $cachePath = storage_path('/geo/countries.json');
    $cached = json_decode(file_get_contents($cachePath), associative: true);

    $fileNames = [];
    foreach (array_keys(Arr::wrap($cached)) as $key) {
        $fileNames[] = $key . '.txt';
    }

    $action = (new ReadFilesAction)->toastable(new Toastable);
    $files = $action->execute($fileNames);

    // One inner collection is expected per cached entry.
    expect($files->count())->toBe(253);
});
18 changes: 16 additions & 2 deletions src/Actions/TransformGeonamesAction.php
Original file line number Diff line number Diff line change
Expand Up @@ -13,11 +13,25 @@ class TransformGeonamesAction
use HasToastable;
/**
* @param LazyCollection $lines
* @param array $nestedSet
*/
public function execute(LazyCollection $lines, bool $toPayload = true, bool $idAsindex = true): LazyCollection
public function execute(LazyCollection $lines, array $nestedSet = [], bool $toPayload = true, bool $idAsindex = true): LazyCollection
{
$geonamesCollection = $lines->map(function (string $line, string &$key) use ($toPayload, $idAsindex) {
$geonamesCollection = $lines->map(function (string $line, string &$key) use ($nestedSet, $toPayload, $idAsindex) {
$geoname = GeoName::fromString($line);

$this->toastable->toast('Getting node from nestedSet...' . $geoname->id());
// INFO: Step1: Get parent using id
$node = $nestedSet[$geoname->id()] ?? null;
if (empty($node)) {
// INFO: Step2: Get parent using a concatenation of the countryCode.admin1Code.admin2Code(if present)
// Skip those with admin1Code === 00
} else {
$this->toastable->toast('Node Found...');
$geoname->nodeFromPayload($node);
}

$this->toastable->toast('Node not found for ' . $geoname->id() . ' ...');
$key = $idAsindex ? $geoname->id() : $key;
return $toPayload ? $geoname->toPayload() : $geoname;
});
Expand Down
21 changes: 15 additions & 6 deletions src/Actions/TransformGeonamesActionTest.php
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,25 @@

$toastable = new Toastable();

$geonamesCollection = (new ReadFileAction)
$toastable->toast('Reading GH.txt... ');
$lines = (new ReadFileAction)
->toastable($toastable)
// ->execute(storage_path('geo/GH.txt'));
->execute(storage_path('geo/allCountries.txt'));
->execute(storage_path('geo/GH.txt'));
//->execute(storage_path('geo/allCountries.txt'));

$geonamesCollection = (new TransformGeonamesAction)
$toastable->toast('Reading hierarchy.txt...');
$nestedSet = (new BuildNestedSetModelAction)->toastable(new Toastable)->execute();

$stream = fopen(storage_path("geo/nestedSet.json"), 'w');
fwrite($stream, json_encode($nestedSet, JSON_PRETTY_PRINT));
fclose($stream);

$toastable->toast('Transforming GeoNames...');
$lines = (new TransformGeonamesAction)
->toastable($toastable)
->execute($geonamesCollection);
->execute($lines, $nestedSet);

print_r($geonamesCollection->all());
print_r($lines->all());

expect('hi')->toBe('hi');
});
134 changes: 74 additions & 60 deletions src/Commands/GeoCommand.php
Original file line number Diff line number Diff line change
Expand Up @@ -6,18 +6,17 @@

use Illuminate\Console\Command;
use Illuminate\Support\Arr;
use Illuminate\Support\Facades\DB;
use Illuminate\Support\LazyCollection;
use Parables\Geo\Actions\BuildNestedSetAction;
use Parables\Geo\Actions\BuildNestedSetModelAction;
use Parables\Geo\Actions\Concerns\Toaster;
use Parables\Geo\Actions\Contracts\Toastable;
use Parables\Geo\Actions\DownloadAction;
use Parables\Geo\Actions\FilterFileNames;
use Parables\Geo\Actions\GetDownloadLinksAction;
use Parables\Geo\Actions\GetHierarchyAction;
use Parables\Geo\Actions\ListCountries;
use Parables\Geo\Actions\LoadGeonamesAction;
use Parables\Geo\Actions\ReadFileAction;
use Parables\Geo\Actions\ReadFilesAction;
use Parables\Geo\Actions\TransformGeonamesAction;
use Parables\Geo\Actions\UnzipAction;

Expand All @@ -43,19 +42,19 @@ public function handle(): int
return self::SUCCESS;
}

// $countries = $this->fetchCountries();
$countries = $this->fetchCountries();

// $fileNames = $this->askToSelectCountries(countries: $countries);
$fileNames = $this->askToSelectCountries(countries: $countries);

// $this->downloadFiles(fileNames: $fileNames);
$this->downloadFiles(fileNames: $fileNames);

// $fileNames = $this->unzipFiles(fileNames: $fileNames);
$fileNames = $this->unzipFiles(fileNames: array_slice($fileNames, 1)); // skip admin2Codes.txt

// $files = $this->readFiles(fileNames: $fileNames);
$contentsOfGeonameFiles = $this->readGeonameFiles(fileNames: array_slice($fileNames, 2)); // skip admin2Codes.txt and hierarchy.txt

// $this->loadGeonames(files: $files);
$nestedSet = $this->buildNestedSetModel(contentsOfGeonameFiles: $contentsOfGeonameFiles);

$this->buildNestedSetModel();
$this->loadGeonames(contentsOfGeonameFiles: $contentsOfGeonameFiles, nestedSet: $nestedSet);

$this->newLine(2);
$this->comment('All done... Enjoy :-)');
Expand Down Expand Up @@ -89,6 +88,7 @@ public function fetchCountries(): array
if ($fetchUpdates) {
return $this->fetchUpdates(cacheFile: $cacheFile);
}

// read from cache
return Arr::wrap(json_decode(file_get_contents($cacheFile), associative: true));
}
Expand All @@ -115,16 +115,14 @@ public function fetchUpdates(string $cacheFile): array
$this->info('Update complete...');
$this->info('Fetched ' . count($countries) . ' countries from: ' . self::GEONAMES_ORG);

// write to cache
$stream = fopen($cacheFile, 'w');
fwrite(stream: $stream, data: json_encode($countries, JSON_PRETTY_PRINT));
fclose($stream);
$this->writeToFile(fileName: $cacheFile, content: $countries);

return $countries;
}

/**
* @param array<int,string> $countries
* @return array<int,string>
*/
public function askToSelectCountries(array $countries): array
{
Expand Down Expand Up @@ -167,12 +165,12 @@ public function askToSelectCountries(array $countries): array

/**
* @param array<int,string> $countryCodes
* @return array<int,string>
*/
public function appendFileExtension(array $countryCodes): array
{
$fileNames = array_map(fn ($code) => $code . '.zip', $countryCodes);
$fileNames[] = 'no-country.zip';
return $fileNames;
$fileNames = ['admin2Codes.txt', 'hierarchy.zip', 'no-country.zip'];
return [$fileNames] + array_map(fn ($code) => $code . '.zip', $countryCodes);
}

/**
Expand All @@ -194,10 +192,12 @@ public function downloadFiles(array $fileNames): void
$overwrite = $overwrite === 'overwrite' ? true : false;
$downloadAction = (new DownloadAction)->toastable($this);

$this->withProgressBar($fileNames, function (string $fileName) use ($downloadAction, $overwrite) {
// FIX: reuse progressBar for all long processes... https://symfony.com/doc/current/components/console/helpers/progressbar.html
foreach ($fileNames as $fileName) {
$downloadAction->execute(fileName: $fileName, overwrite: $overwrite);
});
}
}

/**
* @param array<int,mixed> $fileNames
*/
Expand All @@ -208,57 +208,84 @@ public function unzipFiles(array $fileNames): array
$result = [];
$unzipAction = (new UnzipAction)->toastable($this);

$this->withProgressBar($fileNames, function (string $fileName) use ($unzipAction, &$result) {
// FIX: reuse progressBar for all long processes... https://symfony.com/doc/current/components/console/helpers/progressbar.html
foreach ($fileNames as $fileName) {
$result[] = $unzipAction->execute(fileName: $fileName, overwrite: true);
});
}

return $result;
}

/**
* @param array<int,string> $fileNames
* @return LazyCollection<int, LazyCollection>
*/
public function readFiles(array $fileNames): LazyCollection
public function readGeonameFiles(array $fileNames): LazyCollection
{
$readFileAction = (new ReadFileAction)->toastable($this);
return (new ReadFilesAction)->toastable($this)->execute($fileNames);
}

return LazyCollection::make(function () use ($fileNames, $readFileAction) {
foreach ($fileNames as $fileName) {
yield $readFileAction->execute($fileName);
}
});
/**
* @param LazyCollection<int, LazyCollection> $contentsOfGeonameFiles
*/
public function buildNestedSetModel(LazyCollection $contentsOfGeonameFiles): array
{
$this->info('Building Nested Set Model...');

$hierarchy = (new GetHierarchyAction)
->toastable($this)
->execute(contentsOfGeonameFiles: $contentsOfGeonameFiles);

$nestedSet = (new BuildNestedSetModelAction)
->toastable($this)
->execute(hierarchy: $hierarchy, nestChildren: false);

$this->writeToFile(fileName: storage_path('geo/nestedSet.json'), content: $nestedSet);

return $nestedSet;
}


/**
* @param LazyCollection<int, LazyCollection> $files
* @param LazyCollection<int, LazyCollection> $contentsOfGeonameFiles
* @param array<int,mixed> $nestedSet
*/
public function loadGeonames(LazyCollection $files): void
public function loadGeonames(LazyCollection $contentsOfGeonameFiles, array $nestedSet): void
{
$this->newLine(2);
$this->info('Processing file contents in batches...');
$this->info('This might take a while so please be patient...');

$progressBar = $this->output->createProgressBar($files->count());
$progressBar = $this->output->createProgressBar($contentsOfGeonameFiles->count());
$progressBar->start();

$transformGeonamesAction = (new TransformGeonamesAction)->toastable($this);
$loadGeonamesAction = (new LoadGeonamesAction)->toastable($this);

$chunks = $files->chunk(50);
$chunks->each(function (LazyCollection $files, int $index) use ($transformGeonamesAction, $loadGeonamesAction, $chunks, $progressBar) {
// first delete the file if it exists
$geonameFile = storage_path('geo/geonames.json');
if (file_exists($geonameFile)) {
unlink($geonameFile);
}

$chunks = $contentsOfGeonameFiles->chunk(50);
$chunks->each(function (LazyCollection $contentsOfGeonameFiles, int $index) use ($geonameFile, $nestedSet, $transformGeonamesAction, $loadGeonamesAction, $chunks, $progressBar) {
$this->newLine(2);
$this->info('Processing batch: ' . $index . '/' . $chunks->count());
$this->info('Processing batch: ' . ($index + 1) . '/' . $chunks->count());

$files->each(function (LazyCollection $lines) use ($transformGeonamesAction, $loadGeonamesAction, $progressBar) {
$contentsOfGeonameFiles->each(function (LazyCollection $fileContents) use ($geonameFile, $nestedSet, $transformGeonamesAction, $loadGeonamesAction, $progressBar) {
$this->newLine(2);

$geonamesCollection = $transformGeonamesAction->execute(
lines: $fileContents,
nestedSet: $nestedSet,
toPayload: true,
idAsindex: true
);

$this->writeToFile(fileName: $geonameFile, content: $geonamesCollection, mode: 'a');

$loadGeonamesAction->execute(
geonamesCollection: $transformGeonamesAction->execute(
lines: $lines,
toPayload: true,
idAsindex: true
),
geonamesCollection: $geonamesCollection,
chunkSize: 1000,
truncateBeforeInsert: false
);
Expand All @@ -269,28 +296,15 @@ public function loadGeonames(LazyCollection $files): void
$progressBar->finish();
}

public function buildNestedSetModel(): void
private function writeToFile(string $fileName, mixed $content, string $mode = 'w'): void
{
$this->info('Building Nested Set Model...');

// Download and unzip the hierarchy.zip file
$fileNames = ['hierarchy.zip'];
$this->downloadFiles(fileNames: $fileNames);
$this->unzipFiles(fileNames: $fileNames);

// then the BuildNestedSetModelAction will take it from there
LazyCollection::wrap(
(new BuildNestedSetModelAction)
->toastable($this)
->execute()
)->chunk(1000)
->each(function (LazyCollection $collection) {
$collection->each(function (array $payload) {
DB::table('geonames')->where('id', $payload['id'])->update($payload);
});
});
// write to cache
$stream = fopen($fileName, $mode);
fwrite(stream: $stream, data: json_encode($content, JSON_PRETTY_PRINT));
fclose($stream);
}


// TODO: Download and process the postal codes from https://download.geonames.org/export/zip/
// $auxilaryFileNames = ['hierarchy.zip', /*'alternateNamesV2.zip',*/ 'countryInfo.txt',];
// process countryInfo file
Expand Down

0 comments on commit 1cd7374

Please sign in to comment.