Skip to content

Commit

Permalink
Merge pull request #18 from geekwright/addstopwords
Browse files Browse the repository at this point in the history
Add StopWords Class
  • Loading branch information
mambax7 committed Nov 2, 2016
2 parents 4eb9f2d + 8c1471b commit 099a67c
Show file tree
Hide file tree
Showing 2 changed files with 100 additions and 38 deletions.
70 changes: 32 additions & 38 deletions src/Xmf/Metagen.php
Expand Up @@ -148,10 +148,10 @@ public static function generateKeywords(
);

foreach ($originalKeywords as $originalKeyword) {
if (static::checkStopWords($originalKeyword)) {
if (static::stopWordsObject()->check($originalKeyword)) {
$secondRoundKeywords = explode("'", $originalKeyword);
foreach ($secondRoundKeywords as $secondRoundKeyword) {
if (static::checkStopWords($secondRoundKeyword)
if (static::stopWordsObject()->check($secondRoundKeyword)
&& strlen($secondRoundKeyword) >= $minLength
) {
$keyCount[$secondRoundKeyword] =
Expand All @@ -173,39 +173,6 @@ public static function generateKeywords(
return $keywords;
}

/**
 * checkStopWords - classify a word as significant or as a stop word.
 *
 * The stop word table is built once, on first call, from the space
 * separated _XMF_STOPWORDS constant (the 'stopwords' language file is
 * loaded if the constant is not yet defined) and kept in a static local.
 * If the constant is still undefined after loading, a one entry table
 * containing '_' is used instead so the table is not rebuilt on each call.
 *
 * @param string $key the word to check
 *
 * @return bool True if word is significant, false if it is a stop word
 */
public static function checkStopWords($key)
{
    static $lookup = null;

    if (!$lookup) {
        if (!defined('_XMF_STOPWORDS')) {
            Language::load('stopwords');
        }
        $lookup = defined('_XMF_STOPWORDS')
            ? array_fill_keys(explode(' ', _XMF_STOPWORDS), true)
            : array('_' => true);
    }

    // nothing to compare against, so every word counts as significant
    if (empty($lookup)) {
        return true;
    }

    // the comparison is case-insensitive: lower-case the word before lookup
    $lowered = function_exists('mb_strtolower')
        ? mb_strtolower($key, static::ENCODING)
        : strtolower($key);

    return !isset($lookup[$lowered]);
}

/**
* generateDescription - generate a short description from a body of text
*
Expand Down Expand Up @@ -322,7 +289,7 @@ public static function generateSeoTitle($title = '', $extension = '')

$tableau = explode("-", $title);
$tableau = array_filter($tableau, 'static::nonEmptyString');
$tableau = array_filter($tableau, 'static::checkStopWords');
$tableau = array_filter($tableau, array(static::stopWordsObject(), 'check'));
$title = implode("-", $tableau);

$title = (empty($title)) ? '' : $title . $extension;
Expand Down Expand Up @@ -448,14 +415,13 @@ protected static function getNeedlePositions($haystack, $needles)
*/
protected static function purifyText($text, $keyword = false)
{
$myts = \MyTextSanitizer::getInstance();
$text = str_replace(' ', ' ', $text);
$text = str_replace('<br />', ' ', $text);
$text = str_replace('<br/>', ' ', $text);
$text = str_replace('<br', ' ', $text);
$text = strip_tags($text);
$text = html_entity_decode($text);
$text = $myts->undoHtmlSpecialChars($text);
$text = htmlspecialchars_decode($text, ENT_QUOTES);
$text = str_replace(')', ' ', $text);
$text = str_replace('(', ' ', $text);
$text = str_replace(':', ' ', $text);
Expand Down Expand Up @@ -534,4 +500,32 @@ function ($matches) {

return $text;
}

/**
 * checkStopWords - classify a word as significant or as a stop word
 * by handing it to the StopWords object returned by stopWordsObject().
 *
 * @param string $key the word to check
 *
 * @return bool True if word is significant, false if it is a stop word
 * @deprecated since v1.2.0 - use Xmf\StopWords::check()
 */
public static function checkStopWords($key)
{
    $checker = static::stopWordsObject();

    return $checker->check($key);
}

/**
 * Get a StopWords object, creating it on the first call and
 * returning the same instance on later calls
 *
 * @return StopWords
 */
protected static function stopWordsObject()
{
    static $instance = null;

    if (!isset($instance)) {
        $instance = new StopWords();
    }

    return $instance;
}
}
68 changes: 68 additions & 0 deletions src/Xmf/StopWords.php
@@ -0,0 +1,68 @@
<?php
/*
You may not change or alter any portion of this comment or credits
of supporting developers from this source code or any supporting source code
which is considered copyrighted (c) material of the original comment or credit authors.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
*/

namespace Xmf;

/**
 * StopWords - facilitate filtering of common or purely connective words for natural language processing
 *
 * @category  Xmf\StopWords
 * @package   Xmf
 * @author    Richard Griffith <richard@geekwright.com>
 * @author    trabis <lusopoemas@gmail.com>
 * @copyright 2011-2016 XOOPS Project (http://xoops.org)
 * @license   GNU GPL 2 or later (http://www.gnu.org/licenses/gpl-2.0.html)
 * @link      http://xoops.org
 * @see       https://en.wikipedia.org/wiki/Stop_words
 */
class StopWords
{

    /**
     * mbstring encoding
     */
    const ENCODING = 'UTF-8';

    /** @var bool[] lookup table keyed by lower-cased stop word, every value is true */
    protected $stopwordList = array();

    /**
     * StopWords constructor - load stop words for current locale
     *
     * The list is read from the _XMF_STOPWORDS constant; the 'stopwords'
     * language file is loaded first if the constant is not yet defined.
     * Entries may be separated by any run of spaces, tabs or line breaks,
     * and are lower-cased so they compare equal to the keys check() builds.
     * If the constant is still undefined the list stays empty and every
     * word is reported as significant.
     *
     * @todo specify locale to constructor, will require shift away from defined constant
     */
    public function __construct()
    {
        if (!defined('_XMF_STOPWORDS')) {
            Language::load('stopwords');
        }
        if (defined('_XMF_STOPWORDS')) {
            // Explicit ASCII whitespace class (not \s) so no byte of a
            // multi-byte UTF-8 character can ever be taken for a separator.
            $words = preg_split(
                '/[ \t\r\n]+/',
                $this->normalize((string) _XMF_STOPWORDS),
                -1,
                PREG_SPLIT_NO_EMPTY
            );
            // preg_split() returns false on failure; keep the empty default then
            if (!empty($words)) {
                $this->stopwordList = array_fill_keys($words, true);
            }
        }
    }

    /**
     * check - look up a word in a list of stop words and
     * classify it as a significant word or a stop word.
     *
     * The comparison is case-insensitive.
     *
     * @param string $key the word to check
     *
     * @return bool True if word is significant, false if it is a stop word
     */
    public function check($key)
    {
        return !isset($this->stopwordList[$this->normalize($key)]);
    }

    /**
     * normalize - lower-case text so list entries and looked up words
     * are compared in the same form. Uses mb_strtolower() with
     * static::ENCODING when mbstring is available, strtolower() otherwise.
     *
     * @param string $text text to normalize
     *
     * @return string lower-cased text
     */
    protected function normalize($text)
    {
        return function_exists('mb_strtolower')
            ? mb_strtolower($text, static::ENCODING)
            : strtolower($text);
    }
}

0 comments on commit 099a67c

Please sign in to comment.