Skip to content

Commit

Permalink
use negation instead of stub in stop word filter
Browse files Browse the repository at this point in the history
  • Loading branch information
Paul Lam committed Oct 16, 2012
1 parent de83c06 commit 92ef31f
Show file tree
Hide file tree
Showing 4 changed files with 7 additions and 44 deletions.
14 changes: 2 additions & 12 deletions part4/src/impatient/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,13 @@
"reads in a line of string and splits it by regex"
(s/split line #"[\[\]\\\(\),.)\s]+"))

(defn constant-true
  "Always returns true, ignoring its argument x."
  [x]
  ;; The docstring must precede the parameter vector; placed after it, the
  ;; string is just an evaluated-and-discarded body expression and never
  ;; reaches the var's :doc metadata.
  true)

;; Builds a `<-` query over the `stop` generator that emits two fields per
;; tuple: the original ?stop value and a ?stub value produced by calling
;; constant-true on it. -main binds that second field as !!is-stop when it
;; joins words against the stop list.
(defn expand-stop-tuple [stop]
(<- [?stop ?stub]
(stop ?stop)
(constant-true ?stop :> ?stub)))

(defn -main [in out stop & args]
(let [rain (hfs-delimited in :skip-header? true)
stop (expand-stop-tuple (hfs-delimited stop :skip-header? true))]
stop (hfs-delimited stop :skip-header? true)]
(?<- (hfs-delimited out)
[?word ?count]
(rain _ ?line)
(split ?line :> ?word-dirty)
((c/comp s/trim s/lower-case) ?word-dirty :> ?word)
(stop ?word !!is-stop)
(nil? !!is-stop)
(stop ?word :> false)
(c/count ?count))))
15 changes: 2 additions & 13 deletions part5/src/impatient/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -9,23 +9,12 @@
"reads in a line of string and splits it by regex"
(s/split line #"[\[\]\\\(\),.)\s]+"))

(defn constant-true
  "Always returns true, ignoring its argument x."
  [x]
  ;; The docstring must precede the parameter vector; placed after it, the
  ;; string is just an evaluated-and-discarded body expression and never
  ;; reaches the var's :doc metadata.
  true)

(defn expand-stop-tuple
  "Hack to make 'left-join-negate-right' work in etl-docs-gen: emits each
  ?stop tuple from `stop` together with a ?stub field computed by
  constant-true, so the stop source has a second field to join against."
  [stop]
  ;; Docstring moved ahead of the parameter vector so it is stored as :doc
  ;; metadata instead of being evaluated and thrown away inside the body.
  (<- [?stop ?stub]
      (stop ?stop)
      (constant-true ?stop :> ?stub)))

(defn etl-docs-gen [rain stop]
(<- [?doc-id ?word]
(rain ?doc-id ?line)
(split ?line :> ?word-dirty)
((c/comp s/trim s/lower-case) ?word-dirty :> ?word)
(stop ?word !!is-stop)
(nil? !!is-stop)))
(stop ?word :> false)))

(defn word-count [src]
"simple word count across all documents"
Expand Down Expand Up @@ -64,7 +53,7 @@

(defn -main [in out stop tfidf & args]
(let [rain (hfs-delimited in :skip-header? true)
stop (expand-stop-tuple (hfs-delimited stop :skip-header? true))
stop (hfs-delimited stop :skip-header? true)
src (etl-docs-gen rain stop)]
(?- (hfs-delimited tfidf)
(TF-IDF src))
Expand Down
17 changes: 3 additions & 14 deletions part6/src/impatient/core.clj
Original file line number Diff line number Diff line change
Expand Up @@ -10,16 +10,6 @@
"reads in a line of string and splits it by regex"
(s/split line #"[\[\]\\\(\),.)\s]+"))

(defn constant-true
  "Always returns true, ignoring its argument x."
  [x]
  ;; The docstring must precede the parameter vector; placed after it, the
  ;; string is just an evaluated-and-discarded body expression and never
  ;; reaches the var's :doc metadata.
  true)

(defn expand-stop-tuple
  "Hack to make 'left-join-negate-right' work in etl-docs-gen: emits each
  ?stop tuple from `stop` together with a ?stub field computed by
  constant-true, so the stop source has a second field to join against."
  [stop]
  ;; Docstring moved ahead of the parameter vector so it is stored as :doc
  ;; metadata instead of being evaluated and thrown away inside the body.
  (<- [?stop ?stub]
      (stop ?stop)
      (constant-true ?stop :> ?stub)))

(defn scrub-text
  "Lower-cases the string s and trims the whitespace surrounding it,
  e.g. \"FoO BAR \" becomes \"foo bar\"."
  [s]
  ;; Docstring moved ahead of the parameter vector so it becomes :doc
  ;; metadata. Order matches the original (comp s/trim s/lower-case):
  ;; lower-case first, then trim.
  (-> s
      s/lower-case
      s/trim))
Expand All @@ -37,8 +27,7 @@
(rain ?doc-id ?line)
(split ?line :> ?word-dirty)
(scrub-text ?word-dirty :> ?word)
(stop ?word !!is-stop)
(nil? !!is-stop)
(stop ?word :> false)
(assert-doc-id ?doc-id)
(:trap (hfs-textline "output/trap" :sinkmode :update))))

Expand Down Expand Up @@ -82,13 +71,13 @@
["tmp/checkpoint"]
etl-step ([:tmp-dirs etl-stage]
(let [rain (hfs-delimited in :skip-header? true)
stop (expand-stop-tuple (hfs-delimited stop :skip-header? true))]
stop (hfs-delimited stop :skip-header? true)]
(?- (hfs-delimited etl-stage)
(etl-docs-gen rain stop))))
tf-step ([:deps etl-step]
(let [src (name-vars (hfs-delimited etl-stage :skip-header? true) ["?doc-id" "?word"])]
(?- (hfs-delimited tfidf)
(TF-IDF src))))
(TF-IDF src))))
wrd-step ([:deps etl-step]
(?- (hfs-delimited out)
(word-count (hfs-delimited etl-stage))))))
5 changes: 0 additions & 5 deletions part6/test/impatient/core_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,6 @@
(fact
(scrub-text "FoO BAR ") => "foo bar"))

;; Checks that expand-stop-tuple, given an in-memory source of single-field
;; tuples, produces each input value paired with true.
(deftest expand-stop-tuple-test
(let [src [["a"] ["b"] ["c"]]]
(fact
(expand-stop-tuple src) => (produces [["a" true] ["b" true] ["c" true]]))))

(deftest etl-docs-gen-test
(let [rain [["doc1" "a b c"]]
stop [["b" true]]]
Expand Down

0 comments on commit 92ef31f

Please sign in to comment.