Skip to content

Commit

Permalink
OOP Testing and Corpus Module
Browse files Browse the repository at this point in the history
  • Loading branch information
Olamyy committed Jul 13, 2019
1 parent 23e4594 commit 741d909
Show file tree
Hide file tree
Showing 13 changed files with 235 additions and 144 deletions.
1 change: 1 addition & 0 deletions requirements.txt
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
gensim
bs4
configargparse
torch
Expand Down
1 change: 1 addition & 0 deletions src/iranlowo/corpus/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from .corpus import Corpus, DirectoryCorpus
13 changes: 13 additions & 0 deletions src/iranlowo/corpus/bbc_yoruba.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
from iranlowo.corpus import Corpus


class BBCCorpus(Corpus):
    """Corpus backed by BBC Yoruba article text, loaded via :class:`Corpus`."""

    def __init__(self, path, **kwargs):
        """
        Args:
            path: Path to the BBC Yoruba corpus file or directory.
            **kwargs: Extra keyword options forwarded unchanged to
                :class:`Corpus` (e.g. ``stream``, ``fformat``, ``preprocess``).
        """
        # Bug fix: the original passed ``path=self.path`` before any attribute
        # was set (AttributeError), referenced ``**kwargs`` that was never a
        # parameter (NameError), and then called ``__init__`` a second time.
        # Delegate exactly once using the local argument.
        super(BBCCorpus, self).__init__(path=path, **kwargs)

12 changes: 12 additions & 0 deletions src/iranlowo/corpus/bibeli.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from iranlowo.corpus import Corpus


class BibeliCorpus(Corpus):
    """Corpus backed by Yoruba Bible (Bíbélì) text, loaded via :class:`Corpus`."""

    def __init__(self, path, **kwargs):
        """
        Args:
            path: Path to the Bible corpus file or directory.
            **kwargs: Extra keyword options forwarded unchanged to
                :class:`Corpus` (e.g. ``stream``, ``fformat``, ``preprocess``).
        """
        # Bug fix: the original passed ``path=self.path`` before any attribute
        # was set (AttributeError) and referenced ``**kwargs`` that was never
        # a parameter (NameError). Delegate with the local argument instead.
        super(BibeliCorpus, self).__init__(path=path, **kwargs)

55 changes: 18 additions & 37 deletions src/iranlowo/corpus/corpus.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,36 +2,33 @@
import os
import sys

import requests

from gensim import interfaces
from gensim.corpora.csvcorpus import CsvCorpus
from gensim.corpora.textcorpus import lower_to_unicode, strip_multiple_whitespaces, walk
from gensim.utils import deaccent
from gensim.corpora.textcorpus import walk

from iranlowo.preprocessing import is_valid_owé_format
from iranlowo.preprocessing import is_valid_owé_format, normalize_diacritics_text
from iranlowo.utils import is_text_nfc


class Corpus(interfaces.CorpusABC):
def __init__(self, path=None, text=None, is_url=False, rlist=False, stream=False, fformat='txt', cformat=None, labels=False, preprocess=False):
def __init__(self, path=None, text=None, stream=False, fformat='txt', cformat=None, labels=False, preprocess=None):
"""
Args:
path:
text:
**kwargs:
"""
self.path = path
self.text = text
self.rlist = rlist
self.labels = labels
self.stream = stream
self.fformat = fformat
self.preprocess = preprocess
self.cformat = cformat
self.is_url = is_url
self.data = text if text else self.read_file_or_filename()
self.preprocess = preprocess
if not self.preprocess:
self.preprocess = [normalize_diacritics_text]
self.data = self.read_file_filename_or_text(text=text) if text else self.read_file_filename_or_text()
self.validate_format()

def __iter__(self):
Expand All @@ -41,9 +38,6 @@ def __iter__(self):
def __len__(self):
return len(self.data)

def get_data(self):
pass

@staticmethod
def save_corpus(fname, corpus, id2word=None, metadata=False):
pass
Expand All @@ -55,29 +49,24 @@ def streamfile(self, fobj):
num_text += 1
yield line

def read_file_or_filename(self, f=None):
def read_file_filename_or_text(self, f=None, text=None):
"""
Returns:
"""
path = f if f else self.path
text = None
print(len(self.path))
out = []
if isinstance(path, list):
if text:
return self.handle_preprocessing(text) if self.preprocess else text
elif isinstance(path, list):
for f in path:
path.remove(f)
sys.setrecursionlimit(10000)
text = self.read_file_or_filename(f)
text = self.read_file_filename_or_text(f)
out.append(text)
else:
if self.is_url:
r = requests.get(path)
if r.status_code in [200, 201]:
text = r.text
return text
elif isinstance(path, str):
if isinstance(path, str):
if self.fformat == "txt":
text = open(path)
elif self.fformat == "csv":
Expand All @@ -87,22 +76,15 @@ def read_file_or_filename(self, f=None):
else:
text = self.path.seek(0)

if not self.stream:
text = text.read() if not self.rlist else text.readlines()
print(text)
if self.preprocess:
text = self.handle_preprocessing(text)
return text
else:
self.streamfile(text)
text = text.read() if not self.stream else ''.join(list(self.streamfile(text)))
return self.handle_preprocessing(text) if self.preprocess else text

def handle_preprocessing(self, text):
if callable(self.preprocess):
return self.preprocess(text)
if isinstance(self.preprocess, list):
prep_list = self.preprocess if isinstance(self.preprocess, list) else [lower_to_unicode, deaccent, strip_multiple_whitespaces]
for technique in prep_list:
text = technique(self.data)
for technique in self.preprocess:
text = technique(text)
return text

def validate_format(self):
Expand All @@ -113,7 +95,7 @@ def validate_format(self):
"""
data = self.data
if isinstance(data, list):
data = ' '.join(data)
data = ''.join(data)
if not self.cformat and not is_text_nfc(data):
raise TypeError("The corpus does not comply to the NFC corpus format")
elif self.cformat == "owe":
Expand Down Expand Up @@ -148,4 +130,3 @@ def __init__(self, path, **kwargs):
def read_files(self):
for path in self.flist:
yield os.path.join(self.path_dir, path)

Empty file added src/iranlowo/corpus/scrapper.py
Empty file.
12 changes: 12 additions & 0 deletions src/iranlowo/corpus/yor_blog.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
from iranlowo.corpus import Corpus


class YorubaBlogCorpus(Corpus):
    """Corpus backed by Yoruba-language blog text, loaded via :class:`Corpus`."""

    def __init__(self, path, **kwargs):
        """
        Args:
            path: Path to the blog corpus file or directory.
            **kwargs: Extra keyword options forwarded unchanged to
                :class:`Corpus` (e.g. ``stream``, ``fformat``, ``preprocess``).
        """
        # Bug fix: the original passed ``path=self.path`` before any attribute
        # was set (AttributeError) and referenced ``**kwargs`` that was never
        # a parameter (NameError). Delegate with the local argument instead.
        super(YorubaBlogCorpus, self).__init__(path=path, **kwargs)

14 changes: 14 additions & 0 deletions src/iranlowo/interfaces.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
import scrapy


class Scrapper(scrapy.Spider):
    """Base spider for scraping corpus data in :mod:`iranlowo.scrapper`.

    Thin wrapper around :class:`scrapy.Spider`: it forwards the spider
    ``name`` plus any extra keyword options to the base class and leaves
    response handling to subclasses.
    """

    def __init__(self, name, urls, **kwargs):
        # Forward name and options to scrapy.Spider; ``urls`` is accepted
        # for subclasses to use but is not passed to the base class here.
        super().__init__(name, **kwargs)

    def parse(self, response):
        # Intentionally a no-op; concrete spiders override this to extract
        # text from each crawled page.
        pass

1 change: 1 addition & 0 deletions tests/pred.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
ṣùgbọ́n
Loading

0 comments on commit 741d909

Please sign in to comment.