In [1]:
import os
import yaml
import pymysql


"""
The Database class holds the database credentials and uses them
to create a pymysql connection to the database.
For more information on pymysql, see: http://pymysql.readthedocs.io/en/latest/index.html
"""


class Database:
    """Keeps MySQL connection settings and opens pymysql connections from them."""

    def __init__(self, host_name, database_name, user_name, password, charset=None):
        """
        Store the connection settings; no connection is opened here.

        :param host_name: Database Host Name or Server Name
        :param database_name: Name of the database to connect to
        :param user_name: User credential for authorization
        :param password: Password for authorization
        :param charset: Character set for the database
        """
        self.host_name, self.database_name = host_name, database_name
        self.user_name, self.password = user_name, password
        self.charset = charset

    def connect_with_pymysql(self, unicode=True):
        """
        Open a connection using the stored settings and pymysql's DictCursor.

        :param unicode: Forwarded to pymysql as ``use_unicode``
        :return: A pymysql connection on success; on any exception prints
                 "Connection error" and returns None
        """
        try:
            # Assemble the keyword arguments first so the call itself stays short.
            settings = dict(
                host=self.host_name,
                user=self.user_name,
                passwd=self.password,
                db=self.database_name,
                use_unicode=unicode,
                charset=self.charset,
                cursorclass=pymysql.cursors.DictCursor,
            )
            return pymysql.connect(**settings)
        except Exception:
            print("Connection error")
            return None


class DatabaseConnection:
    """Opens database connections from per-database YAML credential files."""

    # Directory expected to contain one ``<db>.yaml`` credentials file per database.
    DB_CREDS_LOCATION = os.path.join(os.path.expanduser("~"), '.ssh/database_creds')

    def connect_to_database(self, db='maya_ai_local'):
        """
        Read ``<DB_CREDS_LOCATION>/<db>.yaml`` and connect with its credentials.

        The YAML document must provide the keys ``host``, ``database``,
        ``username`` and ``password``. The connection always uses the
        ``utf8`` charset.

        :param db: Base name (without ``.yaml``) of the credentials file
        :return: Whatever ``Database.connect_with_pymysql()`` returns
        :raises OSError: If the credentials file cannot be opened
        :raises KeyError: If a required key is missing from the file
        """
        filename = os.path.join(self.DB_CREDS_LOCATION, db + '.yaml')
        with open(filename, 'r') as f:
            # safe_load builds only plain Python objects. A bare yaml.load(f)
            # can construct arbitrary objects and is rejected outright by
            # PyYAML >= 6, where the Loader argument is mandatory.
            data = yaml.safe_load(f)
        conn = Database(
            data['host'],
            data['database'],
            data['username'],
            data['password'],
            'utf8'
        )
        return conn.connect_with_pymysql()

In [2]:
import re
from html.parser import HTMLParser
from html import unescape
import logging
# Configure logging once for the notebook: INFO and above, each record
# formatted as "time : level : message".
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

class MyHTMLParser(HTMLParser):
    """HTML parser that accumulates the text content it is fed in ``string``."""

    # Text collected so far. The first handle_data call rebinds this name on
    # the instance, so the class-level value stays ''.
    string = ''

    def handle_data(self, data):
        """Append a chunk of text, dropping U+200C (zero width non-joiner)."""
        cleaned = ''.join(data.split('\u200c'))
        self.string = self.string + cleaned


class Preprocessor:
    """Text clean-up helpers for text that mixes Bangla and English."""

    bangla_numbers = ['০','১','২','৩','৪','৫','৬','৭','৮','৯']
    english_numbers = ['0','1','2','3','4','5','6','7','8','9']

    def replace_numbers(self, token_list):
        """
        Replace every token that contains a digit with a placeholder, in place.

        A token containing at least one ASCII digit becomes ``'__digit__en__'``
        (this wins even if Bangla digits are also present); otherwise a token
        containing a Bangla digit becomes ``'__digit__bn__'``.

        :param token_list: Mutable sequence of string tokens; modified in place
        :return: The same ``token_list`` object
        """
        for index, item in enumerate(token_list):
            if any(digit in item for digit in self.english_numbers):
                token_list[index] = '__digit__en__'
            elif any(digit in item for digit in self.bangla_numbers):
                token_list[index] = '__digit__bn__'
        return token_list

    @staticmethod
    def tokenize(text):
        """
        Split ``text`` on single space characters.

        Consecutive spaces yield empty tokens; run ``remove_extra_whitespace``
        first if that is not wanted.
        """
        return text.split(' ')

    @staticmethod
    def remove_extra_whitespace(text):
        """Collapse every whitespace run to one space and trim both ends."""
        # Raw string: '\s' in a normal literal is an invalid escape sequence.
        return re.sub(r'\s+', ' ', text).strip()

    @staticmethod
    def convert_to_lowercase(text):
        """Return ``text`` lower-cased."""
        return text.lower()

    @staticmethod
    def decode_text(text, type):
        """
        Decode ``text`` according to ``type``.

        :param text: The string to decode
        :param type: ``1`` re-encodes the text as Latin-1 and decodes the bytes
                     as UTF-8; ``2`` unescapes HTML entities and keeps only the
                     text collected by ``MyHTMLParser``
        :return: The decoded string, or None for any other ``type``
        """
        # ``type`` shadows the builtin; the name is kept so keyword callers
        # keep working.
        if type == 1:
            return text.encode('latin').decode('utf-8')
        if type == 2:
            parser = MyHTMLParser()
            parser.feed(unescape(text))
            return parser.string
        return None

    @staticmethod
    def punctuation_remover(data):
        """
        Replace punctuation and a few extra code points with spaces.

        :param data: The string to clean
        :return: The cleaned string, or None if translating raises TypeError
                 (for example when ``data`` is a ``bytes`` object)
        """
        import string
        try:
            remove_punctuation_map = {ord(char): u' ' for char in string.punctuation}
            # 2404 is the danda '।' and 128522 is U+1F60A; the other values
            # fall in the UTF-16 surrogate range (0xD800-0xDFFF).
            for code_point in [2404, 55357, 56842, 55356, 57198, 57252, 128522]:
                remove_punctuation_map[code_point] = u' '
            return data.translate(remove_punctuation_map)
        except TypeError:
            return None

class LanguageSeparation:
    """Classifies text as Bangla, English or "Banglish" using an English word list."""

    # Word list used when the caller does not pass one.
    DEFAULT_WORD_FILE = '/Users/shuvo/Downloads/maya_article_en.txt'
    # Share of non-ASCII characters above which text is classified as Bangla.
    BN_CHAR_THRESHOLD = 0.60
    # Share of known English words above which text is classified as English.
    EN_WORD_THRESHOLD = 0.50
    # Class-level default kept for backward compatibility; every instance
    # created through __init__ gets its own set.
    en_word_set = set()

    def __init__(self, word_file=DEFAULT_WORD_FILE):
        """
        Load the English vocabulary from a whitespace-separated word file.

        :param word_file: Path to the word list; defaults to ``DEFAULT_WORD_FILE``
        :raises OSError: If the file cannot be opened
        """
        with open(word_file, 'r') as file:
            english_text = file.read()
        # split() with no argument splits on any whitespace run, so words on
        # separate lines are not glued together and '' never enters the set.
        self.en_word_set = set(english_text.split())

    def language_detection(self, text):
        """
        Guess the language of ``text``.

        :param text: The input string
        :return: ``'bn'`` when the text is empty or more than 60% of its
                 characters are non-ASCII; otherwise ``'en'`` when more than
                 50% of its space-separated tokens are in ``en_word_set``;
                 otherwise ``'banglish'``
        """
        size = len(text)
        if size == 0:
            return 'bn'

        # Every non-ASCII character counts towards Bangla.
        bn_count = sum(1 for letter in text if ord(letter) > 127)
        if bn_count / size > self.BN_CHAR_THRESHOLD:
            return 'bn'

        # NOTE(review): matching is case-sensitive and tokens are split on
        # single spaces only; confirm whether the input should be lower-cased
        # and whitespace-normalised before it gets here.
        words = text.split(' ')
        en_word_count = sum(1 for word in words if word in self.en_word_set)
        if en_word_count / len(words) > self.EN_WORD_THRESHOLD:
            return 'en'
        return 'banglish'


In [3]:
# Copy every row of `questions` into `auto_categorize`, tagged with a detected
# language, and log a cleaned, tokenized view of each body along the way.

# Build the helpers once, before connecting: LanguageSeparation reads its word
# list from disk, so creating it per row would re-read the file for every row.
pre = Preprocessor()
lang = LanguageSeparation()

db_conn = DatabaseConnection()
connection = db_conn.connect_to_database()
if connection is None:
    # connect_with_pymysql() prints an error and returns None on failure.
    raise RuntimeError("Could not connect to the database")

# Placeholders let the driver escape the values; concatenating them into the
# statement breaks on any apostrophe in a body and allows SQL injection.
insert_sql = (
    "insert into auto_categorize(question_id,body,source,language) "
    "values(%s,%s,%s,%s)"
)

try:
    with connection.cursor() as cursor:
        # Read all questions
        sql = "SELECT id,body,source from questions"
        cursor.execute(sql)
        result = cursor.fetchall()

    with connection.cursor() as cursor:
        for row in result:
            # 'app' bodies are HTML-entity decoded (type 2); all other sources
            # go through the Latin-1 -> UTF-8 round trip (type 1).
            decode_type = 2 if row['source'] == 'app' else 1
            cleaned = Preprocessor.decode_text(row['body'], decode_type)
            cleaned = Preprocessor.punctuation_remover(cleaned)
            cleaned = Preprocessor.remove_extra_whitespace(cleaned)
            # replace_numbers assigns into its argument, so it needs a list of
            # tokens rather than the string itself.
            logging.info(pre.replace_numbers(Preprocessor.tokenize(cleaned)))

            # NOTE(review): as before, the raw stored body is what gets
            # inserted and what the language is detected on, not the cleaned
            # text; confirm that this is intended.
            cursor.execute(
                insert_sql,
                (
                    row['id'],
                    row['body'],
                    row['source'],
                    lang.language_detection(row['body']),
                ),
            )
        connection.commit()

finally:
    connection.close()


In [13]:
def clean_maya_article_en(path='/Users/shuvo/Downloads/maya_article_en.txt'):
    """
    Normalise an English text file in place.

    The file content is lower-cased, punctuation is replaced with spaces,
    digits are removed, whitespace runs are collapsed to single spaces, and
    the result is written back to the same file.

    :param path: File to clean; defaults to the original article location
    :raises OSError: If the file cannot be read or written
    """
    with open(path, 'r') as f:
        raw_text = f.read()

    result = raw_text.lower()
    result = Preprocessor.punctuation_remover(result)
    result = ''.join(char for char in result if not char.isdigit())
    result = Preprocessor.remove_extra_whitespace(result)

    # Overwrites the source file with the cleaned text.
    with open(path, 'w') as f:
        f.write(result)
