# Function

In [None]:
"""Frequently used text-cleansing functions.

Usage:

```
from yellowduck.preprocessing.text import TextCleansing

# Apply every cleaning step in one call
text = TextCleansing.pipeline(my_text)

# Or apply the steps individually
text = TextCleansing.http_https(text)
text = TextCleansing.new_line(text)
text = TextCleansing.tab_space(text)
text = TextCleansing.hashtag(text)
text = TextCleansing.punctuation(text)
text = TextCleansing.emoji(text)
text = TextCleansing.redundant_space(text)
```
Arguments:
    text: python string.
Returns:
    A python string.
"""

In [152]:
import re

class TextCleansing:
    """Stateless text-cleaning helpers for mixed Thai/English text.

    Every helper is a static method that takes a value, coerces it with
    ``str()`` and returns a new string, so the helpers can be called on the
    class (``TextCleansing.new_line(text)``) or on an instance.
    Removed items are replaced by a single space (URLs are replaced by the
    empty string); call ``redundant_space`` last to collapse the gaps.
    """

    # ``http\S+`` already matches every ``https...`` token, so a single
    # pattern covers both schemes.
    _URL_RE = re.compile(r'http\S+')

    # '#' or '@' followed by ASCII alphanumerics or Thai characters.
    # The Thai range (U+0E01 KO KAI .. U+0E59 THAI DIGIT NINE) is written
    # with escapes so the pattern survives source-encoding round-trips.
    _TAG_RE = re.compile(r'[#@][A-Za-z0-9\u0E01-\u0E59]+')

    # Union of the usual hand-listed "emoji" ranges (emoticons, pictographs,
    # transport symbols, flags, dingbats, misc symbols, ...). Those ranges
    # overlap and reduce to: everything from U+24C2 upward, plus four
    # isolated code points (ZWJ, watch, eject, fast-forward).
    # NOTE(review): this also strips non-emoji characters at or above U+24C2
    # (e.g. CJK ideographs, Hangul, fullwidth forms). Narrow the range if
    # such text must survive.
    _EMOJI_RE = re.compile('[\u200d\u231a\u23cf\u23e9\u24c2-\U0010ffff]+')

    # Single characters treated as punctuation/symbols by ``punctuation``.
    # Note that '.' and '/' are not in the list, so they are left untouched.
    _PUNCTS = (
        ',', '"', ':', ')', '(', '-', '!', '?', '|', ';', "'", '$', '&', '[', ']', '>', '%', '=', '#', '*', '+', '\\', '•', '~', '@', '£',
        '·', '_', '{', '}', '©', '^', '®', '`', '<', '→', '°', '€', '™', '›', '♥', '←', '×', '§', '″', '′', 'Â', '█', '½', 'à', '…', '\xa0', '\t',
        '“', '★', '”', '–', '●', 'â', '►', '−', '¢', '²', '¬', '░', '¶', '↑', '±', '¿', '▾', '═', '¦', '║', '―', '¥', '▓', '—', '‹', '─', '\u3000', '\u202f',
        '▒', '：', '¼', '⊕', '▼', '▪', '†', '■', '’', '▀', '¨', '▄', '♫', '☆', 'é', '¯', '♦', '¤', '▲', 'è', '¸', '¾', 'Ã', '⋅', '‘', '∞', '«',
        '∙', '）', '↓', '、', '│', '（', '»', '，', '♪', '╩', '╚', '³', '・', '╦', '╣', '╔', '╗', '▬', '❤', 'ï', 'Ø', '¹', '≤', '‡', '√',
    )

    # Translation table for the default case (no exceptions), built once.
    _PUNCT_TABLE = str.maketrans(dict.fromkeys(_PUNCTS, ' '))

    @staticmethod
    def http_https(text: str) -> str:
        """Delete every whitespace-delimited token starting at ``http``.

        Arguments:
            text: value to clean; coerced with ``str()``.
        Returns:
            The text with each ``http...``/``https...`` run removed
            (replaced by the empty string, surrounding spaces are kept).
        """
        return TextCleansing._URL_RE.sub('', str(text))

    @staticmethod
    def new_line(text: str) -> str:
        """Replace each newline character (``\\n``) with a space."""
        return str(text).replace('\n', ' ')

    @staticmethod
    def tab_space(text: str) -> str:
        """Replace each tab character (``\\t``) with a space."""
        return str(text).replace('\t', ' ')

    @staticmethod
    def hashtag(text: str) -> str:
        """Replace ``#hashtag`` and ``@id`` tokens with a single space.

        A token is '#' or '@' followed by one or more ASCII letters, ASCII
        digits or Thai characters; the token ends at the first other
        character.
        """
        return TextCleansing._TAG_RE.sub(' ', str(text))

    @staticmethod
    def punctuation(text: str, except_punct: "list[str] | None" = None) -> str:
        """Replace punctuation and symbol characters with spaces.

        Arguments:
            text: value to clean; coerced with ``str()``.
            except_punct: characters from the built-in list that must be
                kept. ``None`` (the default) or an empty collection keeps
                nothing.
        Returns:
            The text with every listed character, minus the exceptions,
            replaced by one space each.
        """
        text = str(text)
        if not except_punct:
            return text.translate(TextCleansing._PUNCT_TABLE)
        targets = [p for p in TextCleansing._PUNCTS if p not in except_punct]
        # One translate pass instead of one str.replace call per character.
        return text.translate(str.maketrans(dict.fromkeys(targets, ' ')))

    @staticmethod
    def emoji(text) -> str:
        """Replace each run of emoji/symbol characters with a single space.

        See the note on ``_EMOJI_RE``: the matched set is much wider than
        emoji alone.
        """
        return TextCleansing._EMOJI_RE.sub(' ', str(text))

    @staticmethod
    def redundant_space(text) -> str:
        """Collapse all whitespace runs to one space and trim both ends."""
        return ' '.join(str(text).split())

    @staticmethod
    def pipeline(text: str, except_punct: "list[str] | None" = None) -> str:
        """Run every cleaning step in sequence.

        Order: URLs, newlines, tabs, hashtags/ids, punctuation, emoji,
        redundant spaces. URLs and hashtags are handled before punctuation
        because the punctuation step would otherwise strip the ':' / '#' /
        '@' characters those patterns rely on.

        Arguments:
            text: value to clean; coerced with ``str()``.
            except_punct: forwarded to ``punctuation``.
        Returns:
            The fully cleaned string.
        """
        text = TextCleansing.http_https(text)
        text = TextCleansing.new_line(text)
        text = TextCleansing.tab_space(text)
        text = TextCleansing.hashtag(text)
        text = TextCleansing.punctuation(text, except_punct)
        text = TextCleansing.emoji(text)
        return TextCleansing.redundant_space(text)

# Test Function

In [153]:
my_text = '‡∏£‡πâ‡∏≤‡∏ô\n\n\n\n\n\n   #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ   \t2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)\n      3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**   \n---------------------------------------------------------   \n‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery \n\tüëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏±‡∏ö ** ‡∏£‡∏±‡∏ö‡∏≠‡∏≠‡πÄ‡∏î‡∏≠‡∏£‡πå 11.00 - 22.00 ‡πÄ‡∏ó‡πà‡∏≤‡∏ô‡∏±‡πâ‡∏ô‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö **   \n---------------------------------------------------------   \nü¶ê Follow us ü¶ê   Line :     Facebook : konmunkung   ‡πÇ‡∏ó‡∏£ : 064 414 7844      ‡πÅ‡∏ú‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô : ‡∏£‡πâ‡∏≤‡∏ô‡∏≠‡∏¢‡∏π‡πà‡πÉ‡∏ô‡πÇ‡∏Ñ‡∏£‡∏á‡∏Å‡∏≤‡∏£ Tree square ‡∏ó‡∏≤‡∏ß‡∏ô‡πå ‡∏≠‡∏¥‡∏ô ‡∏ó‡∏≤‡∏ß‡∏ô‡πå   üìåüìåhttps://goo.gl/maps/DXTAh5Z4jds                 '

print(my_text)

‡∏£‡πâ‡∏≤‡∏ô





   #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ   	2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)
      3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**   
---------------------------------------------------------   
‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery 
	üëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏±‡∏ö ** 

In [154]:
my_text

'‡∏£‡πâ‡∏≤‡∏ô\n\n\n\n\n\n   #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ   \t2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)\n      3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**   \n---------------------------------------------------------   \n‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery \n\tüëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ

In [155]:
TextCleansing.http_https(my_text)

'‡∏£‡πâ‡∏≤‡∏ô\n\n\n\n\n\n   #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ   \t2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)\n      3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**   \n---------------------------------------------------------   \n‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery \n\tüëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ

In [156]:
TextCleansing.new_line(my_text)

'‡∏£‡πâ‡∏≤‡∏ô         #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ   \t2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)       3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**    ---------------------------------------------------------    ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery  \tüëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏±‡∏ö 

In [157]:
TextCleansing.new_line(my_text)

'‡∏£‡πâ‡∏≤‡∏ô         #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ   \t2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)       3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**    ---------------------------------------------------------    ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery  \tüëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏±‡∏ö 

In [158]:
TextCleansing.tab_space(my_text)

'‡∏£‡πâ‡∏≤‡∏ô\n\n\n\n\n\n   #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ    2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)\n      3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**   \n---------------------------------------------------------   \n‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery \n üëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏

In [159]:
TextCleansing.hashtag(my_text)

'‡∏£‡πâ‡∏≤‡∏ô\n\n\n\n\n\n       \t2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)\n      3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**   \n---------------------------------------------------------   \n‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery \n\tüëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏±‡∏ö ** ‡∏£‡∏±‡∏ö‡∏≠‡∏≠‡πÄ‡∏î‡∏

In [160]:
TextCleansing.punctuation(my_text, except_punct=['('])

'‡∏£‡πâ‡∏≤‡∏ô\n\n\n\n\n\n    ‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ    2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ \n      3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á         ‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00 23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00      \n                                                            \n‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery \n üëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá   üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤  ‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á  ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏

In [161]:
TextCleansing.emoji(my_text)

'‡∏£‡πâ‡∏≤‡∏ô\n\n\n\n\n\n   #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ   \t2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ)\n      3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á)      **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)**   \n---------------------------------------------------------   \n‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery \n\t ‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏á     ‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤    ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏±‡∏ö

In [162]:
TextCleansing.redundant_space(my_text)

'‡∏£‡πâ‡∏≤‡∏ô #‡∏Ç‡∏≠‡∏á‡∏°‡∏±‡∏ô‡∏ï‡πâ‡∏≠‡∏á‡∏°‡∏µ 2.‡πÄ‡∏£‡∏≤‡∏à‡∏∞‡∏õ‡∏£‡∏∞‡∏Å‡∏≤‡∏®‡∏ú‡∏•‡∏ú‡∏π‡πâ‡πÇ‡∏ä‡∏Ñ‡∏î‡∏µ‡∏†‡∏≤‡∏¢‡πÉ‡∏ô 30 ‡∏Å‡∏±‡∏ô‡∏¢‡∏≤‡∏¢‡∏ô ‡∏ô‡∏µ‡πâ (‡πÇ‡∏î‡∏¢‡∏Å‡∏≤‡∏£ inbox ‡∏Å‡∏•‡∏±‡∏ö‡πÑ‡∏õ) 3.‡πÉ‡∏ä‡πâ‡πÑ‡∏î‡πâ‡∏ó‡∏±‡πâ‡∏á‡∏°‡∏≤‡∏Å‡∏¥‡∏ô‡∏ó‡∏µ‡πà‡∏£‡πâ‡∏≤‡∏ô‡∏´‡∏£‡∏∑‡∏≠‡πÉ‡∏´‡πâ‡πÑ‡∏õ‡∏™‡πà‡∏á‡∏ó‡∏µ‡πà‡∏ö‡πâ‡∏≤‡∏ô‡∏Å‡πá‡πÑ‡∏î‡πâ (‡πÑ‡∏°‡πà‡∏£‡∏ß‡∏°‡∏Ñ‡πà‡∏≤‡∏™‡πà‡∏á) **‡∏£‡πâ‡∏≤‡∏ô‡πÄ‡∏õ‡∏¥‡∏î 11.00-23.00 (‡∏Ñ‡∏£‡∏±‡∏ß‡∏õ‡∏¥‡∏î 22.00)** --------------------------------------------------------- ‡∏™‡∏≥‡∏´‡∏£‡∏±‡∏ö Delivery üëá‡∏ß‡∏¥‡∏ò‡∏µ‡∏Å‡∏≤‡∏£‡∏™‡∏±‡πà‡∏áüëá üì±‡∏™‡∏±‡πà‡∏á‡∏ú‡πà‡∏≤‡∏ô Lineman ‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏ô‡∏∞‡∏Ñ‡∏£‡∏±‡∏ö ‡∏´‡∏≤‡∏Ñ‡∏≥‡∏ß‡πà‡∏≤ "‡∏Ñ‡∏ô‡∏°‡∏±‡∏ô‡∏Å‡∏∏‡πâ‡∏á" ‡∏á‡πà‡∏≤‡∏¢‡πÜ‡∏≠‡∏¥‡πà‡∏°‡∏≠‡∏£‡πà‡∏≠‡∏¢‡∏™‡∏ö‡∏≤‡∏¢‡∏≠‡∏¢‡∏∏‡πà‡∏ö‡πâ‡∏≤‡∏ô‡πÑ‡∏î‡πâ‡πÄ‡∏•‡∏¢‡∏à‡πâ‡∏≤ ‡∏´‡∏£‡∏∑‡∏≠‡∏à‡∏∞‡πÇ‡∏ó‡∏£ ‡πÑ‡∏•‡∏ô‡πå ‡∏ú‡πà‡∏≤‡∏ô‡πÉ‡∏´‡πâ‡∏ó‡∏≤‡∏á‡∏£‡πâ‡∏≤‡∏ô‡∏à‡∏±‡∏î‡∏Å‡∏≤‡∏£‡πÉ‡∏´‡πâ‡∏Å‡πá‡πÑ‡∏î‡πâ‡∏Ñ‡∏£‡∏±‡∏ö ** ‡∏£‡∏±‡∏ö‡∏≠‡∏≠‡πÄ‡∏î‡∏≠‡∏£‡πå 11.

# Usage

In [None]:
# !pip install yellowduck -q

!pip install --upgrade --force-reinstall yellowduck

In [None]:
import yellowduck

In [None]:
yellowduck.preprocessing.text.TextCleansing.http_https(text)


In [None]:
yellowduck.preprocessing.text.TextCleansing.new_line(text)


In [None]:
yellowduck.preprocessing.text.TextCleansing.tab_space(text)


In [None]:
yellowduck.preprocessing.text.TextCleansing.hashtag(text)


In [None]:
yellowduck.preprocessing.text.TextCleansing.punctuation(text)


In [None]:
yellowduck.preprocessing.text.TextCleansing.emoji(text)


In [None]:
yellowduck.preprocessing.text.TextCleansing.redundant_space(text)