# Text Preprocessing

In [1]:
#importing necessary libraries for text preprocessing
import pandas as pd
import string # for removing punctuations marks
import re # for removing url
import demoji # for removing emoticons
import spacy # for tokenization
from spacy.lang.en import English
from spacy.lang.en.stop_words import STOP_WORDS as eng_stop_words
# Fetch the emoji code data that demoji uses (downloaded and written to a local codes.json).
demoji.download_codes()
# One list entry per ASCII punctuation character from string.punctuation.
punctuation_marks = [mark for mark in string.punctuation]

Downloading emoji data ...
... OK (Got response in 0.33 seconds)
Writing emoji data to C:\Users\Ravineesh\.demoji\codes.json ...
... OK


In [2]:
# Sanity check: show the punctuation characters held in punctuation_marks.
print(punctuation_marks)

['!', '"', '#', '$', '%', '&', "'", '(', ')', '*', '+', ',', '-', '.', '/', ':', ';', '<', '=', '>', '?', '@', '[', '\\', ']', '^', '_', '`', '{', '|', '}', '~']


### 1. English Language Tokenization

In [3]:
# Lazily created blank English pipeline, shared by every tokenization_en call.
_EN_PIPELINE = None


def _english_pipeline():
  """Return the shared English pipeline, building it on first use."""
  global _EN_PIPELINE
  if _EN_PIPELINE is None:
    _EN_PIPELINE = English()
  return _EN_PIPELINE


def tokenization_en(input_string,stop_words):
  """Tokenize English text and drop stop words.

  Args:
    input_string: Raw text to tokenize.
    stop_words: Container of stop words; membership is tested with `in`.

  Returns:
    List of token strings in document order, excluding every token whose
    text, or lower-cased text, is found in `stop_words`.
  """
  token_list = []
  # Reuse one pipeline object instead of constructing English() per call.
  for token in _english_pipeline()(input_string):
    text = token.text
    # Also test the lower-cased form so capitalised stop words ("The", "AND")
    # are removed when `stop_words` holds lower-case entries.
    if text in stop_words or text.lower() in stop_words:
      continue
    token_list.append(text)

  return token_list

### 2. Removing Punctuation Marks

In [4]:
def remove_punctuation(input_string):
  """Drop tokens that consist only of punctuation characters.

  A token that still contains at least one non-punctuation character is kept
  unchanged; punctuation inside it is not stripped.

  Args:
    input_string: Iterable of token strings (each element is tested on its own).

  Returns:
    List of the tokens that are non-empty after deleting every
    `string.punctuation` character from them.
  """
  # Build the deletion table once rather than once per token.
  strip_table = str.maketrans('', '', string.punctuation)
  return [word for word in input_string if word.translate(strip_table)]


### 3. Remove emoticons from text

In [5]:
def remove_emoticons(input_string):
  """Discard every token in which demoji finds at least one emoji.

  Args:
    input_string: Iterable of token strings.

  Returns:
    List of the tokens for which `demoji.findall` reports no matches,
    in their original order. A token containing an emoji is dropped whole.
  """
  def has_no_emoji(word):
    return len(demoji.findall(word)) == 0

  return list(filter(has_no_emoji, input_string))

### 4. Remove Numbers from text

In [6]:
def remove_numbers(input_string):
  """Delete every run of digits from each token.

  Args:
    input_string: Iterable of token strings.

  Returns:
    List with one entry per input token, digits removed. A token made
    only of digits becomes an empty string and is still included.
  """
  digit_run = re.compile(r'\d+')
  return [digit_run.sub('', word) for word in input_string]

### 5. Remove Non-ASCII Characters from text

In [7]:
def remove_non_ascii(input_string):
  """Strip all non-ASCII characters from each token.

  Args:
    input_string: Iterable of token strings.

  Returns:
    List with one entry per input token, where every character that
    cannot be encoded as ASCII has been dropped. Tokens may become empty.
  """
  # Encoding with errors="ignore" silently discards unencodable characters.
  return [word.encode("ascii","ignore").decode() for word in input_string]

### 6. Remove URLs

In [8]:
def remove_URL(input_string):
  """Remove URL-like substrings from each token.

  Anything starting with "http" and followed by one or more non-whitespace
  characters is deleted; the match is case-sensitive.

  Args:
    input_string: Iterable of token strings.

  Returns:
    List with one entry per input token after the deletion. Tokens that
    were entirely a URL become empty strings.
  """
  url_pattern = re.compile(r'http\S+')
  return [url_pattern.sub("", token) for token in input_string]

### 7. Remove empty strings from text 

In [9]:
def remove_empty_strings(input_string):
  """Drop empty and whitespace-only tokens, trimming the ones that remain.

  Args:
    input_string: Iterable of token strings.

  Returns:
    List of the tokens that still contain text after surrounding whitespace
    is stripped; each kept token is returned in its stripped form.
  """
  result = []

  for word in input_string:
    # str.strip() returns a new string (strings are immutable), so the
    # stripped value has to be captured for the emptiness check to see it.
    stripped = word.strip()
    if stripped:
      result.append(stripped)

  return result

### Preprocessing start

In [10]:
def text_preprocessing(input_string):
    """Run the full cleaning pipeline on one piece of text.

    The text is tokenized with English stop words removed, then passed
    through each token-level cleaning stage in a fixed order: punctuation,
    emoji, digits, non-ASCII characters, URLs, empty strings.

    Args:
        input_string: Raw text to clean.

    Returns:
        The surviving tokens joined into one string with single spaces.
    """
    tokens = tokenization_en(input_string,eng_stop_words)
    # Stage order matters: each stage consumes the previous stage's token list.
    for stage in (remove_punctuation, remove_emoticons, remove_numbers,
                  remove_non_ascii, remove_URL, remove_empty_strings):
        tokens = stage(tokens)

    return ' '.join(tokens)
   

In [11]:
# Sample text mixing plain words, digits, emoji, typographic quotes/ellipses and punctuation.
input_string = "start spreading the news yankees win great start by 🎅🏾 going 5strong innings with 5k’s🔥 🐂 solo homerun 🌋🌋 with 2 solo homeruns and👹 3run homerun… 🤡 🚣🏼 👨🏽‍⚖️ with rbi’s … 🔥🔥 🇲🇽 and 🇳🇮 to close the game🔥🔥!!"

# Run the pipeline; note that input_string is rebound to the cleaned text here.
input_string = text_preprocessing(input_string)
print("Preprocessed Text:-",input_string)



Preprocessed Text:- start spreading news yankees win great start going strong innings k solo homerun solo homeruns run homerun rbi close game
