# Text Preprocessing Pipeline

In [None]:
# step 1 >> Removing Noise 
#                --> number, punctuation, url
# step 2 >> Transform any abnormalities
#                --> spell correction
#                --> demojization
#                --> remove html tag
#                --> process hashtags
# step 3 >> Lowercase and split
#                --> convert text to lowercase
#                --> convert the text into a list of words
# step 4 >> Remove stop words
# step 5 >> stem the words
# step 6 >> Join and return
#                --> join the list of words back into a single text
#                --> return the joined text    

In [None]:
# Domain specific pre-processing >> Libraries
#                                   -->emoji() >> to convert emoji to text
#                                   -->regex() >> to remove hashtags
#                                   -->PyEnchant() >> for spell correction
#                                   -->lxml() >> to remove html tags
# --> removing the stop words, e.g. the, me, and, my --> NLTK
# --> stemming >> converts a word to its root form, which may not be in the vocabulary, e.g. "amazing" becomes "amaz" after stemming --> NLTK
# --> lemmatization >> does the same thing as stemming; the only difference is that the lemmatized word is in the vocabulary --> NLTK
# --> expanding contractions >> I'm --> I am, I've --> I have

In [None]:
## Frequency Based Embedding >>
#                          Type 1 >> Count Vectorizer 
#                                          --> convert a sentence into a bag-of-words representation (raw word counts)
#                          Type 2 >> TF-IDF Vectorizer
#                                          --> convert a sentence into a vector of TF-IDF weights

# Sentiment Analysis: Model

In [None]:
import re
import pandas as pd
import numpy as np

from lxml import html      # --> to remove html tag
from emoji import demojize  # --> to convert emoji into text

import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
# Load the tweets CSV and discard its 'Index' column in one chained
# expression, then preview the first rows.
df = (
    pd.read_csv('/kaggle/input/sentimental-analysis-for-tweets/sentiment_tweets3.csv')
    .drop(columns=['Index'])
)
df.head()

In [None]:
# Replace the verbose CSV headers with short names used by the rest of the
# notebook, then preview the result.
df = df.rename(
    columns={
        'message to examine': 'text',
        'label (depression result)': 'sentiment',
    }
)
df.head()

In [None]:
# Count the rows per distinct 'sentiment' value to see how balanced the
# classes are.
df['sentiment'].value_counts()

# Data Pre-Processing

In [None]:
stemmer = PorterStemmer()
stop = stopwords.words('english')

# Set copy of the stop-word list for O(1) membership tests. `stop` itself is
# left as a list so any other code that uses it keeps working.
_STOP_SET = frozenset(stop)

# Patterns are compiled once and written as raw strings, so `\S` is a regex
# escape rather than an invalid Python string escape.
_URL_PATTERN = re.compile(r'http\S+')
_NON_ALPHA_PATTERN = re.compile(r'[^a-zA-Z ]+')


def clean_text(text):
    """Normalize one piece of raw text into a string of stemmed tokens.

    Steps, in order:
      1. Convert emoji to their textual names.
      2. Strip HTML markup (best effort).
      3. Replace hyperlinks with a space.
      4. Replace every run of characters other than ASCII letters and
         spaces with a single space.
      5. Lowercase and split on whitespace.
      6. Drop English stop words and tokens of two characters or fewer,
         then stem what remains.

    Args:
        text: The raw string to clean.

    Returns:
        The surviving stemmed tokens joined by single spaces. This is an
        empty string when no token survives.
    """
    # Convert emoji to text
    text = demojize(text)

    # Remove HTML tags. Parsing is best effort: lxml raises on input it
    # cannot turn into a document (e.g. an empty string), in which case the
    # text is kept as is. `Exception` (rather than a bare `except`) lets
    # KeyboardInterrupt / SystemExit propagate, so a long-running
    # `DataFrame.apply` can still be interrupted.
    try:
        text = html.document_fromstring(text).text_content()
    except Exception:
        pass

    # Remove hyperlinks
    text = _URL_PATTERN.sub(' ', text)

    # Remove everything that is not an ASCII letter or a space
    text = _NON_ALPHA_PATTERN.sub(' ', text)

    # Lowercase and split into tokens
    tokens = text.lower().split()

    # Remove stop words and short words (<= 2 characters), then stem
    stems = [
        stemmer.stem(word)
        for word in tokens
        if word not in _STOP_SET and len(word) > 2
    ]

    # Join and return
    return ' '.join(stems)

In [None]:
# Smoke-test clean_text on a single sentence. The bare `cleaned_text`
# expression on the last line makes the notebook display the result.
sample_text = "I'm learning Natural Language Processing"
cleaned_text = clean_text(sample_text)
cleaned_text