In [24]:
import pywhatkit as pw

In [25]:
from nltk import sent_tokenize , word_tokenize
import pandas as pd
import numpy as np

In [26]:
df = pd.read_csv('Emotion_Dataset/text_emotion.csv')

In [27]:
# Convert to Lower case

df['content'] = df['content'].str.lower()
df['content'].iloc[1:3]

1    layin n bed with a headache  ughhhh...waitin o...
2                  funeral ceremony...gloomy friday...
Name: content, dtype: object

In [28]:
# Remove HTML tags
import re
def remove_html_tag(text):
    pattern = re.compile('<.*?>')
    return pattern.sub(r'' , text)

text = '<h>shatnu</h> <p> Paragraph Tag </p><h2> Heading Tag </h2><b> Bold Tag </b><i>Italic Tag </i><u> Underline Tag</u>'

print(remove_html_tag(text))

df['content'] = df['content'].apply(remove_html_tag)


shatnu  Paragraph Tag  Heading Tag  Bold Tag Italic Tag  Underline Tag


In [29]:
# Remove URls

def remove_url(text):
    patterm = re.compile('https://S+www\.S+')
    return patterm.sub(r'' , text)

df['content'] = df['content'].apply(remove_url)

df['content'][2]


'funeral ceremony...gloomy friday...'

In [30]:
# Remove Punctuations

import string 

exclude = string.punctuation

def remove_pun(text):
    return text.translate(str.maketrans("" , "" , exclude))

df['content'] = df['content'].apply(remove_pun)

df['content'][2]
df['content'][5]


'repinging ghostridah14 why didnt you go to prom bc my bf didnt like my friends'

In [31]:
# Spelling correction

from textblob import TextBlob


wrong_text = 'repinging ghostridah14 whi didnt yuo go to prmo bc my bf didnt like my frieds'

obj = TextBlob(wrong_text)
obj.correct().string

'ringing ghostridah14 who didn you go to pro bc my of didn like my friends'

In [32]:
# Remove stop words

from nltk.corpus import stopwords

def remove_stopwords(text):
    new_text = []
    for word in text.split():
        if word in stopwords.words('english'):
            new_text.append('')
        else:
            new_text.append(word)
    temp = new_text[:]
    new_text.clear()
    return ' ' . join(temp)

df['content'] = df['content'].apply(remove_stopwords)

df['content'][2]
df['content'][5]

'repinging ghostridah14  didnt  go  prom bc  bf didnt like  friends'

In [33]:
# Remove Emojis 

def remove_emojis(text):
    emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return emoji_pattern.sub(r'',text)

test = 'This is a smiley face \U0001f602'
print(remove_emojis(test))

This is a smiley face 


In [34]:
# Tokenisation
# problems : prefix($) , suffix(?) , infix(/..) , Exceptions(U.S)

from nltk import sent_tokenize , word_tokenize

sent = 'Are you going to delhi ? cause in am'

print(sent_tokenize(sent))
print(word_tokenize(sent))


['Are you going to delhi ?', 'cause in am']
['Are', 'you', 'going', 'to', 'delhi', '?', 'cause', 'in', 'am']


In [35]:
import spacy

nlp = spacy.load('en_core_web_sm')

doc = nlp(sent)

print(doc)

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a Python package or a valid path to a data directory.

In [None]:
# Stemming ( word --> root )
# not nessearly a word in english languages
# info retirval systems

from nltk import PorterStemmer

ps = PorterStemmer()

def stem(text):
    return " " .join([ps.stem(word) for word in text.split()])



df['content'] = df['content'].apply(stem)

df['content'][2]
df['content'][5]
print(stem('walk walking walked'))



walk walk walk


In [None]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [None]:
# Lemmatization (Stemming but slow)

from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')

wnl = WordNetLemmatizer()

def lemm_word(text):
    return " " . join(wnl.lemmatize(word , pos='v') for word in text.split())

print(lemm_word('i was running and eating alone but gone to swimmming after this'))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\singh\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


i be run and eat alone but go to swimmming after this
