In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
# Read the semicolon-delimited training file. There is no header row,
# so pandas labels the columns with the integers 0 and 1.
df = pd.read_csv('train.txt', header=None, delimiter=';')
df.head()

Unnamed: 0,0,1
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [3]:
# Replace the positional column labels with descriptive names:
# 'para' for the text and 'emo' for the emotion label.
column_names = {0: 'para', 1: 'emo'}
df.rename(columns=column_names, inplace=True)
df.head()

Unnamed: 0,para,emo
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger


In [4]:
# Count the missing values in each column.
df.isna().sum()

para    0
emo     0
dtype: int64

## Manual Encoding of Emotions

In [16]:
# Collect the distinct emotion labels present in the 'emo' column.
unique_emo = pd.unique(df['emo'])
unique_emo

array(['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'],
      dtype=object)

In [17]:
# Integer code for each emotion name, numbered from 1 in the order listed.
emo_label = {
    emotion: code
    for code, emotion in enumerate(
        ['sadness', 'anger', 'love', 'surprise', 'fear', 'joy'], start=1
    )
}

In [19]:
# Encode the emotion names as integers. Series.map turns every value that is
# not a key of emo_label into NaN, so validate before overwriting the column.
# This also catches an accidental re-run of this cell on already-encoded data,
# which would otherwise silently replace the whole column with NaN.
emo_encoded = df['emo'].map(emo_label)
if emo_encoded.isna().any():
    raise ValueError("df['emo'] contains values that are not keys of emo_label")
df['emo'] = emo_encoded
df.head()

Unnamed: 0,para,emo
0,i didnt feel humiliated,1
1,i can go from feeling so hopeless to so damned...,1
2,im grabbing a minute to post i feel greedy wrong,2
3,i am ever feeling nostalgic about the fireplac...,3
4,i am feeling grouchy,2


## Data Cleaning

### Converting to Lowercase

In [22]:
# Lowercase every paragraph with the vectorized string accessor instead of a
# per-row Python lambda.
df['para'] = df['para'].str.lower()

### Removing Punctuations

#### 1) using string functions

In [26]:
import string

In [27]:
# Built once: a str.translate table that deletes every character listed in
# string.punctuation (previously rebuilt on every call).
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def remove_punc(txt):
    """Return ``txt`` with all ``string.punctuation`` characters removed.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        ``txt`` without ASCII punctuation; every other character, including
        digits and whitespace, is left untouched.
    """
    return txt.translate(_PUNCT_TABLE)

In [30]:
# Strip punctuation from every paragraph, then preview the frame.
df['para'] = df['para'].map(remove_punc)
df.head()

Unnamed: 0,para,emo
0,i didnt feel humiliated,1
1,i can go from feeling so hopeless to so damned...,1
2,im grabbing a minute to post i feel greedy wrong,2
3,i am ever feeling nostalgic about the fireplac...,3
4,i am feeling grouchy,2


#### 2) Removing punctuation using regular expressions

In [23]:
import regex as rg

In [45]:
def remove_pun(txt):
    """Return ``txt`` keeping only ASCII letters and whitespace.

    The pattern is a negated character class:
      ^    inside [] negates the class
      a-z  lowercase letters
      A-Z  uppercase letters
      \\s   whitespace
    so every character that is NOT one of these (punctuation, digits, ...)
    is replaced with the empty string.
    """
    # rg.sub(pattern, replacement, text)
    not_letter_or_space = r'[^a-zA-Z\s]'
    return rg.sub(not_letter_or_space, '', txt)

This regex removes all of the following:

Punctuation: ! , . ?

Digits: 0–9

Emojis: 🙂 🔥 💔

Symbols: ₹ € ©

Accented letters: é ñ ü

Non-English scripts: हिंदी

In [46]:
# Run the regex-based cleaner over every paragraph and show the column.
letters_only = df['para'].apply(remove_pun)
df['para'] = letters_only
df['para']

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
15995    i just had a very brief time in the beanbag an...
15996    i am now turning and i feel pathetic that i am...
15997                       i feel strong and good overall
15998    i feel like this was such a rude comment and i...
15999    i know a lot but i feel so stupid because i ca...
Name: para, Length: 16000, dtype: object

In [47]:
# Preview the first rows of the frame after the regex cleaning step.
df.head()

Unnamed: 0,para,emo
0,i didnt feel humiliated,1
1,i can go from feeling so hopeless to so damned...,1
2,im grabbing a minute to post i feel greedy wrong,2
3,i am ever feeling nostalgic about the fireplac...,3
4,i am feeling grouchy,2


### Removing Emojis

In [54]:
# With a manual function

def remove_emoji(txt):
    """Return ``txt`` with every non-ASCII character removed.

    Emoji fall outside the ASCII range (code points 0-127), so they are
    dropped. Note that ASCII covers letters, digits, punctuation, whitespace
    and control characters, and that *all* other characters are removed too,
    e.g. accented letters and non-Latin scripts, not only emoji.

    Parameters
    ----------
    txt : str
        Text to clean.

    Returns
    -------
    str
        The ASCII-only characters of ``txt`` in their original order.
    """
    # join() builds the result in one pass instead of repeated string +=.
    return ''.join(ch for ch in txt if ch.isascii())

# Similarly, digits can be removed by keeping only the characters for which str.isdigit() is False.

In [53]:
# Drop non-ASCII characters (emoji included) from each paragraph and show the column.
df['para'] = df['para'].map(remove_emoji)
df['para']

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
15995    i just had a very brief time in the beanbag an...
15996    i am now turning and i feel pathetic that i am...
15997                       i feel strong and good overall
15998    i feel like this was such a rude comment and i...
15999    i know a lot but i feel so stupid because i ca...
Name: para, Length: 16000, dtype: object

### Removing extra whitespace

In [56]:
def clean_whitespace(txt):
    """Collapse each run of whitespace in ``txt`` to one space and trim both ends."""
    collapsed = rg.sub(r'\s+', ' ', txt)
    return collapsed.strip()

# \s  : matches a single whitespace character, e.g. space, \t, \n, \r, \f, \v
# \s+ : matches a run of one or more consecutive whitespace characters
# .strip() removes whitespace only at the beginning and end of the text, not in the middle

In [57]:
# Normalize the whitespace of every paragraph.
df['para'] = df['para'].map(clean_whitespace)

In [58]:
# Display the text column after whitespace normalization.
df['para']

0                                  i didnt feel humiliated
1        i can go from feeling so hopeless to so damned...
2         im grabbing a minute to post i feel greedy wrong
3        i am ever feeling nostalgic about the fireplac...
4                                     i am feeling grouchy
                               ...                        
15995    i just had a very brief time in the beanbag an...
15996    i am now turning and i feel pathetic that i am...
15997                       i feel strong and good overall
15998    i feel like this was such a rude comment and i...
15999    i know a lot but i feel so stupid because i ca...
Name: para, Length: 16000, dtype: object

### Removing HTML tags, for web-scraped data

We use the regular expression r'<[^>]+>':

`<` matches the literal opening angle bracket of a tag.

`[ ]` is a character class; a leading `^` inside it negates the class, and `>` is the literal character being excluded,

so `[^>]` matches any character that is NOT `>`.

`+` means one or more characters: keep matching `[^>]` for as long as possible.

The final `>` matches the literal closing angle bracket.

### Removing Stop Words
'is' 'the' 'are'

### We use NLTK (the Natural Language Toolkit)

In [61]:
import nltk

In [60]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize


In [77]:
# The tokenizer and stopword resources must be downloaded before first use.

# Punkt is a pre-trained sentence tokenizer; fetch both of its packages.
for tokenizer_pkg in ('punkt', 'punkt_tab'):
    nltk.download(tokenizer_pkg)

# Stopword lists; this notebook uses the English one.
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\vatsc\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\vatsc\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt_tab.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vatsc\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [78]:
# Keep the English stopwords in a set so membership checks are cheap.
english_stopword_list = stopwords.words('english')
stop_words = set(english_stopword_list)
stop_words

{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [79]:
# Number of distinct English stopwords in the set.
len(stop_words)

198

### Tokenizing & Stopword removal

In [80]:
def remove_stop(txt):
    """Tokenize ``txt`` and return it with stopwords dropped.

    Tokens come from ``word_tokenize``; any token found in ``stop_words`` is
    discarded and the remaining tokens are joined with single spaces into
    one string (returning the list itself would not give a string).

    NOTE(review): ``stop_words`` lists contractions with their apostrophes
    (e.g. "didn't"), so forms whose apostrophes were already stripped
    (e.g. "didnt") are not filtered out here.
    """
    # Membership test against the stop_words collection, not an equality check.
    kept_tokens = [token for token in word_tokenize(txt) if token not in stop_words]
    # separator.join(iterable_of_strings) merges the tokens into ONE string.
    return ' '.join(kept_tokens)

In [82]:
# Remove stopwords from every paragraph and show the resulting column.
df['para'] = df['para'].map(remove_stop)
df['para']

0                                    didnt feel humiliated
1        go feeling hopeless damned hopeful around some...
2                im grabbing minute post feel greedy wrong
3        ever feeling nostalgic fireplace know still pr...
4                                          feeling grouchy
                               ...                        
15995        brief time beanbag said anna feel like beaten
15996    turning feel pathetic still waiting tables sub...
15997                             feel strong good overall
15998                       feel like rude comment im glad
15999                         know lot feel stupid portray
Name: para, Length: 16000, dtype: object