In [257]:
import pandas as pd
import numpy as np
import os
import math
import re
from collections import defaultdict, Counter
from tqdm import tqdm
import plotly.graph_objects as go
import nltk
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/vatsalsaglani/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/vatsalsaglani/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [29]:
text = 'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed! bag of words formats are provided. See the README file contained in the release for more details.'

In [35]:
# Assign only `text_2`: a chained `text_2 = text = ...` would also overwrite the
# `text` sample defined in the previous cell with this "!!!!" variant.
text_2 = 'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text!!!! and already processed bag of words formats are provided. See the README file contained in the release for more details.'

In [31]:
text

'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed bag of words formats are provided. See the README file contained in the release for more details.'

In [33]:
text_2

'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed bag of words formats are provided. See the README file contained in the release for more details.'

# Working with Regular Expressions

In [17]:
''.join(re.findall('[a-zA-Z0-9]', text))

'ThisisadatasetforbinarysentimentclassificationcontainingsubstantiallymoredatathanpreviousbenchmarkdatasetsWeprovideasetof25000highlypolarmoviereviewsfortrainingand25000fortestingThereisadditionalunlabeleddataforuseaswellRawtextandalreadyprocessedbagofwordsformatsareprovidedSeetheREADMEfilecontainedinthereleaseformoredetails'

A better way to remove the punctuation from `text`: clean each word separately so the spaces between words are kept (`!` is kept as well)

In [28]:
# Keep letters, digits and `!` in every space-separated word.
# `^` only negates a character class when it is the first character; anywhere
# else it is a literal caret, so it is left out of the class here.
' '.join(''.join(re.findall(r'[a-zA-Z0-9!]', txt)) for txt in text.split(' '))

'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets We provide a set of 25000 highly polar movie reviews for training and 25000 for testing There is additional unlabeled data for use as well Raw text and already processed! bag of words formats are provided See the README file contained in the release for more details'

In [36]:
# Same clean-up for `text_2`: keep letters, digits and `!` in every word.
# `^` is omitted from the class because, when not first, it matches a literal caret.
' '.join(''.join(re.findall(r'[a-zA-Z0-9!]', txt)) for txt in text_2.split(' '))

'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets We provide a set of 25000 highly polar movie reviews for training and 25000 for testing There is additional unlabeled data for use as well Raw text!!!! and already processed bag of words formats are provided See the README file contained in the release for more details'

In [37]:
name_text = 'My name is Vatsal Saglani'

In [47]:
re.findall('[^a-z]', name_text)

['M', ' ', ' ', ' ', 'V', ' ', 'S']

Finding `names` in a text 

In [111]:
[re.findall('[^a-z].[a-z]+', txt) for txt in name_text.split(' ')]

[[], [], [], ['Vatsal'], ['Saglani']]

In [120]:
' '.join([txt[0] for txt in [re.findall('[^a-z].[a-z]+', txt) for txt in name_text.split(' ')] if len(txt) > 0])


'Vatsal Saglani'

Finding `numbers` in a text

In [229]:
number_text = 'her phone number is (+91)-9790994455 and zip code is 560102'

`phonenumber` and `zipcodes`

In [238]:
# Keep every run of digits / `+` that is at least 6 characters long
# (long enough for a zip code, which also covers 10-digit phone numbers).
[num for num in re.findall('([+0-9]+)', number_text) if len(num) >= 6]

['9790994455', '560102']

In [239]:
# Raw string so that `\d` reaches the regex engine instead of being an invalid string escape.
re.findall(r'\d{10}', number_text)

['9790994455']

In [237]:
# `{6,10}` must be written without a space: with one, it is not a quantifier and
# the braces are matched literally, so nothing is found.
re.findall(r'\d{6,10}', number_text)

['9790994455', '560102']

`phonenumber`

In [241]:
number_text = 'her phone number is (+91) 979-099-4455 and zip code is 560102'

In [242]:
# Raw string so that `\d` reaches the regex engine instead of being an invalid string escape.
re.findall(r'\d{3}-\d{3}-\d{4}', number_text)

['979-099-4455']

In [249]:
# Take the first dashed phone number and drop the dashes to get the bare digits.
re.findall(r'\d{3}-\d{3}-\d{4}', number_text)[0].replace('-', '')

'9790994455'

List of special characters

- `!` - `unkexl` if the `word` before it is not known, else `exl{n}` where `n` is the number of `!` after the word. Example: `hey!!!` -> `idx(hey)` `exl3`
- `?` - `unkque` if the `word` before it is not known, else `que{n}` where `n` is the number of `?` after the word. Example: `what????` -> `idx(what)` `que4`
- `$` - `unkcurr` -> _Start of a character_
- `UPPERCASE` - `uprcs` -> _Upper case text_
- `_%` - `unkper` -> % symbol in a character
- `#` - `tag` if text after hashtag known else `unktag` if text after hashtag unknown.

_*add extra when needed_



In [255]:
from nltk.corpus import stopwords

In [260]:
len(set(stopwords.words('english')))

179

In [261]:
# ' '.join([''.join(re.findall('([a-zA-Z0-9^!])', txt)) for txt in text_2.split(' ')])

In [540]:
def clean_data(text):
    """Drop every character except ASCII letters, digits and ``!#$?%*&``.

    The text is trimmed, split on single spaces and filtered token by token,
    then re-joined with single spaces. A token that is filtered down to
    nothing therefore leaves an empty slot (two adjacent spaces) behind.
    """
    kept_tokens = []
    for token in text.strip().split(' '):
        allowed_chars = re.findall('([a-zA-Z0-9!#$?%*&])', token)
        kept_tokens.append(''.join(allowed_chars))
    return ' '.join(kept_tokens)

In [541]:
clean_data(text_2)

'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets We provide a set of 25000 highly polar movie reviews for training and 25000 for testing There is additional unlabeled data for use as well Raw text!!!! and already processed bag of words formats are provided See the README file contained in the release for more details'

In [520]:
text_3 = "This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed bag of words formats are provided. See the README file contained in the release for more $$$ details."


In [265]:
clean_data(text_3)

'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets We provide a set of 25000 highly polar movie reviews for training and 25000 for testing There is additional unlabeled data for use as well Raw text and already processed bag of words formats are provided See the README file contained in the release for more $$$ details'

In [266]:
def remove_stopwords(text):
    """Clean ``text`` with ``clean_data`` and drop English stopwords.

    Tokens are compared with the stopword list by exact string equality, so
    the match is case-sensitive.

    Returns the remaining tokens joined by single spaces.
    """
    cleaned = clean_data(text)

    # A set gives constant-time membership tests; scanning a list for every
    # token is needlessly slow on long texts.
    stop_set = set(stopwords.words('english'))

    return ' '.join(token for token in cleaned.split(' ') if token not in stop_set)

In [267]:
remove_stopwords(text_2)

'This dataset binary sentiment classification containing substantially data previous benchmark datasets We provide set 25000 highly polar movie reviews training 25000 testing There additional unlabeled data use well Raw text!!!! already processed bag words formats provided See README file contained release details'

In [268]:
remove_stopwords(text_3)

'This dataset binary sentiment classification containing substantially data previous benchmark datasets We provide set 25000 highly polar movie reviews training 25000 testing There additional unlabeled data use well Raw text already processed bag words formats provided See README file contained release $$$ details'

In [276]:
tt = 'hey !! red'
cc = re.search('[$%!#@]', tt)

In [281]:
if cc:
    print("True")
else: 
    print("False")

True


In [278]:
 spec_chars = lambda char: re.search('[$&#@!?]', char)

In [280]:
cc = spec_chars(tt)

In [285]:
tt = 'hey!!!$$$'

In [291]:
re.split("[!#@$%]", tt)

['hey', '', '', '', '', '', '']

In [294]:
# Collect, in order, every character of the sample token that is listed as special.
tt = 'hey!!'
spec_c = ['!', '@', '#', '$', '@', '?']
lst = [*tt]
spc_char = list(filter(spec_c.__contains__, lst))

In [295]:
spc_char

['!', '!']

In [326]:
tag = '#heydude'
re.split('#', tag)

['', 'heydude']

In [640]:
class Tokenize():
    """Space-delimited tokenizer whose vocabulary tags special characters and unknown words.

    A vocabulary is built from ``text_corpus`` on construction (and rebuilt by
    ``build_vocab`` / ``from_df``). Sentences can then be rewritten with tags
    (``get_tagged_sentence``) or turned into vocabulary positions
    (``get_indices``).

    Parameters:
        text_corpus: string the vocabulary is built from.
        remove_stop_words: when true, English stopwords (``stopwords`` from
            ``nltk.corpus``, which must be in scope) are removed before the
            vocabulary is built and before sentences are tagged.

    Attributes set by ``build_vocab``:
        vocab_set: list of vocabulary entries; an entry's position is its index.
        vocab_dict: ``defaultdict(list)`` mapping each lower-cased entry to the
            positions at which it occurs in ``vocab_set``.

    ``vocab_set`` is mirrored by a private lookup table, so rebuild the
    vocabulary through ``build_vocab`` rather than editing the list in place.
    """

    # Special characters whose associated text FOLLOWS them ("#tag", "$5");
    # for every other special character the text precedes it ("hey!!").
    _TEXT_FOLLOWS = ('#', '$', '*')
    # Character class matching any of the special characters.
    _SPECIAL_RE = '[!#$?%*&]'
    # Generic tags appended to every vocabulary for unknown plain tokens.
    _FALLBACK_TAGS = ['UNKNUM', 'UNKCAPSTR', 'UNKSTR']

    def __init__(self, text_corpus, remove_stop_words = False):
        self.remove = remove_stop_words
        # Tag emitted for a special character that was seen while building the vocabulary.
        self.special_chars = {
            '!': "EXL",
            '#': "TAG",
            "$": "CUR",
            "?": "QUE",
            "%": "PER",
            "*": "CND",
            "&": "AND"
        }
        # Tag emitted for a special character that was NOT seen while building the vocabulary.
        self.unk_chars = {
            '!': "UNKEXL",
            '#': "UNKTAG",
            "$": "UNKCUR",
            "?": "UNKQUE",
            "%": "UNKPER",
            "*": "UNKCND",
            "&": "UNKAND"
        }

        self.special_c = ['!', '#', '$', '?', '%', '*', '&']
        # Match object when the string starts with an ASCII letter, else None.
        self.can_flt = lambda char: re.match('[A-Za-z]', char)
        # Match object when the string contains a letter, digit or underscore, else None.
        self.is_txt = lambda txt: re.search('([_a-zA-Z0-9_])', txt.strip())
        # Match object when the string contains any special character, else None.
        self.special_ch = lambda char: re.search('[#$?!%*&]', char)
        self.text_corpus = text_corpus

        if self.remove:
            # A set makes the per-token stopword test constant time.
            self.stopwords = set(stopwords.words('english'))

        self.build_vocab(self.text_corpus)

    def clean_data(self, text):
        """Return ``text`` with only ASCII letters, digits and ``!#$?%*&`` kept in each token.

        The text is trimmed and split on single spaces; each token is filtered
        and the tokens are re-joined with single spaces, so a token that is
        filtered away entirely leaves an empty slot (two adjacent spaces).
        """
        return ' '.join(
            ''.join(re.findall('[a-zA-Z0-9!#$?%*&]', token))
            for token in text.strip().split(' ')
        )

    def remove_stopwords(self, text):
        """Clean ``text`` and drop every token found in ``self.stopwords``.

        The filter runs on the cleaned text, so punctuation attached to a
        stopword ("is,") does not stop it from being removed. The cleaned text
        is also stored on ``self.text``.

        Raises AttributeError if the instance was created without
        ``remove_stop_words`` and ``self.stopwords`` was never set.
        """
        self.text = self.clean_data(text)
        return ' '.join(
            token for token in self.text.split(' ') if token not in self.stopwords
        )

    def _normalise(self, text):
        """Clean ``text``, additionally removing stopwords when that option is on."""
        if self.remove:
            return self.remove_stopwords(text)
        return self.clean_data(text)

    def _unknown_tag(self, token):
        """Return the generic tag for a plain token that is not in the vocabulary.

        ``UNKNUM`` when the token does not start with an ASCII letter (this
        includes the empty string), ``UNKCAPSTR`` when ``str.isupper`` is true
        for it, ``UNKSTR`` otherwise.
        """
        if not self.can_flt(token):
            return 'UNKNUM'
        if token.isupper():
            return 'UNKCAPSTR'
        return 'UNKSTR'

    def build_vocab(self, text_corpus):
        """Build ``vocab_set`` / ``vocab_dict`` from ``text_corpus`` and return both.

        Tokens without special characters are added as they are. A token that
        STARTS with ``$ ? ! % * #`` adds the character's tag and the text after
        the last occurrence of that character (which may be the empty string,
        e.g. for ``$$$``). A lone ``&`` adds ``AND``. A token that contains a
        special character anywhere else (``text!!!!``, ``&co``) adds nothing.

        After the corpus, one ``UNK...`` tag is appended for every special
        character whose tag is absent, followed by ``UNKNUM``, ``UNKCAPSTR``
        and ``UNKSTR``.
        """
        text_corpus = self._normalise(text_corpus)

        vocab = []
        seen = set()  # mirrors `vocab` for constant-time membership tests

        def add(entry):
            vocab.append(entry)
            seen.add(entry)

        for token in text_corpus.split(' '):
            if token in seen:
                continue
            if not self.special_ch(token):
                add(token)
                continue

            lead = token[0]
            if lead == '&':
                # Only a stand-alone ampersand is recorded.
                if token == '&' and self.special_chars['&'] not in seen:
                    add(self.special_chars['&'])
                continue
            if lead not in self.special_chars:
                # Special character is not leading: nothing is recorded.
                continue

            tag = self.special_chars[lead]
            tail = token.split(lead)[-1]
            # Both checks are made before either entry is added.
            tag_is_new = tag not in seen
            tail_is_new = tail not in seen
            if tag_is_new:
                add(tag)
            if tail_is_new:
                add(tail)

        # Placeholder tags for special characters the corpus never introduced.
        missing_tags = [
            self.unk_chars[char]
            for char, tag in self.special_chars.items()
            if tag not in seen
        ]
        vocab.extend(missing_tags)
        vocab.extend(self._FALLBACK_TAGS)

        self.vocab_set = vocab  # for str_to_int

        # First position of every entry, i.e. what list.index would return.
        self._vocab_index = {}
        for position, entry in enumerate(vocab):
            self._vocab_index.setdefault(entry, position)

        self.vocab_dict = defaultdict(list)
        for position, entry in enumerate(vocab):
            self.vocab_dict[entry.lower()].append(position)

        return self.vocab_set, self.vocab_dict

    def get_vocab_len(self):
        """Return the vocabulary size, or the string "Build a vocab first" when it is empty."""
        if len(self.vocab_set) > 0:
            return len(self.vocab_set)
        else:
            return "Build a vocab first"

    def get_tagged_sentence(self, text):
        """Rewrite ``text`` so that every token is a vocabulary entry.

        Known tokens are kept. An unknown plain token becomes ``UNKNUM``,
        ``UNKCAPSTR`` or ``UNKSTR``. An unknown token containing a special
        character becomes the tag of its first special character (the ``UNK...``
        variant when that tag is not in the vocabulary); if the token also
        contains a letter, digit or underscore, it is followed by its word part:
        the text after the last special character for ``# $ *``, the text before
        the first one otherwise. The word part is kept when known, else replaced
        by a generic unknown tag.

        Returns the tagged tokens joined by single spaces, or the string
        "Build a vocab first" when the vocabulary is empty.
        """
        text = self._normalise(text)

        if len(self.vocab_set) == 0:
            return "Build a vocab first"

        known = self._vocab_index
        tagged = []
        for token in text.split(' '):
            if token in known:
                tagged.append(token)
                continue
            if not self.special_ch(token):
                tagged.append(self._unknown_tag(token))
                continue

            lead = re.findall(self._SPECIAL_RE, token)[0]
            tag = self.special_chars[lead]
            tagged.append(tag if tag in known else self.unk_chars[lead])

            if self.is_txt(token):
                pieces = re.split(self._SPECIAL_RE, token)
                word = pieces[-1] if lead in self._TEXT_FOLLOWS else pieces[0]
                tagged.append(word if word in known else self._unknown_tag(word))

        return ' '.join(tagged)

    def get_indices(self, text):
        """Return a numpy array with the vocabulary position of every tagged token of ``text``.

        The text is normalised here and again inside ``get_tagged_sentence``;
        the second pass trims the edge spaces that a leading or trailing token
        made only of stripped characters leaves behind, so such a token does
        not produce an entry.

        Raises ValueError if a tagged token is not in ``vocab_set``.
        """
        tagged_text = self.get_tagged_sentence(self._normalise(text))

        positions = []
        for token in tagged_text.strip().split(' '):
            position = self._vocab_index.get(token)
            if position is None:
                # Fall back to the list itself; raises ValueError when absent.
                position = self.vocab_set.index(token)
            positions.append(position)

        return np.array(positions)

    def from_df(self, dataframe, columname):
        """Rebuild the vocabulary from the strings in ``dataframe[columname]``.

        The column values are joined with single spaces into one corpus string
        that is passed to ``build_vocab``.
        """
        rows = dataframe[columname].values.tolist()
        self.build_vocab(' '.join(rows))
        
   

In [572]:
tweet_text = "He’s making his list and checking it twice. #Batman80 #LongLiveTheBat  "

In [573]:
tweet_text

'He’s making his list and checking it twice. #Batman80 #LongLiveTheBat  '

In [574]:
clean_data(tweet_text)

'Hes making his list and checking it twice #Batman80 #LongLiveTheBat'

In [575]:
tweet_text.strip()

'He’s making his list and checking it twice. #Batman80 #LongLiveTheBat'

In [576]:
clean_data(tweet_text)

'Hes making his list and checking it twice #Batman80 #LongLiveTheBat'

In [595]:
Tokenize(tweet_text).clean_data(tweet_text)

'Hes making his list and checking it twice #Batman80 #LongLiveTheBat'

In [596]:
cc = Tokenize(tweet_text)

In [597]:
cc.vocab_set

['Hes',
 'making',
 'his',
 'list',
 'and',
 'checking',
 'it',
 'twice',
 'TAG',
 'Batman80',
 'LongLiveTheBat',
 'UNKEXL',
 'UNKCUR',
 'UNKQUE',
 'UNKPER',
 'UNKCND',
 'UNKAND',
 'UNKNUM',
 'UNKCAPSTR',
 'UNKSTR']

In [598]:
cc.vocab_dict

defaultdict(list,
            {'hes': [0],
             'making': [1],
             'his': [2],
             'list': [3],
             'and': [4],
             'checking': [5],
             'it': [6],
             'twice': [7],
             'tag': [8],
             'batman80': [9],
             'longlivethebat': [10],
             'unkexl': [11],
             'unkcur': [12],
             'unkque': [13],
             'unkper': [14],
             'unkcnd': [15],
             'unkand': [16],
             'unknum': [17],
             'unkcapstr': [18],
             'unkstr': [19]})

In [567]:
text_3 = text_3 + ' !!' + ' ?' + ' %'

In [581]:
text_3

'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets. We provide a set of 25,000 highly polar movie reviews for training, and 25,000 for testing. There is additional unlabeled data for use as well. Raw text and already processed bag of words formats are provided. See the README file contained in the release for more $$$ details. BINARY !! ? %'

In [641]:
cc = Tokenize(text_2)

In [642]:
cc.get_tagged_sentence(text_3)

'This is a dataset for binary sentiment classification containing substantially more data than previous benchmark datasets We provide a set of 25000 highly polar movie reviews for training and 25000 for testing There is additional unlabeled data for use as well Raw UNKSTR and already processed bag of words formats are provided See the README file contained in the release for more UNKCUR details UNKCAPSTR UNKEXL UNKQUE UNKPER'

In [643]:
cc.get_indices(text_3)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17,  2, 18, 19, 20, 21, 22, 23, 24,  4, 25, 26, 20,  4, 27, 28,  1,
       29, 30, 11,  4, 31, 32, 33, 34, 59, 26, 35, 36, 37, 19, 38, 39, 40,
       41, 42, 43, 44, 45, 46, 47, 43, 48,  4, 10, 52, 49, 58, 50, 53, 54])

In [583]:
cc[0]

['This',
 'is',
 'a',
 'dataset',
 'for',
 'binary',
 'sentiment',
 'classification',
 'containing',
 'substantially',
 'more',
 'data',
 'than',
 'previous',
 'benchmark',
 'datasets',
 'We',
 'provide',
 'set',
 'of',
 '25000',
 'highly',
 'polar',
 'movie',
 'reviews',
 'training',
 'and',
 'testing',
 'There',
 'additional',
 'unlabeled',
 'use',
 'as',
 'well',
 'Raw',
 'text',
 'already',
 'processed',
 'bag',
 'words',
 'formats',
 'are',
 'provided',
 'See',
 'the',
 'README',
 'file',
 'contained',
 'in',
 'release',
 'CUR',
 '',
 'details',
 'BINARY',
 'EXL',
 'QUE',
 'PER',
 'UNKTAG',
 'UNKCND',
 'UNKAND',
 'UNKNUM',
 'UNKCAPSTR',
 'UNKSTR']