# Vectorizing Evaluation
This notebook examines different versions of word vectorizing using the `CountVectorizer` 
from the `sklearn` library.

## Essential imports

In [244]:
# Import essentials for all versions
from functools import lru_cache

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [245]:
# Load the training set; the text to vectorize lives in the `clean_text` column.
df = pd.read_csv('../data/train.csv')
# Missing texts are read as NaN, and a bare `astype(str)` would turn them into
# the literal string 'nan' (which then shows up as a vocabulary token).
# Replace them with empty strings first so every row is kept but adds no tokens.
documents = df['clean_text'].fillna('').values.astype(str)

## Plain version

In [246]:
# Baseline: default settings, no frequency filtering and the default token pattern.
# `fit` returns the estimator itself, so construction and fitting can be chained.
cv = CountVectorizer().fit(documents)
vocab = cv.get_feature_names_out()
print(f'Size of vocabulary: {len(vocab)}')

Size of vocabulary: 78073


We got a huge vocabulary! 

Let's examine the first 50 words:

In [247]:
print(vocab[:50])

['000' '000000' '00000000' '000000000' '0000000000' '0000000001' '0001'
 '0005' '000s' '001' '001to' '002' '007' '007james' '00xxxxxxxxx'
 '01042014they' '01072014' '0118' '018sec' '01992017' '01surgical' '021'
 '0229' '025' '0250' '02antisatellite' '03122016' '032019i' '0339' '03it'
 '03jalgaon' '041' '0414' '04280' '0448pm' '046' '047' '048' '04yrs'
 '0500' '05042019' '0554' '05852234246' '05yrs' '06042019' '0607' '0609'
 '062' '06mar2019' '0700']


Clearly these are words that are not so informative. Some of them are just numbers/usernames.

Let's examine the next 50:

In [248]:
print(vocab[50:100])

['072' '074223z' '080' '0800' '080916' '081' '081116' '081127z' '089'
 '08th' '090' '0914' '092807z' '0930' '0945' '09555560725' '09999150812'
 '09and' '09th' '0deposit' '0pposed' '0seats' '100' '1000' '10000'
 '100000' '1000000' '1000000000000000000000' '1000000000000000true'
 '1000000k' '100000s' '10000rs' '10000s' '10000x' '1000500' '1000am'
 '1000cr' '1000cuts' '1000failures' '1000s' '1000times' '1000x' '1001'
 '100100' '100150' '100200' '1008' '100agree' '100backup' '100better']


## Using the `min_df` argument
`min_df` excludes words whose document frequency is strictly lower than its value.

In [249]:
# Keep only the terms that occur in at least 3 documents.
cv = CountVectorizer(min_df=3).fit(documents)
vocab = cv.get_feature_names_out()
print(f'Vocabulary size: {len(vocab)}')

Vocabulary size: 22069


The size was reduced significantly!

Let's Examine it again

In [250]:
print(vocab[:150])

['000' '001' '002' '09th' '100' '1000' '10000' '100000' '1000000' '10000s'
 '1000s' '100k' '100m' '100s' '100th' '101' '1010' '1011' '1012' '1015'
 '102' '10279' '103' '1030' '104' '105' '106' '1068' '106800' '107' '108'
 '10cr' '10jp' '10k' '10l' '10part' '10s' '10th' '10x' '10years' '10yrs'
 '110' '1100' '11000' '110000' '111' '112' '1130' '114' '1145' '11451200'
 '1145am' '115' '11500' '117' '1179' '117cr' '118' '119' '11th' '120'
 '1200' '12000' '120000' '12000cr' '1207' '1210' '1214' '122' '123' '1230'
 '124' '125' '1250' '125cr' '126' '127' '128' '12k' '12pm' '12rs' '12th'
 '130' '13000' '130cr' '131' '132' '133' '134' '135' '13500' '136' '137'
 '138' '13cr' '13rd' '13th' '13yearold' '140' '1400' '14000' '14000crore'
 '141' '142' '144' '149th' '14k' '14l' '14th' '150' '1500' '15000'
 '150000' '1500000' '15000cr' '1500km' '1520' '153' '155' '157200000'
 '159' '15k' '15l' '15lac' '15lacs' '15lak' '15lakh' '15lakhs' '15lks'
 '15th' '15ଲକ' '160' '1600' '16000' '16fold' '16th' '170' '

We see significant improvement but uninformative numbers still remain in the vocabulary.

In [251]:
print(vocab[-150:])

['yeddy' 'yeddyurappa' 'yedurappa' 'yeh' 'yehi' 'yell' 'yelling' 'yellow'
 'yemen' 'yen' 'yep' 'yes' 'yesit' 'yesman' 'yesmodi' 'yess' 'yesss'
 'yest' 'yesterday' 'yesterdays' 'yesteryears' 'yet' 'yhe' 'yhi' 'yield'
 'yielded' 'yielding' 'yiu' 'yoddha' 'yog' 'yoga' 'yogendra' 'yogi'
 'yogiji' 'yogimodi' 'yogis' 'yogna' 'yojan' 'yojana' 'yojanas' 'yojna'
 'york' 'yorkers' 'you' 'youand' 'youd' 'youi' 'youjai' 'youjust'
 'youkeep' 'youll' 'youmodi' 'young' 'younger' 'youngest' 'youngster'
 'youngsters' 'your' 'youre' 'yours' 'yourself' 'yourselves' 'youth'
 'youthe' 'youthey' 'youths' 'youtube' 'youtuber' 'youve' 'youwe' 'youyou'
 'yov' 'yoy' 'ypu' 'yrs' 'ysr' 'ysrc' 'ysrcp' 'ystrdy' 'yuck' 'yummy'
 'yup' 'yur' 'yuva' 'yuvraj' 'zafar' 'zaid' 'zakia' 'zakir' 'zameen'
 'zameer' 'zara' 'zardari' 'zaroor' 'zarurat' 'zeal' 'zealand' 'zealots'
 'zee' 'zeenews' 'zeher' 'zenith' 'zero' 'zest' 'zia' 'zilch' 'zimbabwe'
 'zindabaad' 'zindabad' 'zionist' 'zombie' 'zombies' 'zone' 'zones' 'zoom'
 'zo

In [280]:
# Treat every run of one or more non-whitespace characters as a token, so
# punctuation, symbols and emojis are kept alongside ordinary words.
non_whitespace_options = {
    'min_df': 3,
    'token_pattern': r'\S{1,}',
}
cv = CountVectorizer(**non_whitespace_options).fit(documents)
vocab = cv.get_feature_names_out()
print(f'Vocabulary size: {len(vocab)}')

Vocabulary size: 22597


In [281]:
print(vocab[-150:])

['‘you' '‘’' '’' '’all' '’ble' '’’' '“' '“100' '“after' '“all' '“best'
 '“chaukidar' '“chowkidar' '“chowkidar”' '“come' '“congress' '“damaged”'
 '“don’' '“drdo' '“during' '“first' '“free' '“from' '“garibi' '“give'
 '“gullidanda”' '“happy' '“hopeless”' '“how' '“important' '“india'
 '“indian' '“india’' '“indisputably' '“like' '“main' '“make'
 '“masterstroke”' '“mein' '“mission' '“modi' '“modi’' '“modi”' '“narendra'
 '“new' '“not' '“nothing”' '“now' '“one' '“only' '“pakistan' '“people'
 '“political' '“prime' '“put' '“rajan' '“real' '“sab' '“serious' '“space'
 '“special' '“surgical' '“swatch' '“teach' '“the' '“there' '“they' '“this'
 '“what' '“what’' '“when' '“why' '“with' '“you' '“’' '“”' '”' '•' '…' '‼️'
 '\u2060' '\u2066' '\u2066\u2066' '₹' '₹1068' '₹12000' '₹1500000' '₹2000'
 '₹350000cr' '₹37000' '₹5000cr' '₹6000' '₹6000month' '₹72000' '⃣' '℅' '▪️'
 '▪️rahul' '▶️' '►' '●' '☕' '☝️' '☹' '☹️' '☺' '☺☺' '☺☺☺' '☺️' '☺️☺️' '♥'
 '♥️' '♦️' '♦️ancient' '♦️clean' '♦️wayanad' '♦️will' '⚛️' '⚡' '⚡a

## Custom preprocessing
We will use the `re` module to remove words that start with digits

In [180]:
import re

In [181]:
def drop_digit_leading_tokens(text):
    """Lower-case `text` and remove every token that starts with a digit.

    Supplying `preprocessor=` to CountVectorizer replaces its built-in
    preprocessing stage (lower-casing and accent stripping), so the
    lower-casing is done here explicitly. A named function is used instead of
    a lambda so the fitted vectorizer stays picklable.
    """
    # `\b\d+` anchors on a token that begins with digits; the lazy `.*?\b`
    # then extends the match to the end of that same token (e.g. '007james').
    return re.sub(r'\b\d+.*?\b', '', text.lower())


cv = CountVectorizer(min_df=3,
                     preprocessor=drop_digit_leading_tokens,
                     binary=True)
cv.fit(documents)

In [182]:
vocab = cv.get_feature_names_out()
print(f'Size of vocabulary: {len(vocab)}')

Size of vocabulary: 21477


Slight improvement: ~600 fewer words

In [183]:
print(vocab[:150])

['aaa' 'aab' 'aabki' 'aacha' 'aache' 'aadani' 'aadat' 'aadhaar' 'aadhar'
 'aadmi' 'aadre' 'aag' 'aagaye' 'aage' 'aagi' 'aah' 'aai' 'aaj' 'aajkal'
 'aajtak' 'aaka' 'aakash' 'aake' 'aakhri' 'aal' 'aalo' 'aam' 'aamir'
 'aana' 'aandhi' 'aane' 'aankh' 'aankhe' 'aao' 'aap' 'aapas' 'aapcong'
 'aapka' 'aapke' 'aapki' 'aapko' 'aapne' 'aaps' 'aapse' 'aaptards' 'aare'
 'aarti' 'aasaram' 'aasmaan' 'aasman' 'aata' 'aate' 'aati' 'aatma' 'aay'
 'aaya' 'aaye' 'aayega' 'aayegi' 'aayenge' 'aayi' 'aayog' 'aayushman'
 'abandon' 'abandoned' 'abandoning' 'abb' 'abba' 'abbas' 'abbe'
 'abbreviation' 'abbreviations' 'abc' 'abcd' 'abd' 'abducted' 'abducting'
 'abduction' 'abdul' 'abdulla' 'abdullah' 'abdullahs' 'abe' 'abey' 'abh'
 'abhay' 'abhi' 'abhijit' 'abhinadan' 'abhinandan' 'abhinandans'
 'abhinandhan' 'abhinav' 'abhisaar' 'abhisar' 'abhishek' 'abhiyaan'
 'abhiyan' 'abi' 'abide' 'abiding' 'abilities' 'ability' 'abinandan'
 'abject' 'abki' 'able' 'abolish' 'abolished' 'abolishing' 'abolition'
 'abounding' 

### Another approach

In [184]:
# Same goal as the custom preprocessor, expressed through the token pattern:
# a token must start with a word character that is not a digit and be
# followed by at least one more word character.
no_leading_digit_options = {
    'min_df': 3,
    'token_pattern': r'\b[^\d\W]\w+\b',
    'binary': True,
}
cv = CountVectorizer(**no_leading_digit_options)
cv.fit(documents)

In [185]:
vocab = cv.get_feature_names_out()
print(f'Size of vocabulary: {len(vocab)}')

Size of vocabulary: 21477


## Non-ASCII alphanumeric words

In [186]:
print(vocab[-100:])

['youll' 'youmodi' 'young' 'younger' 'youngest' 'youngster' 'youngsters'
 'your' 'youre' 'yours' 'yourself' 'yourselves' 'youth' 'youthe' 'youthey'
 'youths' 'youtube' 'youtuber' 'youve' 'youwe' 'youyou' 'yov' 'yoy' 'ypu'
 'yrs' 'ysr' 'ysrc' 'ysrcp' 'ystrdy' 'yuck' 'yummy' 'yup' 'yur' 'yuva'
 'yuvraj' 'zafar' 'zaid' 'zakia' 'zakir' 'zameen' 'zameer' 'zara'
 'zardari' 'zaroor' 'zarurat' 'zeal' 'zealand' 'zealots' 'zee' 'zeenews'
 'zeher' 'zenith' 'zero' 'zest' 'zia' 'zilch' 'zimbabwe' 'zindabaad'
 'zindabad' 'zionist' 'zombie' 'zombies' 'zone' 'zones' 'zoom' 'zooms'
 'zor' 'zoya' 'zumla' 'zyada' 'अपन' 'आइय' 'आदम' 'आपक' 'आपन' 'इतन' 'उनल'
 'उसक' 'करत' 'करन' 'कहत' 'चमच' 'तआप' 'तरम' 'पहल' 'बदल' 'बहन' 'रएकब' 'रबच'
 'लटल' 'शबच' 'षकभर' 'सकत' 'सबक' 'सबस' 'समझ' 'सरक' 'हतर' 'வகன' 'ಬಳಸ']


In [187]:
doc_str = documents.astype(str)


In [188]:
# Count the documents that contain the heart symbol at least once.
# `np.strings.find` yields -1 for every document where the symbol is absent.
heart_positions = np.strings.find(doc_str, '♥')
m = np.where(heart_positions >= 0, 1, 0)
m.sum()

np.int64(23)

In [189]:
# Same token pattern as before, but with the inline `(?a)` flag so that
# `\w`, `\d` and `\b` only consider ASCII characters.
ascii_only_options = {
    'min_df': 3,
    'token_pattern': r'(?a)\b[^\d\W]\w+\b',
    'binary': True,
}
cv = CountVectorizer(**ascii_only_options)
cv.fit(documents)

In [190]:
vocab = cv.get_feature_names_out()
print(f'Size of vocabulary: {len(vocab)}')

Size of vocabulary: 21447


In [191]:
print(vocab[-100:])

['yet' 'yhe' 'yhi' 'yield' 'yielded' 'yielding' 'yiu' 'yoddha' 'yog'
 'yoga' 'yogendra' 'yogi' 'yogiji' 'yogimodi' 'yogis' 'yogna' 'yojan'
 'yojana' 'yojanas' 'yojna' 'york' 'yorkers' 'you' 'youand' 'youd' 'youi'
 'youjai' 'youjust' 'youkeep' 'youll' 'youmodi' 'young' 'younger'
 'youngest' 'youngster' 'youngsters' 'your' 'youre' 'yours' 'yourself'
 'yourselves' 'youth' 'youthe' 'youthey' 'youths' 'youtube' 'youtuber'
 'youve' 'youwe' 'youyou' 'yov' 'yoy' 'ypu' 'yrs' 'ysr' 'ysrc' 'ysrcp'
 'ystrdy' 'yuck' 'yummy' 'yup' 'yur' 'yuva' 'yuvraj' 'zafar' 'zaid'
 'zakia' 'zakir' 'zameen' 'zameer' 'zara' 'zardari' 'zaroor' 'zarurat'
 'ze' 'zeal' 'zealand' 'zealots' 'zee' 'zeenews' 'zeher' 'zenith' 'zero'
 'zest' 'zia' 'zilch' 'zimbabwe' 'zindabaad' 'zindabad' 'zionist' 'zombie'
 'zombies' 'zone' 'zones' 'zoom' 'zooms' 'zor' 'zoya' 'zumla' 'zyada']


## Using the nltk library

### Using the nltk `word_tokenize` tokenizer

In [192]:
from nltk.tokenize import word_tokenize

def strip_ascii_digit_tokens(text):
    """Lower-case `text` and remove every token that starts with an ASCII digit.

    A custom `preprocessor` replaces CountVectorizer's built-in preprocessing
    stage (lower-casing and accent stripping), so the lower-casing is done
    here explicitly. A named function is used instead of a lambda so the
    fitted vectorizer stays picklable.
    """
    # `(?a)` restricts `\d` and `\b` to ASCII; the lazy `.*?\b` extends the
    # match from the leading digit to the next ASCII word boundary.
    return re.sub(r'(?a)\b\d.*?\b', '', text.lower())


cv = CountVectorizer(min_df=3,
                     preprocessor=strip_ascii_digit_tokens,
                     tokenizer=word_tokenize,
                     binary=True,
                     token_pattern=None)  # unused when a tokenizer is supplied
cv.fit(documents)

In [193]:
vocab = cv.get_feature_names_out()
print(f'Size of vocabulary: {len(vocab)}')

Size of vocabulary: 21667


In [194]:
print(vocab[-200:])

['ा' 'ाँ' 'ां' 'ांी' 'ां्े' 'ाउनलो' 'ाा' 'ाि' 'ाि्ा' 'ाी' 'ाीि' 'ाु' 'ाू'
 'ाे' 'ाेशबचा' 'ाो' 'ाों' 'ा्' 'ा।' 'ि' 'िं' 'िरएकबाोीसरका' 'िा' 'िि'
 'ििा' 'िी' 'िे' 'िेा' 'ि्' 'ि्षकभर्ी' 'ि्ा' 'ि्ी' 'ी' 'ीं' 'ीं।' 'ीा'
 'ीी' 'ीे' 'ु' 'ुा' 'ुाा' 'ुि' 'ुे' 'ु्ि' 'ू' 'ूँ' 'ूं' 'ूा' 'ूे' 'ू्' 'े'
 'ें' 'ेंे' 'ें्' 'ेहतर' 'ेा' 'ेि' 'ेी' 'ेे' 'ेो' 'े्' 'े।' 'ै' 'ैं'
 'ैंीौीाूं' 'ैलटलाों्रबचा' 'ैा' 'ैि' 'ैे' 'ै।' 'ॉ' 'ो' 'ों' 'ोा' 'ोितआपाा'
 'ोी' 'ोीसरकािे' 'ोीी' 'ोीींु्ेेआइये' 'ोीैोुिै' 'ोे' 'ोों' 'ौ' 'ौा' 'ौीा'
 'ौीाीोै' 'ौीाोै' 'ौीा।' 'ौ्ा' '्' '्ा' '्ाा' '्ि' '्ी' '्ू' '्े' '्ों'
 '््' '।' '।।' 'ি' 'োী' 'ોી' 'ୁଁଲକ୍ାି' 'ாை்ேவகன்' 'ிு்ெ்ி' 'ே்்ு்்'
 'ே்்ோ்' 'ோி' '்' 'ಬಳಸಿ' 'ಾಿ' 'ು್ಿಾಿ' 'ೂು' 'ೇ' 'ೋಿ' 'ೋಿ್ೊ್ೆ' '್' '್ಾ' '್ು'
 '್್' '\u200d' '\u200d♀️' '\u200d♀️\u200d♀️'
 '\u200d♀️\u200d♀️\u200d♀️\u200d♀️' '\u200d♂' '\u200d♂️'
 '\u200d♂️\u200d♂️' '\u200d♂️\u200d♂️\u200d♂️' '–' '—' '―' '‘' '’' '“' '”'
 '•' '…' '‼️' '\u2060' '\u2066' '\u2066\u2066' '₹' '⃣' '℅' '▪️' '▪️rahul'
 '▶️' '►' '●' '☕' '☝️' '☠️

### Using stemming

In [195]:
from nltk.stem import PorterStemmer

patt = re.compile(r'\b[^\d\W]\w+\b')
stemmer = PorterStemmer()

@lru_cache(maxsize=None)
def _stem_token(token):
    """Return the stem of `token`, memoised so each distinct token is stemmed once."""
    return stemmer.stem(token)


def custom_tokenizer(text):
    """Split `text` into tokens with `patt` and return the stem of each token.

    Stemming is the costly part of this tokenizer and the same tokens recur
    across documents, so the per-token work is delegated to a cached helper.

    Note: `patt` and `stemmer` are looked up as notebook globals at call time,
    so this function uses whatever they are bound to when the vectorizer runs.
    """
    return [_stem_token(t) for t in patt.findall(text)]

In [196]:
# Vectorize with the stemming tokenizer defined above. `token_pattern=None`
# is passed because the pattern is not used once a tokenizer is supplied.
stemming_options = {
    'min_df': 3,
    'tokenizer': custom_tokenizer,
    'binary': True,
    'token_pattern': None,
}
cv = CountVectorizer(**stemming_options)
cv.fit(documents)

In [197]:
vectorized = cv.transform(documents[:100])
vectorized

<Compressed Sparse Row sparse matrix of dtype 'int64'
	with 1921 stored elements and shape (100, 15525)>

In [200]:
type(vectorized.toarray())

numpy.ndarray

In [167]:
vocab = cv.get_feature_names_out()
print(f'Size of vocabulary: {len(vocab)}')

Size of vocabulary: 15525


We reduced the vocabulary significantly, but at the cost of a much longer running time.

### Using lemmatization

In [168]:
from nltk.corpus import wordnet as wn

def is_noun(tag):
    """Return True if `tag` is one of the noun POS tags (NN, NNS, NNP, NNPS)."""
    noun_tags = ('NN', 'NNS', 'NNP', 'NNPS')
    return tag in noun_tags

def is_verb(tag):
    """Return True if `tag` is one of the verb POS tags (VB, VBD, VBG, VBN, VBP, VBZ)."""
    verb_tags = ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ')
    return tag in verb_tags

def is_adverb(tag):
    """Return True if `tag` is one of the adverb POS tags (RB, RBR, RBS)."""
    adverb_tags = ('RB', 'RBR', 'RBS')
    return tag in adverb_tags

def is_adjective(tag):
    """Return True if `tag` is one of the adjective POS tags (JJ, JJR, JJS)."""
    adjective_tags = ('JJ', 'JJR', 'JJS')
    return tag in adjective_tags

def penn2wn(tag):
    """Translate a POS tag into the corresponding WordNet POS constant.

    The tag is checked against the adjective, noun, adverb and verb
    predicates, in that order; any tag none of them recognises falls back
    to `wn.NOUN`.
    """
    dispatch = (
        (is_adjective, wn.ADJ),
        (is_noun, wn.NOUN),
        (is_adverb, wn.ADV),
        (is_verb, wn.VERB),
    )
    for matches, wordnet_pos in dispatch:
        if matches(tag):
            return wordnet_pos
    return wn.NOUN

In [169]:
from nltk import pos_tag
from nltk.stem.wordnet import WordNetLemmatizer
lzr = WordNetLemmatizer()
patt = re.compile(r'(?a)\b[^\d\W]\w+\b')

def custom_tokenizer(text):
    """Tokenize `text` with `patt`, POS-tag the tokens and return their lemmas.

    Each token's POS tag is converted with `penn2wn` before being handed to
    the lemmatizer `lzr`.
    """
    tagged_tokens = pos_tag(patt.findall(text))
    lemmas = []
    for token, penn_tag in tagged_tokens:
        lemmas.append(lzr.lemmatize(token, penn2wn(penn_tag)))
    return lemmas

In [170]:
# Vectorize with the lemmatizing tokenizer defined above. `token_pattern=None`
# is passed because the pattern is not used once a tokenizer is supplied.
lemmatizing_options = {
    'min_df': 3,
    'tokenizer': custom_tokenizer,
    'binary': True,
    'token_pattern': None,
}
cv = CountVectorizer(**lemmatizing_options)
cv.fit(documents)

In [171]:
vocab = cv.get_feature_names_out()
print(f'Size of vocabulary: {len(vocab)}')

Size of vocabulary: 17860


In [172]:
vocab

array(['aaa', 'aab', 'aabki', ..., 'zoya', 'zumla', 'zyada'], dtype=object)

# Unicode and emojis

In [269]:
# Code-point ranges whose characters are kept as tokens, alone or in runs.
emoji_ranges = ''.join([
    r'\U0001F600-\U0001F64F',
    r'\U0001F300-\U0001F5FF',
    r'\U0001F680-\U0001F6FF',
    r'\U0001F700-\U0001F77F',
    r'\U0001F900-\U0001F9FF',
    r'\U0001FA70-\U0001FAFF',
    r'\u2600-\u27BF',
])
# A token is either a word of two or more word characters, or a run of one or
# more characters from the ranges above. The group is non-capturing.
patt = r'\b\w{2,}\b|(?:[' + emoji_ranges + r'])+'
cv = CountVectorizer(min_df=3, token_pattern=patt).fit(documents)
vocab = cv.get_feature_names_out()
print(f'Vocabulary size: {len(vocab)}')


Vocabulary size: 22106


In [270]:
print(vocab[-150:])

['yojan' 'yojana' 'yojanas' 'yojna' 'york' 'yorkers' 'you' 'youand' 'youd'
 'youi' 'youjai' 'youjust' 'youkeep' 'youll' 'youmodi' 'young' 'younger'
 'youngest' 'youngster' 'youngsters' 'your' 'youre' 'yours' 'yourself'
 'yourselves' 'youth' 'youthe' 'youthey' 'youths' 'youtube' 'youtuber'
 'youve' 'youwe' 'youyou' 'yov' 'yoy' 'ypu' 'yrs' 'ysr' 'ysrc' 'ysrcp'
 'ystrdy' 'yuck' 'yummy' 'yup' 'yur' 'yuva' 'yuvraj' 'zafar' 'zaid'
 'zakia' 'zakir' 'zameen' 'zameer' 'zara' 'zardari' 'zaroor' 'zarurat'
 'zeal' 'zealand' 'zealots' 'zee' 'zeenews' 'zeher' 'zenith' 'zero' 'zest'
 'zia' 'zilch' 'zimbabwe' 'zindabaad' 'zindabad' 'zionist' 'zombie'
 'zombies' 'zone' 'zones' 'zoom' 'zooms' 'zor' 'zoya' 'zumla' 'zyada'
 'अपन' 'आइय' 'आदम' 'आपक' 'आपन' 'इतन' 'उनल' 'उसक' 'करत' 'करन' 'कहत' 'चमच'
 'तआप' 'तरम' 'पहल' 'बदल' 'बहन' 'रएकब' 'रबच' 'लटल' 'शबच' 'षकभर' 'सकत' 'सबक'
 'सबस' 'समझ' 'सरक' 'हतर' 'வகன' 'ಬಳಸ' '☕' '☝' '☠' '☹' '☺' '☺☺' '☺☺☺' '♀'
 '♂' '♥' '♦' '⚖' '⚛' '⚡' '⚡⚡' '⛳' '✅' '✈' '✊' '✋' '✌' '✌✌' '✌✌✌' '✍

In [275]:
print('\u3380')

㎀
