In [48]:
from gensim.utils import simple_preprocess, lemmatize
# from gensim.models import LdaModel, LdaMulticore
from gensim import corpora
# import gensim.downloader as api

### With Gensim: using the `simple_preprocess()` function to split each message / document into tokens

In [22]:
# Two sample sentences that serve as the toy corpus for the rest of the notebook.
documents = [
    'Long years ago, we made a tryst with destiny; and now the time comes when we shall redeem our pledge, not wholly or in full measure, but very substantially.',
    'At the stroke of the midnight hour, when the world sleeps, India will awake to life and freedom.',
]
# Run gensim's simple_preprocess over every document: one token list per document.
tokenized_list = list(map(simple_preprocess, documents))

In [3]:
documents[0]

'Long years ago, we made a tryst with destiny; and now the time comes when we shall redeem our pledge, not wholly or in full measure, but very substantially.'

In [4]:
documents[1]

'At the stroke of the midnight hour, when the world sleeps, India will awake to life and freedom.'

In [5]:
tokenized_list[0][:8]

['long', 'years', 'ago', 'we', 'made', 'tryst', 'with', 'destiny']

In [6]:
tokenized_list[1][:8]

['at', 'the', 'stroke', 'of', 'the', 'midnight', 'hour', 'when']

In [25]:
import nltk
from nltk.corpus import stopwords

nltk.download('stopwords')  # fetches the corpus on first use; later runs report it as up-to-date

# Start from NLTK's English stop-word list (a plain Python list) and append
# a few extra words that should also be filtered out of the documents.
stop_words = stopwords.words('english') + [
    'com', 'edu', 'would', 'ago', 'could', 'without',
    'also', 'many', 'away', 'shall', 'near', 'must',
]

[nltk_data] Downloading package stopwords to /home/nbuser/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
stop_words

### Without Gensim: using the built-in `split()` method to split each message / document

In [None]:
# Naive whitespace tokenization: case and punctuation are left untouched.
texts = [doc.split() for doc in documents]

In [None]:
texts[0][:5]

In [None]:
tokenized_list[0][:5]

### Remove punctuation and convert all tokens in the text to lowercase

In [39]:
import string
'Rahul'.lower(), 'Names: Rahul, Nancy;'.translate(str.maketrans('', '', string.punctuation))

('rahul', 'Names Rahul Nancy')

In [43]:
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [40]:
def nopunc(mystr):
    """Return ``mystr`` with every character in ``string.punctuation`` removed.

    Punctuation is deleted, not replaced by a space, so characters on either
    side of a punctuation mark end up adjacent (e.g. 'a-b' becomes 'ab').
    """
    # Translation table mapping each punctuation code point to None (= delete).
    deletion_table = dict.fromkeys(map(ord, string.punctuation))
    return mystr.translate(deletion_table)

In [37]:
# texts = [[text.lower() for text in doc.split() if text.lower() not in stop_words] for doc in documents]

In [38]:
# texts[0][:5], texts[1][:5]

(['long', 'years', 'ago,', 'made', 'tryst'],
 ['stroke', 'midnight', 'hour,', 'world', 'sleeps,'])

In [41]:
# Tokenize every document: strip punctuation, split on whitespace, lowercase,
# and drop any word that appears in stop_words.
texts = [
    [word for word in map(str.lower, nopunc(doc).split()) if word not in stop_words]
    for doc in documents
]

In [47]:
texts[0][:5], texts[1][:5]

(['long', 'years', 'made', 'tryst', 'destiny'],
 ['stroke', 'midnight', 'hour', 'world', 'sleeps'])

In [50]:
dct = corpora.Dictionary(texts)

In [53]:
dct.token2id

{'comes': 0,
 'destiny': 1,
 'full': 2,
 'long': 3,
 'made': 4,
 'measure': 5,
 'pledge': 6,
 'redeem': 7,
 'substantially': 8,
 'time': 9,
 'tryst': 10,
 'wholly': 11,
 'years': 12,
 'awake': 13,
 'freedom': 14,
 'hour': 15,
 'india': 16,
 'life': 17,
 'midnight': 18,
 'sleeps': 19,
 'stroke': 20,
 'world': 21}

In [130]:
# Convert each token list into a bag-of-words: a list of (token_id, count) pairs
# looked up in the dictionary `dct`.
corpus = list(map(dct.doc2bow, texts))

In [131]:
corpus

[[(0, 1),
  (1, 1),
  (2, 1),
  (3, 1),
  (4, 1),
  (5, 1),
  (6, 1),
  (7, 1),
  (8, 1),
  (9, 1),
  (10, 1),
  (11, 1),
  (12, 1)],
 [(13, 1),
  (14, 1),
  (15, 1),
  (16, 1),
  (17, 1),
  (18, 1),
  (19, 1),
  (20, 1),
  (21, 1)]]

In [58]:
dct.doc2idx(texts[0])

[3, 12, 4, 10, 1, 9, 0, 7, 6, 11, 2, 5, 8]

### Using scikit-learn's `CountVectorizer`

In [56]:
from sklearn.feature_extraction.text import CountVectorizer

In [114]:
def tokenize(mess):
    """Turn one raw message into a list of cleaned tokens.

    Punctuation is stripped with ``nopunc``, the text is split on whitespace,
    every word is lowercased, and words found in ``stop_words`` are dropped.
    """
    lowered_words = (word.lower() for word in nopunc(mess).split())
    return [word for word in lowered_words if word not in stop_words]

In [99]:
# list1 = []
# for doc in texts:
#     for text in doc:
#         if text not in list1:
#             list1.append(text) 
# list1

In [117]:
tokenize('India is a great country; it has lovely people')

['india', 'great', 'country', 'lovely', 'people']

In [118]:
nopunc('India is a great country; it has lovely people')

'India is a great country it has lovely people'

In [104]:
import pandas as pd

In [124]:
my_corpus = pd.DataFrame({'tokens':texts, 'message':documents})
my_corpus

Unnamed: 0,tokens,message
0,"[long, years, made, tryst, destiny, time, come...","Long years ago, we made a tryst with destiny; ..."
1,"[stroke, midnight, hour, world, sleeps, india,...","At the stroke of the midnight hour, when the w..."


In [121]:
# CountVectorizer is given a callable analyzer (our tokenize function), so it
# calls tokenize on each raw message instead of using its built-in tokenization.
# It is fitted on the untokenized 'message' column of the DataFrame.
z1 = CountVectorizer(analyzer=tokenize)
z1.fit(my_corpus['message'])
# A default-configured vectorizer, displayed as the cell output.
CountVectorizer()

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)

In [68]:
z1

CountVectorizer(analyzer=<function tokenize at 0x7f4a631edd90>, binary=False,
        decode_error='strict', dtype=<class 'numpy.int64'>,
        encoding='utf-8', input='content', lowercase=True, max_df=1.0,
        max_features=None, min_df=1, ngram_range=(1, 1), preprocessor=None,
        stop_words=None, strip_accents=None,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, vocabulary=None)

In [122]:
# Vocabulary terms ordered by their column index. Derived from vocabulary_ because
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and removed
# in 1.2; this form returns the same plain list on every version.
sorted(z1.vocabulary_, key=z1.vocabulary_.get)

['awake',
 'comes',
 'destiny',
 'freedom',
 'full',
 'hour',
 'india',
 'life',
 'long',
 'made',
 'measure',
 'midnight',
 'pledge',
 'redeem',
 'sleeps',
 'stroke',
 'substantially',
 'time',
 'tryst',
 'wholly',
 'world',
 'years']

In [123]:
z1.vocabulary_

{'long': 8,
 'years': 21,
 'made': 9,
 'tryst': 18,
 'destiny': 2,
 'time': 17,
 'comes': 1,
 'redeem': 13,
 'pledge': 12,
 'wholly': 19,
 'full': 4,
 'measure': 10,
 'substantially': 16,
 'stroke': 15,
 'midnight': 11,
 'hour': 5,
 'world': 20,
 'sleeps': 14,
 'india': 6,
 'awake': 0,
 'life': 7,
 'freedom': 3}

In [125]:
messages_bow = z1.transform(my_corpus['message'])

In [129]:
type(messages_bow), messages_bow.toarray()

(scipy.sparse.csr.csr_matrix,
 array([[0, 1, 1, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1],
        [1, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 0]]))

In [136]:
# One row per message, one column per vocabulary term, cells hold term counts.
# Column labels are the vocabulary terms sorted by column index; they are taken
# from vocabulary_ because CountVectorizer.get_feature_names() was deprecated in
# scikit-learn 1.0 and removed in 1.2.
new_df = pd.DataFrame(
    data=messages_bow.toarray(),
    columns=sorted(z1.vocabulary_, key=z1.vocabulary_.get),
)

In [137]:
new_df

Unnamed: 0,awake,comes,destiny,freedom,full,hour,india,life,long,made,...,pledge,redeem,sleeps,stroke,substantially,time,tryst,wholly,world,years
0,0,1,1,0,1,0,0,0,1,1,...,1,1,0,0,1,1,1,1,0,1
1,1,0,0,1,0,1,1,1,0,0,...,0,0,1,1,0,0,0,0,1,0
