In [25]:
import nltk
import numpy as np

#Reading the text file
# readlines() returns one string per line, each still carrying its trailing
# newline. The encoding is passed explicitly so decoding does not depend on the
# platform's locale default.
# NOTE(review): assumes TextFile.txt is UTF-8 (plain ASCII qualifies) - confirm.
with open("TextFile.txt", encoding="utf-8") as file:
    dataset = file.readlines()

## Basic Cleaning

In [40]:
# Strip the line terminator from every line read from the file.
# rstrip("\n") only removes a newline that is actually present, so a final
# line without a trailing newline keeps its last character (slicing with
# [:-1] would silently drop it).
break_removed = [line.rstrip("\n") for line in dataset]

In [41]:
# Display the cleaned lines (one string per line of the input file)
break_removed

['Welcome to Natural Language Processing',
 'It is one of the most exciting research areas as of today',
 'We will see how Python can be used to work with text files']

In [16]:
## Converting everything to lower case
# Builds a new list; break_removed itself is left untouched.
lower = [line.lower() for line in break_removed]

In [17]:
# Display the lower-cased lines
lower

['welcome to natural language processing',
 'it is one of the most exciting research areas as of today',
 'we will see how python can be used to work with text files']

## Tokenization

In [27]:
#Tokenizing the words
# One list of word tokens per lower-cased line, in the same order as `lower`.
tokenized = [nltk.word_tokenize(sentence) for sentence in lower]
tokenized

[['welcome', 'to', 'natural', 'language', 'processing'],
 ['it',
  'is',
  'one',
  'of',
  'the',
  'most',
  'exciting',
  'research',
  'areas',
  'as',
  'of',
  'today'],
 ['we',
  'will',
  'see',
  'how',
  'python',
  'can',
  'be',
  'used',
  'to',
  'work',
  'with',
  'text',
  'files']]

## Stop Words Removal and Stemming

In [29]:
#Loading the English stop-word list (the tokens are filtered later, in the stemming cell)
# The download is issued before stopwords.words() is called so that the corpus
# is present locally when it is read; NLTK logs whether the package was
# fetched or was already up to date.
nltk.download("stopwords")
from nltk.corpus import stopwords
# Kept as a set so the per-token membership checks done later are cheap.
stpwrds = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\vishw\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
#Making stemmer object
from nltk.stem.porter import PorterStemmer
obj = PorterStemmer()

#Removing stop words, then stemming the remaining tokens
# The stop-word test is applied to the original token, before it is stemmed.
final_stemmed = [
    [obj.stem(token) for token in sentence if token not in stpwrds]
    for sentence in tokenized
]

In [35]:
# Display the stemmed tokens that remain for each line after stop-word removal
final_stemmed

[['welcom', 'natur', 'languag', 'process'],
 ['one', 'excit', 'research', 'area', 'today'],
 ['see', 'python', 'use', 'work', 'text', 'file']]

## POS Tagging

In [38]:
# Tag the full token lists (stop words included, unstemmed) with
# nltk.pos_tag; the result holds one list of tagged tokens per line.
pos_tag = [nltk.pos_tag(sentence) for sentence in tokenized]

In [39]:
# Display the (token, tag) pairs produced for each line
pos_tag

[[('welcome', 'NN'),
  ('to', 'TO'),
  ('natural', 'JJ'),
  ('language', 'NN'),
  ('processing', 'NN')],
 [('it', 'PRP'),
  ('is', 'VBZ'),
  ('one', 'CD'),
  ('of', 'IN'),
  ('the', 'DT'),
  ('most', 'RBS'),
  ('exciting', 'JJ'),
  ('research', 'NN'),
  ('areas', 'NNS'),
  ('as', 'IN'),
  ('of', 'IN'),
  ('today', 'NN')],
 [('we', 'PRP'),
  ('will', 'MD'),
  ('see', 'VB'),
  ('how', 'WRB'),
  ('python', 'NN'),
  ('can', 'MD'),
  ('be', 'VB'),
  ('used', 'VBN'),
  ('to', 'TO'),
  ('work', 'VB'),
  ('with', 'IN'),
  ('text', 'JJ'),
  ('files', 'NNS')]]

## Bag of Words

In [49]:
from sklearn.feature_extraction.text import CountVectorizer

# Re-join each line's stemmed tokens into a single space-separated string;
# these strings are what the vectorizer is fitted on.
joined = [' '.join(tokens) for tokens in final_stemmed]
print(joined)

# Bag-of-words counts, with the vocabulary capped at 100 terms.
# .toarray() turns the result into a dense array so it prints as a matrix.
vectorizer = CountVectorizer(max_features=100)
X = vectorizer.fit_transform(joined).toarray()
print(X)

['welcom natur languag process', 'one excit research area today', 'see python use work text file']
[[0 0 0 1 1 0 1 0 0 0 0 0 0 1 0]
 [1 1 0 0 0 1 0 0 1 0 0 1 0 0 0]
 [0 0 1 0 0 0 0 1 0 1 1 0 1 0 1]]
