In [None]:
import os
import re

In [None]:
import nltk
# Fetch the NLTK resources needed further down for tokenising,
# stop-word removal and POS tagging.
for resource in ('punkt', 'stopwords', 'averaged_perceptron_tagger'):
    nltk.download(resource)
from nltk.corpus import stopwords
from nltk import word_tokenize
from nltk.util import ngrams
from nltk.util import bigrams
from nltk.collocations import *

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [None]:
import pandas as pd
import numpy as np

In [None]:
from wordcloud import WordCloud
import matplotlib.pyplot as plt

### Read the text

In [None]:
# Read the whole file into a single string.
# The encoding is passed explicitly so the decoded text does not depend on
# the platform's default locale encoding.
with open('/content/jd.txt', encoding='utf-8') as f:
    data = f.read()

### Convert the text to lowercase 

In [None]:
# Lower-case the entire text. `data` is overwritten, so the original casing
# is no longer available after this cell.
data = data.lower()

### Finding the total number of tokens (words and punctuation marks) in the text

In [None]:
# Split the lower-cased text into tokens.
tokens = word_tokenize(data)

In [None]:
# Number of tokens produced by the tokenizer.
print(len(tokens))

360


### Removing Stop-words

In [None]:
# English stop-word list, stored as a set for the `not in stop` membership
# test used when filtering the tokens.
stop = set(stopwords.words('english'))

In [None]:
# Empty container for the tokens that survive stop-word removal.
token_list1 = list()

In [None]:
# Keep only the tokens that are not stop-words.
# The list is built in a single expression rather than appended to a list
# created in another cell, so re-running this cell always yields the same
# result instead of duplicating every entry.
token_list1 = [token for token in tokens if token not in stop]

In [None]:
# Number of tokens left after stop-word removal.
print(len(token_list1))

297


### Removing punctuation

In [None]:
# Characters stripped from every token: common punctuation marks, square
# brackets and digits. Square brackets are part of the class because a lone
# "[" token otherwise survives the clean-up and is later POS-tagged and
# counted as if it were a word.
punctuation = re.compile(r'[-.?!,":;()\[\]|0-9]')

In [None]:
# Empty container for the tokens that remain after punctuation removal.
token_list2 = list()

In [None]:
# Strip the characters matched by `punctuation` from every token and keep
# only the tokens that are still non-empty afterwards.
# Building the list in one expression makes the cell safe to re-run: the
# result no longer grows each time the cell is executed.
token_list2 = [
    word
    for word in (punctuation.sub("", token) for token in token_list1)
    if word
]

In [None]:
# Number of non-empty tokens left after stripping punctuation and digits.
print(len(token_list2))

240


### Finding POS tags of words

In [None]:
# POS-tag the cleaned tokens; each entry of the result is a (word, tag) pair.
# NOTE(review): the tagger sees lower-cased text with stop-words and
# punctuation already removed. The output further down shows verbs such as
# "manages" tagged NNS, presumably due to the missing context - consider
# tagging the raw tokens first and filtering afterwards.
tokens_pos_tag = nltk.pos_tag(token_list2)

In [None]:
# One row per tagged token: the token itself and its POS tag.
pos_df = pd.DataFrame(data=tokens_pos_tag, columns=['word', 'POS'])

In [None]:
# Preview the first rows of the word/POS table.
pos_df.head()

Unnamed: 0,word,POS
0,data,NNS
1,analyst,NN
2,job,NN
3,responsibilities,NNS
4,develops,VBZ


In [None]:
# Count the words per POS tag; `as_index=False` keeps POS as a regular column.
pos_sum = (
    pos_df
    .groupby('POS', as_index=False)
    .count()
)

In [None]:
# Rank the POS tags from the most to the least frequent.
pos_sum.sort_values(by='word', ascending=False)

Unnamed: 0,POS,word
4,NN,93
6,NNS,60
2,JJ,29
9,VBG,17
11,VBP,15
12,VBZ,9
7,RB,5
8,VBD,4
10,VBN,3
5,NNP,2


### Keeping only the nouns

In [None]:
# Empty container for the (word, tag) pairs that are nouns.
filtered_pos = list()

In [None]:
# Keep only the (word, tag) pairs whose tag is one of the four noun tags.
# The list is rebuilt from scratch on every run, so re-executing this cell
# cannot duplicate entries the way appending to an existing list would.
filtered_pos = [
    pair for pair in tokens_pos_tag
    if pair[1] in ('NN', 'NNS', 'NNP', 'NNPS')
]


In [None]:
# Number of noun tokens that were kept.
print(len(filtered_pos))

155


### Finding the most frequent nouns

In [None]:
# Count how often each (word, tag) pair occurs among the nouns. The tag is
# part of the key, so the same word carrying two different tags is counted
# as two separate entries.
fdist_pos = nltk.FreqDist(filtered_pos)

In [None]:
# Up to 100 of the most frequent (word, tag) pairs together with their counts.
top_100_words = fdist_pos.most_common(100)

In [None]:
# Show the raw list of ((word, tag), count) entries.
print(top_100_words)

[(('data', 'NNS'), 9), (('business', 'NN'), 4), (('analyst', 'NN'), 3), (('job', 'NN'), 3), (('models', 'NNS'), 3), (('hours', 'NNS'), 3), (('benefits', 'NNS'), 3), (('education', 'NN'), 3), (('knowledge', 'NN'), 3), (('analysis', 'NN'), 3), (('process', 'NN'), 3), (('experience', 'NN'), 3), (('management', 'NN'), 3), (('information', 'NN'), 2), (('support', 'NN'), 2), (('decision', 'NN'), 2), (('systems', 'NNS'), 2), (('analyses', 'NNS'), 2), (('analysts', 'NNS'), 2), (('skills', 'NNS'), 2), (('software', 'NN'), 2), (('proficiency', 'NN'), 2), (('skill', 'NN'), 2), (('communication', 'NN'), 2), (('group', 'NN'), 2), (('responsibilities', 'NNS'), 1), (('manages', 'NNS'), 1), (('making', 'NN'), 1), (('performs', 'NNS'), 1), (('facilitates', 'NNS'), 1), (('trains', 'NNS'), 1), (('issues', 'NNS'), 1), (('risk', 'NN'), 1), (('assessment', 'NN'), 1), (('processes', 'NNS'), 1), (('services', 'NNS'), 1), (('leadership', 'NN'), 1), (('training', 'NN'), 1), (('methodology', 'NN'), 1), (('priori

In [None]:
# Tabulate the frequency list: the (word, tag) tuple in one column, its count in the other.
top_words_df = pd.DataFrame(data=top_100_words, columns=['pos', 'count'])

In [None]:
# Preview the frequency table while it still holds the (word, tag) tuples.
top_words_df.head()

Unnamed: 0,pos,count
0,"(data, NNS)",9
1,"(business, NN)",4
2,"(analyst, NN)",3
3,"(job, NN)",3
4,"(models, NNS)",3


In [None]:
# Pull the word (first element) out of each (word, tag) tuple into its own column.
top_words_df['Word'] = top_words_df['pos'].map(lambda pos_pair: pos_pair[0])

In [None]:
# Drop the tuple column now that the word has been extracted.
# The column is named via `columns=`: passing the axis positionally
# (`drop('pos', 1)`) is rejected by pandas 2.0 and later.
top_words_df = top_words_df.drop(columns='pos')

In [None]:
# Preview the table after replacing the tuple column with the plain word.
top_words_df.head()

Unnamed: 0,count,Word
0,9,data
1,4,business
2,3,analyst
3,3,job
4,3,models


In [None]:
# Put the columns in (Word, count) order.
subset_pos = top_words_df.loc[:, ['Word', 'count']]

In [None]:
# Turn every table row into a (word, count) tuple.
tuples_pos = list(map(tuple, subset_pos.values))

In [None]:
# Show the final list of (word, count) tuples.
print(tuples_pos)

[('data', 9), ('business', 4), ('analyst', 3), ('job', 3), ('models', 3), ('hours', 3), ('benefits', 3), ('education', 3), ('knowledge', 3), ('analysis', 3), ('process', 3), ('experience', 3), ('management', 3), ('information', 2), ('support', 2), ('decision', 2), ('systems', 2), ('analyses', 2), ('analysts', 2), ('skills', 2), ('software', 2), ('proficiency', 2), ('skill', 2), ('communication', 2), ('group', 2), ('responsibilities', 1), ('manages', 1), ('making', 1), ('performs', 1), ('facilitates', 1), ('trains', 1), ('issues', 1), ('risk', 1), ('assessment', 1), ('processes', 1), ('services', 1), ('leadership', 1), ('training', 1), ('methodology', 1), ('prioritization', 1), ('projects', 1), ('problems', 1), ('determines', 1), ('causes', 1), ('defines', 1), ('company', 1), ('assets', 1), ('hivesql', 1), ('jobs', 1), ('works', 1), ('units', 1), ('engineering', 1), ('teams', 1), ('strategy', 1), ('term', 1), ('platform', 1), ('architecture', 1), ('[', 1), ('work', 1), ('seekers', 1), (