In [19]:
from IPython.display import HTML, display
def set_css():
  """Emit a <style> rule that makes <pre> blocks wrap long lines."""
  wrap_rule = HTML('''
  <style>
    pre {
        white-space: pre-wrap;
    }
  </style>
  ''')
  display(wrap_rule)


# Hook the rule into IPython's 'pre_run_cell' event so it is emitted before each cell runs.
get_ipython().events.register('pre_run_cell', set_css)

## 2A. Study of various corpora – Brown, Inaugural, Reuters, UDHR – with methods like fileids, raw, words, sents, categories.

In [20]:
import nltk
from nltk.corpus import brown

nltk.download('brown')

# Display file ids of brown corpus
print('File ids of brown corpus\n', brown.fileids())

# Pick out one of these texts by its file id, 'ca01', and give it a short name, ca01
ca01 = brown.words('ca01')
# Display first few words
print('\nca01 has the following words:\n', ca01[:20])
# Total number of words in ca01
print('\nca01 has', len(ca01), 'words')

# Categories of the brown corpus
print('\n\nCategories or files in brown corpus:\n')
print(brown.categories())

# Per-file statistics: each ratio is truncated to an integer before printing.
#   column 1: characters of raw text per word token
#   column 2: word tokens per sentence
#   column 3: word tokens per distinct lower-cased word
# NOTE(review): brown.raw() presumably returns the tagged source text, which
# would inflate column 1 relative to a true average word length - confirm.
print('\n\nStatistics for each text:\n')
print('AvgWordLen\tAvgSentenceLen\tNo. of Times Each Word Appears On Avg\tFileName')

for fileid in brown.fileids():
    # Load the word view once and reuse it for both the count and the vocabulary.
    file_words = brown.words(fileid)
    num_chars = len(brown.raw(fileid))
    num_words = len(file_words)
    num_sents = len(brown.sents(fileid))
    num_vocab = len({w.lower() for w in file_words})

    print(f"{int(num_chars / num_words)}\t\t\t"
          f"{int(num_words / num_sents)}\t\t\t"
          f"{int(num_words / num_vocab)}\t\t\t"
          f"{fileid}")


[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!


File ids of brown corpus
 ['ca01', 'ca02', 'ca03', 'ca04', 'ca05', 'ca06', 'ca07', 'ca08', 'ca09', 'ca10', 'ca11', 'ca12', 'ca13', 'ca14', 'ca15', 'ca16', 'ca17', 'ca18', 'ca19', 'ca20', 'ca21', 'ca22', 'ca23', 'ca24', 'ca25', 'ca26', 'ca27', 'ca28', 'ca29', 'ca30', 'ca31', 'ca32', 'ca33', 'ca34', 'ca35', 'ca36', 'ca37', 'ca38', 'ca39', 'ca40', 'ca41', 'ca42', 'ca43', 'ca44', 'cb01', 'cb02', 'cb03', 'cb04', 'cb05', 'cb06', 'cb07', 'cb08', 'cb09', 'cb10', 'cb11', 'cb12', 'cb13', 'cb14', 'cb15', 'cb16', 'cb17', 'cb18', 'cb19', 'cb20', 'cb21', 'cb22', 'cb23', 'cb24', 'cb25', 'cb26', 'cb27', 'cc01', 'cc02', 'cc03', 'cc04', 'cc05', 'cc06', 'cc07', 'cc08', 'cc09', 'cc10', 'cc11', 'cc12', 'cc13', 'cc14', 'cc15', 'cc16', 'cc17', 'cd01', 'cd02', 'cd03', 'cd04', 'cd05', 'cd06', 'cd07', 'cd08', 'cd09', 'cd10', 'cd11', 'cd12', 'cd13', 'cd14', 'cd15', 'cd16', 'cd17', 'ce01', 'ce02', 'ce03', 'ce04', 'ce05', 'ce06', 'ce07', 'ce08', 'ce09', 'ce10', 'ce11', 'ce12', 'ce13', 'ce14', 'ce15', 'ce16', 'ce17

---

## 2B. Create and use your own corpora (plaintext, categorical).

In [21]:
import os
import nltk
nltk.download('punkt')
from nltk.corpus import PlaintextCorpusReader

# Set the path to your corpus
corpus_root = '/content/uni'
# The file pattern '.*' is a regular expression that accepts any file id.
filelist = PlaintextCorpusReader(corpus_root, '.*')

# Display file list
print('\nFile list:\n')
print(filelist.fileids())
print(filelist.root)

# Per-file statistics: each ratio is truncated to an integer before printing.
#   column 1: characters of raw text per word token
#   column 2: word tokens per sentence
#   column 3: word tokens per distinct lower-cased word
print('\n\nStatistics for each text:\n')
print('AvgWordLen\tAvgSentenceLen\tNo. of Times Each Word Appears On Avg\tFileName')

for fileid in filelist.fileids():
    # Load the word view once and reuse it for both the count and the vocabulary.
    file_words = filelist.words(fileid)
    num_chars = len(filelist.raw(fileid))
    num_words = len(file_words)
    num_sents = len(filelist.sents(fileid))
    num_vocab = len({w.lower() for w in file_words})

    # A file with no tokens or no sentences would make the ratios below divide
    # by zero; print a placeholder row instead of aborting the whole listing.
    if num_words == 0 or num_sents == 0:
        print(f"-\t\t\t-\t\t\t-\t\t{fileid}")
        continue

    print(f"{int(num_chars / num_words)}\t\t\t"
          f"{int(num_words / num_sents)}\t\t\t"
          f"{int(num_words / num_vocab)}\t\t"
          f"{fileid}")



File list:

['NLP_1A_TTS.py', 'NLP_1B_STT.py', 'nlp_3a.py', 'nlp_3b.py', 'nlp_3c.py']
/content/uni


Statistics for each text:

AvgWordLen	AvgSentenceLen	No. of Times Each Word Appears On Avg	FileName
4			16			2		NLP_1A_TTS.py
5			30			1		NLP_1B_STT.py
4			27			2		nlp_3a.py
4			38			3		nlp_3b.py
5			17			2		nlp_3c.py


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


---

## 2c. Study of conditional frequency distributions

In [22]:
# A ConditionalFreqDist is fed (condition, sample) pairs. These two lists only
# sketch that shape; the trailing `...` is a literal Ellipsis placeholder.
text = ['The', 'Fulton', 'County', 'Grand', 'Jury', 'said', ...]
pairs = [('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ...]

import nltk
from nltk.corpus import brown

nltk.download('inaugural')
nltk.download('udhr')

from nltk.corpus import inaugural, udhr

# Word counts conditioned on genre, over every Brown category.
# `fd` is not read again in this cell.
fd = nltk.ConditionalFreqDist(
    (category, token)
    for category in brown.categories()
    for token in brown.words(categories=category)
)

# The same pairing restricted to two genres, materialised so it can be inspected.
genre_word = [
    (category, token)
    for category in ('news', 'romance')
    for token in brown.words(categories=category)
]

print(len(genre_word), genre_word[:4], genre_word[-4:], sep='\n')

cfd = nltk.ConditionalFreqDist(genre_word)

print(
    cfd,
    cfd.conditions(),
    cfd['news'],
    cfd['romance'],
    list(cfd['romance']),
    sep='\n',
)

# For each target prefix, count matching words per inaugural file, keyed by the
# first four characters of the file id. This distribution is replaced by the
# UDHR one below before anything is displayed from it.
cfd = nltk.ConditionalFreqDist(
    (prefix, speech_id[:4])
    for speech_id in inaugural.fileids()
    for token in inaugural.words(speech_id)
    for prefix in ('america', 'citizen')
    if token.lower().startswith(prefix)
)

languages = [
    'Chickasaw',
    'English',
    'German_Deutsch',
    'Greenlandic_Inuktikut',
    'Hungarian_Magyar',
    'Ibibio_Efik',
]

# Word-length counts conditioned on language.
cfd = nltk.ConditionalFreqDist(
    (language, len(token))
    for language in languages
    for token in udhr.words(f'{language}-Latin1')
)

# Cumulative counts for word lengths 0-9 in two of the languages.
cfd.tabulate(
    conditions=['English', 'German_Deutsch'],
    samples=range(10),
    cumulative=True,
)


[nltk_data] Downloading package inaugural to /root/nltk_data...
[nltk_data]   Package inaugural is already up-to-date!
[nltk_data] Downloading package udhr to /root/nltk_data...
[nltk_data]   Package udhr is already up-to-date!


170576
[('news', 'The'), ('news', 'Fulton'), ('news', 'County'), ('news', 'Grand')]
[('romance', 'afraid'), ('romance', 'not'), ('romance', "''"), ('romance', '.')]
<ConditionalFreqDist with 2 conditions>
['news', 'romance']
<FreqDist with 14394 samples and 100554 outcomes>
<FreqDist with 8452 samples and 70022 outcomes>
                  0    1    2    3    4    5    6    7    8    9 
       English    0  185  525  883  997 1166 1283 1440 1558 1638 
German_Deutsch    0  171  263  614  717  894 1013 1110 1213 1275 


---

## 2d. Study of tagged corpora with methods like tagged_sents, tagged_words.

In [23]:
import nltk
from nltk import tokenize

# Resources requested by this cell.
nltk.download('punkt')
nltk.download('words')

para = "Hello! My name is Ninad Karlekar. Today you'll be learning NLTK."

# Split the paragraph into sentences first...
sents = tokenize.sent_tokenize(para)
print("\nSentence tokenization\n===================\n", sents)

# ...then split every sentence into word tokens, one printed list per sentence.
print("\nWord tokenization\n===================\n")
for index, sentence in enumerate(sents):
    words = tokenize.word_tokenize(sentence)
    print(words)



Sentence tokenization
 ['Hello!', 'My name is Ninad Karlekar.', "Today you'll be learning NLTK."]

Word tokenization

['Hello', '!']
['My', 'name', 'is', 'Ninad', 'Karlekar', '.']
['Today', 'you', "'ll", 'be', 'learning', 'NLTK', '.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


---

## 2e. Write a program to find the most frequent noun tags.

In [24]:
import nltk
from collections import defaultdict

nltk.download('averaged_perceptron_tagger')

text = nltk.word_tokenize("Ninad likes to play football. Ninad does not like to play cricket.")
tagged = nltk.pos_tag(text)
print(tagged)

# Keep only the tokens whose tag is one of the four noun tags checked here.
addNounWords = [word for word, tag in tagged if tag in ('NN', 'NNS', 'NNPS', 'NNP')]

print(addNounWords)

# Count how many times each noun occurs.
temp = defaultdict(int)
for wrd in addNounWords:
    temp[wrd] += 1

# max() raises ValueError on an empty mapping, so handle the no-noun case
# explicitly instead of crashing.
if temp:
    # Getting max frequency (the first noun seen wins a tie)
    res = max(temp, key=temp.get)
    print("Word with maximum frequency : " + str(res))
else:
    res = None
    print("No nouns found in the text")


[('Ninad', 'NNP'), ('likes', 'VBZ'), ('to', 'TO'), ('play', 'VB'), ('football', 'NN'), ('.', '.'), ('Ninad', 'NNP'), ('does', 'VBZ'), ('not', 'RB'), ('like', 'VB'), ('to', 'TO'), ('play', 'VB'), ('cricket', 'NN'), ('.', '.')]
['Ninad', 'football', 'Ninad', 'cricket']
Word with maximum frequency : Ninad


[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


---

## 2f. Map Words to Properties Using Python Dictionaries

In [25]:
# Map each property name to its value, then show the mapping, one lookup,
# its size and its type (one item per output line).
thisdict = dict(brand="Ford", model="Mustang", year=1964)

print(
    thisdict,
    thisdict["brand"],
    len(thisdict),
    type(thisdict),
    sep="\n",
)


{'brand': 'Ford', 'model': 'Mustang', 'year': 1964}
Ford
3
<class 'dict'>


---

## 2g. Find different words from a given plain text without any space by comparing this text with a given corpus of words. Also find the score of words.

In [32]:
from __future__ import with_statement  # with statement for reading file
import re  # Regular expression

words = []  # corpus file words
testword = []  # characters of the text to segment
ans = []  # words matches with corpus

print("MENU")
print("-----------")
print(" 1. Hash tag segmentation")
print(" 2. URL segmentation")
print("Enter the input choice for performing word segmentation:")
try:
    choice = int(input())
except ValueError:
    # Non-numeric input falls through to the "wrong choice" branch below
    # instead of surfacing as an unhandled ValueError.
    choice = None

if choice == 1:
    text = "#whatismyname"  # hash tag test data to segment
    print("Input with HashTag:", text)
    # Raw string: "\w" in a normal literal is an invalid escape sequence.
    pattern = re.compile(r"[^\w']")
    a = pattern.sub('', text)  # drop everything except word characters and apostrophes
elif choice == 2:
    text = "www.whatismyname.com"  # URL test data to segment
    print("Input with URL:", text)
    # Split on whitespace, or on a comma/period that is not adjacent to a digit.
    a = re.split(r'\s|(?<!\d)[,.](?!\d)', text)
    splitwords = ["www", "com", "in"]  # URL parts to discard
    a = "".join([each for each in a if each not in splitwords])
else:
    print("Wrong choice...try again")
    # exit()/quit() are interactive-shell helpers; raising SystemExit stops
    # this cell (or script) right here, so the code below never sees an
    # undefined `a`.
    raise SystemExit

print(a)

testword.extend(a)  # one entry per character of the cleaned text
test_lenth = len(testword)  # length of the test data

# Reading the corpus: one word per line. Blank lines are skipped so the empty
# string never ends up in the corpus.
with open('words.txt', 'r') as f:
    lines = f.readlines()
    words = [e.strip() for e in lines if e.strip()]

def Seg(a, lenth):
    """Return the shortest non-empty prefix of ``a`` that is a corpus word.

    Prefixes ``a[:1]``, ``a[:2]``, ... up to ``lenth`` characters are looked
    up in the module-level ``words`` collection. The first hit is reported on
    stdout and returned, so the shortest matching prefix wins.

    Parameters:
        a: text still to be segmented.
        lenth: maximum prefix length to try.

    Returns:
        The matching prefix, or "" when no non-empty prefix is in the corpus.
    """
    # Start at 1: the empty prefix a[:0] would match a blank corpus entry and
    # be returned as a zero-length "hit", leaving the caller with no progress.
    # Prefixes longer than `a` are identical to `a`, so cap the range there.
    for k in range(1, min(lenth, len(a)) + 1):
        prefix = a[:k]
        if prefix in words:
            print(prefix, "- appears in the corpus")
            return prefix
    return ""

test_tot_itr = 0  # number of input characters consumed so far
answer = []  # corpus words found, in order
Score = 0  # initial value for score (not read again in this cell)
N = 37  # total number of corpus
M = 0  # not read again in this cell
C = 0

# Peel one corpus word at a time off the front of `a` until every character
# has been consumed.
while test_tot_itr < test_lenth:
    ans_words = Seg(a, test_lenth)
    if ans_words == "":
        # No corpus word starts the remaining text. Without this exit nothing
        # would change between iterations and the loop would never finish.
        print("No corpus word matches the remaining text:", a)
        break
    test_itr = len(ans_words)
    answer.append(ans_words)
    a = a[test_itr:]
    test_tot_itr += test_itr

Aft_Seg = " ".join(answer)
# print segmented words in the list
print("Output")
print("---------")
print(Aft_Seg)  # print after segmentation the input

# Calculating Score
C = len(answer)
# N cancels out here, so the score is the number of segmented words as a float.
score = C * N / N
print("Score", score)


MENU
-----------
 1. Hash tag segmentation
 2. URL segmentation
Enter the input choice for performing word segmentation:
2
Input with URL: www.whatismyname.com
whatismyname
what - appears in the corpus
is - appears in the corpus
my - appears in the corpus
name - appears in the corpus
Output
---------
what is my name
Score 4.0
