In [None]:
import nltk

# Fetch every NLTK data package. The cells below rely on the names and
# movie_reviews corpora as well as the tokenizer, POS-tagger and NE-chunker
# models, all of which are covered by the 'all' collection.
nltk.download('all')

# CHAPTER 6

1.	Using the names corpus in NLTK, build a gender classifier that predicts whether a name is male or female based on the last letter of the name. Evaluate its accuracy.



In [None]:
from nltk.corpus import names
import random

# Feature extractor for the baseline classifier: last letter of the name
def gender_features(name):
    """Return the single feature used by the baseline gender classifier.

    Args:
        name: The name to describe, as a string.

    Returns:
        A dict with one key, ``'last_letter'``, holding the final character
        of ``name`` (``''`` when ``name`` is empty).
    """
    # Slice rather than index so an empty name yields '' instead of raising
    # IndexError.
    return {'last_letter': name[-1:]}

# Label every name in the corpus with the gender of the file it came from
names_data = (
    [(name, 'male') for name in names.words('male.txt')]
    + [(name, 'female') for name in names.words('female.txt')]
)

# Shuffle with a fixed seed so the train/test split, and every accuracy that
# is computed from names_data, is the same on each run. A private Random
# instance is used so the global random state is left untouched.
random.Random(42).shuffle(names_data)

# The first 80% of the shuffled names train the model; the rest test it
names_split = int(len(names_data) * 0.8)
train_data = [(gender_features(name), gender) for (name, gender) in names_data[:names_split]]
test_data = [(gender_features(name), gender) for (name, gender) in names_data[names_split:]]

# Train the classifier
classifier = nltk.NaiveBayesClassifier.train(train_data)

# Evaluate the classifier
accuracy = nltk.classify.accuracy(classifier, test_data)
print(f"Accuracy: {accuracy}")

Accuracy: 0.7658904971680303


2.	Enhance the gender classifier by including features such as the first letter and the length of the name. Evaluate if these features improve the classifier's accuracy.


In [None]:
# Enhanced feature extractor: last letter, first letter and name length
def enhanced_gender_features(name):
    """Describe a name by its last letter, first letter and length.

    Args:
        name: The name to describe, as a string.

    Returns:
        A dict with the keys ``'last_letter'``, ``'first_letter'`` and
        ``'length'``. For an empty name both letters are ``''`` and the
        length is ``0``.
    """
    # Slices (not indexes) keep an empty name from raising IndexError.
    return {
        'last_letter': name[-1:],
        'first_letter': name[:1],
        'length': len(name)
    }

# Featurize the same 80/20 cut of names_data with the enhanced extractor
enhanced_cut = int(len(names_data) * 0.8)
train_data_enhanced = [
    (enhanced_gender_features(person), label)
    for person, label in names_data[:enhanced_cut]
]
test_data_enhanced = [
    (enhanced_gender_features(person), label)
    for person, label in names_data[enhanced_cut:]
]

# Fit a Naive Bayes model on the enhanced feature sets
enhanced_classifier = nltk.NaiveBayesClassifier.train(train_data_enhanced)

# Score the model on the held-out names
enhanced_accuracy = nltk.classify.accuracy(
    enhanced_classifier,
    test_data_enhanced,
)
print(f"Enhanced Accuracy: {enhanced_accuracy}")

Enhanced Accuracy: 0.7696664568911264


3.	Using the movie_reviews corpus in NLTK, build a document classifier to categorize movie reviews as positive or negative. Evaluate its performance.


In [None]:
from nltk.corpus import movie_reviews

# Bag-of-words feature extractor
def extract_features(words):
    """Map every distinct token in ``words`` to ``True``.

    Args:
        words: Iterable of tokens for one document.

    Returns:
        A dict whose keys are the distinct tokens and whose values are all
        ``True``.
    """
    return dict.fromkeys(words, True)

# Prepare the dataset
# fileids(category) lists the review files that belong to a category;
# categories() takes file ids as its argument, so it cannot be used to look
# up the reviews for a label.
positive_reviews = movie_reviews.fileids('pos')
negative_reviews = movie_reviews.fileids('neg')

positive_data = [(extract_features(movie_reviews.words(fileid)), 'pos') for fileid in positive_reviews]
negative_data = [(extract_features(movie_reviews.words(fileid)), 'neg') for fileid in negative_reviews]

# Train-test split: the first 80% of each class trains, the rest tests, so
# both sets contain the two labels in the same proportion
positive_split = int(len(positive_data) * 0.8)
negative_split = int(len(negative_data) * 0.8)
train_data = positive_data[:positive_split] + negative_data[:negative_split]
test_data = positive_data[positive_split:] + negative_data[negative_split:]

# Train Naive Bayes classifier
movie_classifier = nltk.NaiveBayesClassifier.train(train_data)

# Evaluate the classifier
movie_accuracy = nltk.classify.accuracy(movie_classifier, test_data)
print(f"Movie Review Classifier Accuracy: {movie_accuracy}")

Movie Review Classifier Accuracy: 0.735


4.	Implement a custom feature extractor for the movie review classifier that considers bigrams (pairs of consecutive words) in addition to unigrams (single words). Evaluate its impact on classification accuracy.


In [None]:
from nltk import bigrams

# Feature extractor with unigrams and bigrams
def extract_bigram_features(words, include_unigrams=True):
    """Build presence features from single words and consecutive word pairs.

    Args:
        words: Iterable of tokens for one document.
        include_unigrams: When true (the default), every distinct token is
            also added as a feature keyed by the token itself. Pass ``False``
            to get bigram features only.

    Returns:
        A dict mapping feature names to ``True``. Bigram features are named
        ``"bigram_<w1>_<w2>"``; unigram features use the bare token.
    """
    # Materialise once: the tokens are traversed for unigrams and for pairs.
    tokens = list(words)
    features = {token: True for token in tokens} if include_unigrams else {}
    # Pair each token with its successor. Plain zip keeps the function
    # self-contained (no reliance on the module-level name `bigrams`).
    for w1, w2 in zip(tokens, tokens[1:]):
        features[f"bigram_{w1}_{w2}"] = True
    return features

# Featurize every review with the bigram extractor. Each category is handled
# separately so every file gets its true label, and the first 80% of each
# category trains while the remaining 20% is held out for testing.
train_data_bigram, test_data_bigram = [], []
for category in ('pos', 'neg'):
    labeled_reviews = [(extract_bigram_features(movie_reviews.words(fileid)), category)
                       for fileid in movie_reviews.fileids(category)]
    split_at = int(len(labeled_reviews) * 0.8)
    train_data_bigram += labeled_reviews[:split_at]
    test_data_bigram += labeled_reviews[split_at:]

# Train Naive Bayes classifier
bigram_classifier = nltk.NaiveBayesClassifier.train(train_data_bigram)

# Evaluate on held-out reviews that were featurized the same way as the
# training data; a test set built with a different extractor would share no
# feature names with the model.
bigram_accuracy = nltk.classify.accuracy(bigram_classifier, test_data_bigram)
print(f"Bigram Classifier Accuracy: {bigram_accuracy}")

Bigram Classifier Accuracy: 0.5


5.	Build a Naive Bayes classifier using the names corpus to predict gender based on both the first and last letters of a name. Evaluate the model's accuracy.


In [None]:
# Feature extractor using both first and last letters
def name_gender_features(name):
    """Describe a name by its first and last letters.

    Args:
        name: The name to describe, as a string.

    Returns:
        A dict with the keys ``'first_letter'`` and ``'last_letter'``; both
        values are ``''`` when ``name`` is empty.
    """
    # Slices (not indexes) keep an empty name from raising IndexError.
    return {'first_letter': name[:1], 'last_letter': name[-1:]}

# Featurize the 80/20 cut of names_data with the first/last-letter extractor
gender_cut = int(len(names_data) * 0.8)
train_data_gender = [
    (name_gender_features(person), label)
    for person, label in names_data[:gender_cut]
]
test_data_gender = [
    (name_gender_features(person), label)
    for person, label in names_data[gender_cut:]
]

# Fit the Naive Bayes model
gender_classifier = nltk.NaiveBayesClassifier.train(train_data_gender)

# Score it on the held-out names
gender_accuracy = nltk.classify.accuracy(
    gender_classifier,
    test_data_gender,
)
print(f"Gender Classifier Accuracy: {gender_accuracy}")

Gender Classifier Accuracy: 0.7696664568911264


6.	Write a Python program using the movie_reviews corpus to identify the 10 most common words in positive reviews.


In [None]:
from nltk.probability import FreqDist

# Lower-case every token from the positive reviews, one file at a time
positive_words = []
for review_id in movie_reviews.fileids('pos'):
    positive_words.extend(token.lower() for token in movie_reviews.words(review_id))

# Count how often each token occurs
fdist = FreqDist(positive_words)

# Show the ten tokens with the highest counts
print(fdist.most_common(10))

[(',', 42448), ('the', 41471), ('.', 33714), ('a', 20196), ('and', 19896), ('of', 18636), ('to', 16517), ("'", 15268), ('is', 14059), ('in', 11725)]


7.	Implement a Naive Bayes classifier to classify movie reviews using only adjectives as features.


In [None]:
# Adjective-only feature extractor
def extract_adjective_features(words):
    """Keep only the tokens that the POS tagger labels as adjectives.

    Args:
        words: Sequence of tokens for one document, passed to
            ``nltk.pos_tag``.

    Returns:
        A dict mapping each token tagged ``JJ``, ``JJR`` or ``JJS`` to
        ``True``.
    """
    adjective_tags = ('JJ', 'JJR', 'JJS')
    features = {}
    for token, tag in nltk.pos_tag(words):
        if tag in adjective_tags:
            features[token] = True
    return features

# Prepare the dataset. Each category is handled separately so every file gets
# its true label, and the first 80% of each category trains while the
# remaining 20% is held out for testing. Every review is POS-tagged here, so
# this cell is slow to run.
train_data_adj, test_data_adj = [], []
for category in ('pos', 'neg'):
    labeled_reviews = [(extract_adjective_features(movie_reviews.words(fileid)), category)
                       for fileid in movie_reviews.fileids(category)]
    split_at = int(len(labeled_reviews) * 0.8)
    train_data_adj += labeled_reviews[:split_at]
    test_data_adj += labeled_reviews[split_at:]

# Train Naive Bayes classifier
adj_classifier = nltk.NaiveBayesClassifier.train(train_data_adj)

# Evaluate on held-out reviews reduced to adjectives, i.e. featurized the same
# way as the training data
adj_accuracy = nltk.classify.accuracy(adj_classifier, test_data_adj)
print(f"Adjective Classifier Accuracy: {adj_accuracy}")

Adjective Classifier Accuracy: 0.5


8.	Write a Python program that builds a gender classifier from the names corpus using NLTK's DecisionTreeClassifier.


In [None]:
from nltk.classify import DecisionTreeClassifier

# Fit a decision tree on the first/last-letter feature sets built for the
# Naive Bayes gender classifier
tree_classifier = DecisionTreeClassifier.train(train_data_gender)

# Measure accuracy on the matching held-out feature sets
tree_accuracy = nltk.classify.accuracy(
    tree_classifier,
    test_data_gender,
)
print(f"Decision Tree Classifier Accuracy: {tree_accuracy}")

Decision Tree Classifier Accuracy: 0.7803650094398993


9.	Write a Python program using NLTK to identify bigrams that are frequently used together in movie reviews.


In [None]:
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures

# Count adjacent word pairs across the whole movie_reviews corpus
bigram_finder = BigramCollocationFinder.from_words(movie_reviews.words())

# Rank the pairs by likelihood ratio and keep the 10 highest scoring ones.
# The result is stored as top_bigrams so it does not overwrite the `bigrams`
# function imported from nltk in an earlier cell.
top_bigrams = bigram_finder.nbest(BigramAssocMeasures.likelihood_ratio, 10)

# Print the top-ranked bigrams
print(top_bigrams)

[("'", 's'), ("'", 't'), (',', 'but'), (',', 'and'), ('of', 'the'), ('the', 'film'), ('it', "'"), ('to', 'be'), ('in', 'the'), ('doesn', "'")]


10.	Write a Python program to train a Naive Bayes classifier on the movie_reviews corpus with features based on word lengths.


In [None]:
# Feature extractor based on word lengths
def extract_word_length_features(words):
    """Summarise a document by the average length of its words.

    Args:
        words: Iterable of tokens for one document.

    Returns:
        A dict with one key, ``'word_length'``, holding the mean number of
        characters per token rounded to one decimal place (``0.0`` for an
        empty document).
    """
    total_chars = 0
    word_count = 0
    for word in words:
        total_chars += len(word)
        word_count += 1
    if word_count == 0:
        return {'word_length': 0.0}
    # Rounded so that documents with similar averages share a feature value
    # instead of each document producing its own unique float.
    return {'word_length': round(total_chars / word_count, 1)}

# Prepare the dataset. Each category is handled separately so every file gets
# its true label, and the first 80% of each category trains while the
# remaining 20% is held out for testing.
train_data_length, test_data_length = [], []
for category in ('pos', 'neg'):
    labeled_reviews = [(extract_word_length_features(movie_reviews.words(fileid)), category)
                       for fileid in movie_reviews.fileids(category)]
    split_at = int(len(labeled_reviews) * 0.8)
    train_data_length += labeled_reviews[:split_at]
    test_data_length += labeled_reviews[split_at:]

# Train Naive Bayes classifier
length_classifier = nltk.NaiveBayesClassifier.train(train_data_length)

# Evaluate on held-out reviews featurized the same way as the training data
length_accuracy = nltk.classify.accuracy(length_classifier, test_data_length)
print(f"Word Length Classifier Accuracy: {length_accuracy}")

Word Length Classifier Accuracy: 0.5


11.	Write a Python program using NLTK to create a frequency distribution of word lengths from movie reviews.


In [None]:
# Collect the character length of every token in the corpus
word_lengths = list(map(len, movie_reviews.words()))

# Tally how often each length occurs
fdist_lengths = FreqDist(word_lengths)

# Show the distribution's summary line
print(fdist_lengths)

<FreqDist with 31 samples and 1583820 outcomes>


12.	Write a Python program to extract all named entities from a given text using NLTK's ne_chunk functionality.

In [None]:
from nltk import word_tokenize, pos_tag, ne_chunk

# Sample text
text = "Barack Obama was born in Hawaii. He is a prominent figure in politics."

# Tokenize, POS tag and chunk the text; ne_chunk returns a tree in which each
# recognised entity is a labelled subtree
tokens = word_tokenize(text)
tags = pos_tag(tokens)
tree = ne_chunk(tags)

# Print the full chunk tree
print(tree)

# Extract the entities themselves. Only chunked entities expose a label;
# ordinary tokens sit in the tree as plain (word, tag) pairs and are skipped.
named_entities = [
    (" ".join(word for word, _tag in chunk.leaves()), chunk.label())
    for chunk in tree
    if hasattr(chunk, 'label')
]

# Print named entities as (text, entity type) pairs
print(named_entities)

(S
  (PERSON Barack/NNP)
  (PERSON Obama/NNP)
  was/VBD
  born/VBN
  in/IN
  (GPE Hawaii/NNP)
  ./.
  He/PRP
  is/VBZ
  a/DT
  prominent/JJ
  figure/NN
  in/IN
  politics/NNS
  ./.)
