# Level-1

In [3]:
from nltk import NaiveBayesClassifier
from nltk.corpus import names

# Extract last N letters from the input word
# and that will act as our "feature"
def extract_features(word, N):
    letters = word[:]
    return {'feature': letters.lower()}


In [4]:
if __name__=='__main__':
    # Create training data using labeled names available in NLTK
    male_list = [(name, 'male') for name in names.words('male.txt')]
    female_list = [(name, 'female') for name in names.words('female.txt')]
    data = (male_list + female_list)

    
    # Create test data
    input_names = ['Harry', 'Hermione', 'Kate', 'Ron']

    # Define the number of samples used for train and test
    num_train = int(0.8 * len(data))
    features = [(extract_features(n, 8), gender) for (n, gender) in data]
    train_data, test_data = features[:num_train], features[num_train:]
    classifier = NaiveBayesClassifier.train(train_data)

    # Predict outputs for input names using the trained classifier model
    for name in input_names:
        print(name, '==>', classifier.classify(extract_features(name, 6)))

Harry ==> male
Hermione ==> female
Kate ==> female
Ron ==> male


# Level-2

In [6]:
from nltk.tokenize import sent_tokenize, word_tokenize, WordPunctTokenizer

f=open('category_check.txt','r',errors='ignore')
input_text=f.read()
input_data=sent_tokenize(input_text)



In [7]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import CountVectorizer

# Define the category map
category_map = {'talk.politics.misc': 'Politics', 'rec.autos': 'Autos', 
        'rec.sport.hockey': 'Hockey', 'sci.electronics': 'Artificial Intelligence', 
        'sci.med': 'Medicine'}

# Get the training dataset
training_data = fetch_20newsgroups(subset='train', categories=category_map.keys(), shuffle=True, random_state=5)

# Build a count vectorizer and extract term counts 
count_vectorizer = CountVectorizer()
train_tc = count_vectorizer.fit_transform(training_data.data)

# Create the tf-idf transformer
tfidf = TfidfTransformer()
train_tfidf = tfidf.fit_transform(train_tc)



# Train a Multinomial Naive Bayes classifier
classifier = MultinomialNB().fit(train_tfidf, training_data.target)

# Transform input data using count vectorizer
input_tc = count_vectorizer.transform(input_data)

# Transform vectorized data using tfidf transformer
input_tfidf = tfidf.transform(input_tc)

# Predict the output categories
predictions = classifier.predict(input_tfidf)

# Print the outputs
for sent, category in zip(input_data, predictions):
    print('\nInput:', sent, '\nPredicted category:', category_map[training_data.target_names[category]])


Input: Tesla is offering multiple models for cheaper prices. 
Predicted category: Autos

Input: ChatGPT is an chatbot developed by OpenAI which is built on the ground of Natural Language Processing and Machine Learning. 
Predicted category: Artificial Intelligence

Input: Players need to be careful when they are close to goal posts. 
Predicted category: Hockey

Input: Political debates help us understand the perspectives of both sides. 
Predicted category: Politics

Input: One of the most remarkable use of the newly developed Bioinformatics field is the rapid invention of Covid-19 vaccine. 
Predicted category: Medicine


# Level-3

In [8]:
from nltk.corpus import movie_reviews 
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy as nltk_accuracy
 
    
# Extract features from the input list of words
def extract_features(words):
    return dict([(word, True) for word in words])
 
if __name__=='__main__':
    # Load the reviews from the corpus 
    fileids_pos = movie_reviews.fileids('pos')
    fileids_neg = movie_reviews.fileids('neg')
     
    # Extract the features from the reviews
    features_pos = [(extract_features(movie_reviews.words(fileids=[f])), 'Positive') for f in fileids_pos]
    features_neg = [(extract_features(movie_reviews.words(fileids=[f])), 'Negative') for f in fileids_neg]
     
    # Define the train and test split (80% and 20%)
    threshold = 0.8
    num_pos = int(threshold * len(features_pos))
    num_neg = int(threshold * len(features_neg))
     
     # Create training and training datasets
    features_train = features_pos[:num_pos] + features_neg[:num_neg]
    features_test = features_pos[num_pos:] + features_neg[num_neg:]  

    # Train a Naive Bayes classifier 
    classifier = NaiveBayesClassifier.train(features_train)
    
    input_reviews = [
        'The actors in this movie were great', 
        'This is such an idiotic movie. I will not recommend it to anyone.' 
    ]

    print("\nMovie review predictions:")
    for review in input_reviews:
        print("\nReview:", review)

        # Compute the probabilities
        probabilities = classifier.prob_classify(extract_features(review.split()))

        # Pick the maximum value
        predicted_sentiment = probabilities.max()

        # Print outputs
        print("Predicted sentiment:", predicted_sentiment)
        print("Probability:", round(probabilities.prob(predicted_sentiment), 2))


Movie review predictions:

Review: The actors in this movie were great
Predicted sentiment: Positive
Probability: 0.56

Review: This is such an idiotic movie. I will not recommend it to anyone.
Predicted sentiment: Negative
Probability: 0.87


# Level-4

In [20]:
from nltk.tokenize import RegexpTokenizer  
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from gensim import models, corpora

# Load input data
def load_data(input_file):
    data = []
    with open(input_file, 'r') as f:
        for line in f.readlines():
            data.append(line[:-1])

    return data

# Processor function for tokenizing, removing stop 
# words, and stemming
def process(input_text):
    # Create a regular expression tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Create a Snowball stemmer 
    stemmer = SnowballStemmer('english')

    # Get the list of stop words 
    stop_words = stopwords.words('english')
    
    # Tokenize the input string
    tokens = tokenizer.tokenize(input_text.lower())

    # Remove the stop words 
    tokens = [x for x in tokens if not x in stop_words]
    
    # Perform stemming on the tokenized words 
    tokens_stemmed = [stemmer.stem(x) for x in tokens]

    return tokens_stemmed
    
if __name__=='__main__':
    # Load input data
    data = load_data('movie_review.txt')

    # Create a list for sentence tokens
    tokens = [process(x) for x in data]

    # Create a dictionary based on the sentence tokens 
    dict_tokens = corpora.Dictionary(tokens)
        
    # Create a document-term matrix
    doc_term_mat = [dict_tokens.doc2bow(token) for token in tokens]

    # Define the number of topics for the LDA model

    # Generate the LDA model 
    ldamodel = models.ldamodel.LdaModel(doc_term_mat, 2, id2word=dict_tokens, passes=25)

    num_words = 5
    print('\nTop ' + str(num_words) + ' contributing words to each topic:')
    for item in ldamodel.print_topics(num_topics=2, num_words=num_words):
        print('\nTopic', item[0])

        # Print the contributing words along with their relative contributions 
        list_of_strings = item[1].split(' + ')
        for text in list_of_strings:
            weight = text.split('*')[0]
            word = text.split('*')[1]
            print(word, '==>', str(round(float(weight) * 100, 2)) + '%')


Top 5 contributing words to each topic:

Topic 0
"movi" ==> 7.1%
"amaz" ==> 7.1%
"terribl" ==> 7.1%
"except" ==> 7.1%
"director" ==> 7.1%

Topic 1
"movi" ==> 11.4%
"find" ==> 6.8%
"peopl" ==> 6.8%
"great" ==> 6.8%
"storytel" ==> 6.8%
