# POS Tagging

In [1]:
#Importing the libraries

import numpy as np
import sklearn 
import nltk
import pandas as pd
from matplotlib import pyplot as plt
from nltk.corpus import treebank
# Download the required dataset
nltk.download('treebank')

[nltk_data] Downloading package treebank to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package treebank is already up-to-date!


True

In [2]:
#Finding the tagged sentences from the treebank dataset
treebank_sents = treebank.tagged_sents()

In [3]:
tags = [tag for sentence in treebank_sents for _, tag in sentence]
unique_tags = set(tags)
len(unique_tags)

46

In [4]:
words = [tag for sentence in treebank_sents for tag, _ in sentence]
unique_words = set(words)
len(unique_words)

12408

In [5]:
# Provided set of POS tags
pos_tags = unique_tags.copy()

# Initialize dictionaries to categorize the tags
categories = {
    'NN': {'NN', 'NNP', 'NNPS', 'NNS'},
    'PRP': {'PRP', 'PRP$'},
    'VB': {'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'MD'},
    'ADJ': {'JJ', 'JJR', 'JJS'},
    'ADV': {'RB', 'RBR', 'RBS'},
    'PP': {'IN'},
    'CC': {'CC'},
    'IJ': {'UH'},
    'OTH': {'#', '$', "''", ',', '-LRB-', '-NONE-', '-RRB-', ':','.', 'EX', 'FW', 'LS', 'CD', 'PDT', 'POS', 'RP', 'SYM', 'TO', '``', 'WDT', 'WP', 'WP$', 'WRB'},
}

# Initialize dictionaries to store categorized tags
categorized_tags = {category: set() for category in categories.keys()}

# Categorize the tags
for tag in pos_tags:
    for category, tag_set in categories.items():
        if tag in tag_set:
            categorized_tags[category].add(tag)

# Print the categorized tags
for category, tags in categorized_tags.items():
    print(f'{category} Tags:', tags)


NN Tags: {'NNS', 'NN', 'NNPS', 'NNP'}
PRP Tags: {'PRP', 'PRP$'}
VB Tags: {'VBP', 'VBN', 'MD', 'VBD', 'VB', 'VBG', 'VBZ'}
ADJ Tags: {'JJR', 'JJ', 'JJS'}
ADV Tags: {'RB', 'RBS', 'RBR'}
PP Tags: {'IN'}
CC Tags: {'CC'}
IJ Tags: {'UH'}
OTH Tags: {'$', 'SYM', '-LRB-', 'CD', 'TO', '-RRB-', 'PDT', ':', ',', 'WRB', '``', '-NONE-', 'LS', '#', 'WP$', 'RP', 'WP', 'FW', 'POS', '.', 'EX', 'WDT', "''"}


In [6]:
#Changing original tags to custom tags
new_treebank_sents = []
for i, sentence in enumerate(treebank_sents):
    words_and_changed_pos = [tuple(['', 'SOS'])]
    p=0
    for j, word in enumerate(sentence):
        p=0

        for key, value in categories.items():
            if word[1] in value:
                words_and_changed_pos.append(tuple([word[0], key]))
                p=1
        
        if(p==0):
            words_and_changed_pos.append(tuple([word[0], 'DIF']))
    words_and_changed_pos.append(tuple(['', 'EOS']))
    new_treebank_sents.append(words_and_changed_pos)

In [7]:
#dictionary to keep track of counts of tags
final_tags = {
    'SOS': 0,
    'NN': 0,
    'PRP': 0,
    'VB': 0,
    'ADJ': 0,
    'ADV': 0,
    'PP': 0,
    'CC': 0,
    'IJ': 0,
    'OTH': 0,
    'DIF': 0,
    'EOS': 0,
}

for i, sentence in enumerate(new_treebank_sents):
    for j, word in enumerate(sentence):
        final_tags[word[1]]+=1;

In [8]:
final_tags

{'SOS': 3914,
 'NN': 28867,
 'PRP': 2482,
 'VB': 13564,
 'ADJ': 6397,
 'ADV': 2993,
 'PP': 9857,
 'CC': 2265,
 'IJ': 3,
 'OTH': 26083,
 'DIF': 8165,
 'EOS': 3914}

In [9]:
dict_to_list = ['SOS',
    'NN',
    'PRP',
    'VB',
    'ADJ',
    'ADV',
    'PP',
    'CC',
    'IJ',
    'OTH',
    'DIF',
    'EOS']

In [10]:
emission_probs = {}
transition_probs = {}

In [11]:
def transition_probability(tag1, tag2):
    if(tuple[tag1, tag2] in transition_probs):          #if already t2->t2 is present return it
        return transition_probs[tuple([tag1, tag2])]
    else:    
        count=0
        for i, sentence in enumerate(new_treebank_sents):    #else go through the treebank dataset and find transition t1->t2
            for j, word in enumerate(sentence):
                if(j>0):
                    if(new_treebank_sents[i][j-1][1]==tag1 and new_treebank_sents[i][j][1]==tag2):
                        count+=1
        transition_probs[tuple([tag1, tag2])] = count/final_tags[tag1]    #No smoothing needed since all combinations are already available
        return transition_probs[tuple([tag1, tag2])]

In [12]:
def emmision_probability(word_input, tag1, tag2='SOS', m=1, n=3):  
    if(tuple[word_input, tag1] in emission_probs):        #if already w->t is present return it
        return emission_probs[tuple[word_input, tag1]]
    else:
        count=0
        for i, sentence in enumerate(new_treebank_sents):
            for j, word in enumerate(sentence):
                if(word[0]==word_input and word[1]==tag1):
                    count+=1                                   #counting if (w,t) is present in corpus
        if(tag1!='SOS' and (tag1=='NN' or tag1=='VB' or tag1=='ADJ' or tag1=='ADV')):
            # Smoothing for the probabilities
            # m is for counting number of Nouns, adjective, adverb and Verb since new words will be mostly from these 4 tags
            # n is for counting probability of bigram transition, e.g chance of VB coming after noun is high, so emission probability should also be high
            if(tag2!='SOS'):
                emission_probs[tuple[word_input, tag1]] = (count+m*final_tags[tag1]/(final_tags['NN']+final_tags['VB']+final_tags['ADJ']+final_tags['ADV'])+n*transition_probability(tag2, tag1))/(final_tags[tag1]+m+n)
            else:
                n=0
                emission_probs[tuple[word_input, tag1]] = (count+m*final_tags[tag1]/(final_tags['NN']+final_tags['VB']+final_tags['ADJ']+final_tags['ADV'])+n*transition_probability(tag2, tag1))/(final_tags[tag1]+m+n)
        else:
            m=0
            n=0
            emission_probs[tuple[word_input, tag1]] = count/final_tags[tag1]
        
        return emission_probs[tuple[word_input, tag1]]

In [13]:
import time
start_time = time.time()

In [453]:
'''def viterbi_algorithm(tokens):
    N = len(tokens)
    Ns = len(final_tags)
    
    path = []
    prev_row = []
    prev_num = []
    for i in range(0,N):
        arg_max_term='SOS'
        max_term=1
        start_time = time.time()
        if(len(prev_row)):
            max_y = prev_row[0]
            arg_mac = 0
            b=0
            for term in prev_row:
                if(term>max_y):
                    max_y = term
                    arg_mac = prev_num[b]
                b+=1
            arg_max_term = dict_to_list[arg_mac]
            max_term = max_y
            prev_num=[]
            prev_row=[]
        else:
            arg_max_term = dict_to_list[0]
            max_term = 1
        #arg_max_term = dict_to_list[np.argmax(matrix[i-1][:])]
        
        for j in range(Ns):
            
            if(i>0 and j>0):
                max_term_before = max_term
                p = max_term*emmision_probability(tokens[i], dict_to_list[j], arg_max_term)*transition_probability(arg_max_term, dict_to_list[j])
                if(p>0):
                    prev_row.append(p)
                    prev_num.append(j)
        end_time = time.time()
        elapsed_time = end_time - start_time
        print(f"Elapsed time: {elapsed_time:.2f} seconds")
        if arg_max_term!='SOS' and arg_max_term!='EOS':
            path.append(arg_max_term)  
    path.insert(0, 'SOS')
    path.append('EOS')
    return path, p'''

'def viterbi_algorithm(tokens):\n    N = len(tokens)\n    Ns = len(final_tags)\n    \n    path = []\n    prev_row = []\n    prev_num = []\n    for i in range(0,N):\n        arg_max_term=\'SOS\'\n        max_term=1\n        start_time = time.time()\n        if(len(prev_row)):\n            max_y = prev_row[0]\n            arg_mac = 0\n            b=0\n            for term in prev_row:\n                if(term>max_y):\n                    max_y = term\n                    arg_mac = prev_num[b]\n                b+=1\n            arg_max_term = dict_to_list[arg_mac]\n            max_term = max_y\n            prev_num=[]\n            prev_row=[]\n        else:\n            arg_max_term = dict_to_list[0]\n            max_term = 1\n        #arg_max_term = dict_to_list[np.argmax(matrix[i-1][:])]\n        \n        for j in range(Ns):\n            \n            if(i>0 and j>0):\n                max_term_before = max_term\n                p = max_term*emmision_probability(tokens[i], dict_to_list[

In [14]:
def viterbi_algorithm(tokens):
    N = len(tokens)
    Ns = len(final_tags)
    matrix = np.zeros((N, Ns))
    matrix[0][0] = 1
    path = []
    for i in range(N):
        arg_max_term = dict_to_list[np.argmax(matrix[i-1][:])]
        for j in range(Ns):
            if(i>0 and j>0):
                max_term_before = max(matrix[i-1][:])
                matrix[i][j] = max(matrix[i-1][:])*emmision_probability(tokens[i], dict_to_list[j], dict_to_list[j-1])*transition_probability(dict_to_list[np.argmax(matrix[i-1][:])], dict_to_list[j])
        if arg_max_term!='SOS' and arg_max_term!='EOS':
            path.append(arg_max_term)  
    path.insert(0, 'SOS')
    path.append('EOS')
    return path, matrix

In [18]:
from nltk.tokenize import word_tokenize
sentence = "The quick brown fox jumps over the lazy dog. They race through the lush forest, chasing dreams of adventure beneath the starry night sky, filled with wonder and endless possibilities."
#sentence = "I am a boy"
#sentence = positive_sentences[0]
tokens = word_tokenize(sentence)[:200]
tokens.append('')
tokens.insert(0, '')


In [19]:
path1, matrix = viterbi_algorithm(tokens)
[(tokens[i], path1[i]) for i in range(len(tokens))]

[('', 'SOS'),
 ('The', 'DIF'),
 ('quick', 'ADJ'),
 ('brown', 'NN'),
 ('fox', 'VB'),
 ('jumps', 'VB'),
 ('over', 'PP'),
 ('the', 'DIF'),
 ('lazy', 'NN'),
 ('dog', 'VB'),
 ('.', 'OTH'),
 ('They', 'PRP'),
 ('race', 'VB'),
 ('through', 'PP'),
 ('the', 'DIF'),
 ('lush', 'NN'),
 ('forest', 'VB'),
 (',', 'OTH'),
 ('chasing', 'VB'),
 ('dreams', 'VB'),
 ('of', 'PP'),
 ('adventure', 'NN'),
 ('beneath', 'VB'),
 ('the', 'DIF'),
 ('starry', 'NN'),
 ('night', 'NN'),
 ('sky', 'VB'),
 (',', 'OTH'),
 ('filled', 'VB'),
 ('with', 'PP'),
 ('wonder', 'NN'),
 ('and', 'CC'),
 ('endless', 'ADJ'),
 ('possibilities', 'NN'),
 ('.', 'OTH'),
 ('', 'EOS')]

In [410]:
path1

['SOS',
 'DIF',
 'VB',
 'VB',
 'PP',
 'DIF',
 'OTH',
 'VB',
 'DIF',
 'CC',
 'ADJ',
 'PP',
 'DIF',
 'OTH',
 'EOS']

# Vanilla Sentiment Classifier

In [114]:
import nltk
from nltk.corpus import movie_reviews
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
nltk.download('movie_reviews')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

[nltk_data] Downloading package movie_reviews to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!


In [115]:
positive_fileids = movie_reviews.fileids('pos')
negative_fileids = movie_reviews.fileids('neg')

def get_sentences(file_ids):
    sentences = []
    for file_id in file_ids:
        
        review_text = list(movie_reviews.words(file_id))
        text = " ".join(review_text)
        sentences.append(text)
    return sentences

positive_sentences = get_sentences(positive_fileids)   # sentences with positive reviews
negative_sentences = get_sentences(negative_fileids)   # sentences with negative reviews

In [48]:
import random

# Initialize an empty list to store the corpus
corpus = []

# Assuming you have two lists: positive_sentences and negative_sentences
# Add positive sentences to the corpus with a label of 1 (indicating positive sentiment)
for sentence in positive_sentences:
    corpus.append(tuple([sentence, 1]))

# Add negative sentences to the corpus with a label of 0 (indicating negative sentiment)
for sentence in negative_sentences:
    corpus.append(tuple([sentence, 0]))

# Shuffle the corpus to randomize the order of sentences
random.shuffle(corpus)

# The 'corpus' now contains sentence-label pairs in the form (sentence, review)


In [49]:
# Import necessary libraries and download required resources
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from stop_words import get_stop_words

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Sample sentence
corpus_new = []

# Preprocessing step 1: Remove stopwords
for sentence, val in corpus:
    # Tokenize the sentence
    words = word_tokenize(sentence)

    # Get the list of English stopwords
    stop_words = list(get_stop_words('en'))         # About 900 stopwords
    nltk_words = list(stopwords.words('english'))   # About 150 stopwords
    stop_words.extend(nltk_words)

    # Filter out stopwords from the sentence
    filtered_words = [word for word in words if word.lower() not in stop_words]
    filtered_sentence = ' '.join(filtered_words)
    corpus_new.append(tuple([filtered_sentence, val]))

# Initialize a new corpus for stemmed words
new_corpus = []

# Preprocessing step 2: Stemming
for sentence, val in corpus_new:
    # Tokenize the sentence
    words = word_tokenize(sentence)
    
    # Create a Porter stemmer instance
    stemmer = PorterStemmer()
    
    # Apply stemming to each word in the sentence
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a sentence
    stemmed_sentence = ' '.join(stemmed_words)
    
    # Append the stemmed sentence and its label to the new corpus
    new_corpus.append(tuple([stemmed_sentence, val]))


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [50]:
list_sentences = [sentence for sentence,_ in new_corpus]

In [51]:
# Import the necessary library
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'list_sentences' contains your list of sentences and 'new_corpus' contains your labeled data

# Create a copy of the 'list_sentences' (assuming it's a list of sentences)
sentences = list_sentences.copy()

# Initialize a TfidfVectorizer with a maximum of 2000 features (words or terms)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the sentences using the TfidfVectorizer
X = tfidf_vectorizer.fit_transform(sentences)

# Assuming 'new_corpus' is a list of tuples where the second element in each tuple is the label (1 or 0)
# Extract the labels (y) from 'new_corpus'
y = [val for _, val in new_corpus]

# The 'X' variable now contains the TF-IDF vectorized representation of your sentences,
# and 'y' contains the corresponding labels.


In [52]:
# train-test splitting
X_train = X[:1800]
y_train = y[:1800]
X_test = X[1800:]
y_test = y[1800:]

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import catboost
# training with CatBoostClassifier
from catboost import CatBoostClassifier
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

trained_models = {}
for model_name, model in models.items():
    model.fit(X_train.toarray(), y_train)
    trained_models[model_name] = model

# Step 3: Make predictions
predictions = {}
for model_name, model in trained_models.items():
    y_pred = model.predict(X_test.toarray())
    predictions[model_name] = y_pred

# Step 4: Evaluate the models
classification_scores = {}
for model_name, y_pred in predictions.items():
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    classification_scores[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    }

# Print classification scores for each model
for model_name, scores in classification_scores.items():
    print(f"Model: {model_name}")
    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}")
    print()


Learning rate set to 0.013241
0:	learn: 0.6907227	total: 268ms	remaining: 4m 28s
1:	learn: 0.6887364	total: 509ms	remaining: 4m 14s
2:	learn: 0.6864846	total: 766ms	remaining: 4m 14s
3:	learn: 0.6845819	total: 1.02s	remaining: 4m 13s
4:	learn: 0.6817174	total: 1.26s	remaining: 4m 10s
5:	learn: 0.6793544	total: 1.51s	remaining: 4m 10s
6:	learn: 0.6774416	total: 1.79s	remaining: 4m 14s
7:	learn: 0.6748642	total: 2.04s	remaining: 4m 13s
8:	learn: 0.6730348	total: 2.29s	remaining: 4m 11s
9:	learn: 0.6709713	total: 2.53s	remaining: 4m 10s
10:	learn: 0.6687515	total: 2.77s	remaining: 4m 9s
11:	learn: 0.6666151	total: 3.02s	remaining: 4m 8s
12:	learn: 0.6644471	total: 3.26s	remaining: 4m 7s
13:	learn: 0.6626934	total: 3.51s	remaining: 4m 7s
14:	learn: 0.6611005	total: 3.75s	remaining: 4m 6s
15:	learn: 0.6589488	total: 4s	remaining: 4m 6s
16:	learn: 0.6574820	total: 4.25s	remaining: 4m 5s
17:	learn: 0.6549623	total: 4.52s	remaining: 4m 6s
18:	learn: 0.6532207	total: 4.76s	remaining: 4m 5s
19:	

  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


Model: Random Forest
Accuracy: 0.8700
Precision: 0.8700
Recall: 0.8700
F1-Score: 0.8700

Model: Logistic Regression
Accuracy: 0.8700
Precision: 0.8558
Recall: 0.8900
F1-Score: 0.8725

Model: SVM
Accuracy: 0.8600
Precision: 0.8396
Recall: 0.8900
F1-Score: 0.8641

Model: K-Nearest Neighbors
Accuracy: 0.6850
Precision: 0.6581
Recall: 0.7700
F1-Score: 0.7097

Model: Naive Bayes
Accuracy: 0.7200
Precision: 0.7340
Recall: 0.6900
F1-Score: 0.7113

Model: Decision Tree
Accuracy: 0.6200
Precision: 0.6111
Recall: 0.6600
F1-Score: 0.6346

Model: CatBoost
Accuracy: 0.8250
Precision: 0.7826
Recall: 0.9000
F1-Score: 0.8372



In [53]:
# training with Support Vector Classifier
from sklearn.svm import SVC
svc = SVC()
model = svc.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
#X_val_pca = clf.transform(X_val_tfidf)
y_pred = model.predict(X_test.toarray())
accuracy_score(y_pred, y_test)

0.86

In [243]:
# training with RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
rfc = RandomForestClassifier()
model = rfc.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
#X_val_pca = clf.transform(X_val_tfidf)
y_pred = model.predict(X_test.toarray())
accuracy_score(y_pred, y_test)

0.805

In [245]:
# training with DecisionTreeClassifier
from sklearn.tree import DecisionTreeClassifier
dtc = DecisionTreeClassifier()
model = dtc.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
#X_val_pca = clf.transform(X_val_tfidf)
y_pred = model.predict(X_test.toarray())
accuracy_score(y_pred, y_test)

0.67

In [420]:
from sklearn.naive_bayes import GaussianNB  
from sklearn.naive_bayes import MultinomialNB  
from sklearn.naive_bayes import BernoulliNB
# training with Naive Bayes
mulnb = MultinomialNB()
model = mulnb.fit(X_train.toarray(), y_train)
from sklearn.metrics import accuracy_score
#X_val_pca = clf.transform(X_val_tfidf)
y_pred = model.predict(X_test.toarray())
accuracy_score(y_pred, y_test)

0.825

In [55]:
import catboost
# training with CatBoostClassifier
from catboost import CatBoostClassifier
catc = CatBoostClassifier(verbose=0)
model = catc.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
#X_val_pca = clf.transform(X_val_tfidf)
y_pred = model.predict(X_test.toarray())
accuracy_score(y_pred, y_test)

0.825

In [None]:
# Attempt at using Word2Vec (failed)

In [208]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

sentences = list_sentences.copy()

tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

model = Word2Vec(tokenized_sentences, vector_size=10, window=5, min_count=1, sg=0)

vector = model.wv['boy']

Vector representation of 'document':
[-2.7164922  -0.77762073  1.6346495   0.6768557   2.2300434  -0.83758444
  1.8690907   1.0641067  -2.7853336  -0.2433745 ]


In [234]:
X=[]
for sentence in sentences:
    sent = sentence.split(' ')
    list1 = []
    for word in sent:
        if(len(word)!=1):
            list1.append(model.wv(word))
    X.append(list1)

TypeError: 'KeyedVectors' object is not callable

# POS tagging in Sentiment Classifier

In [116]:
import random

# Initialize an empty list to store the corpus
corpus = []

# Assuming you have two lists: positive_sentences and negative_sentences
# Add positive sentences to the corpus with a label of 1 (indicating positive sentiment)
for sentence in positive_sentences:
    corpus.append(tuple([sentence, 1]))

# Add negative sentences to the corpus with a label of 0 (indicating negative sentiment)
for sentence in negative_sentences:
    corpus.append(tuple([sentence, 0]))

# Shuffle the corpus to randomize the order of sentences
random.shuffle(corpus)

# The 'corpus' now contains sentence-label pairs in the form (sentence, review)


In [117]:
# Import necessary libraries and download required resources
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from stop_words import get_stop_words

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

# Sample sentence
corpus_new = []

# Preprocessing step 1: Remove stopwords
for sentence, val in corpus:
    # Tokenize the sentence
    words = word_tokenize(sentence)

    # Get the list of English stopwords
    stop_words = list(get_stop_words('en'))         # About 900 stopwords
    nltk_words = list(stopwords.words('english'))   # About 150 stopwords
    stop_words.extend(nltk_words)

    # Filter out stopwords from the sentence
    filtered_words = [word for word in words if word.lower() not in stop_words]
    filtered_sentence = ' '.join(filtered_words)
    corpus_new.append(tuple([filtered_sentence, val]))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [118]:
corpus_new[0]

("days , witnessing deluge films based old , cult tv shows . times , fans shows shudder thinking hollywood hacks present memories past . , five six years ago , ' many movies trend ' look depressing . , people , like author review , grew watching flintstones , popular 1960s animated series `` modern stone age `` family , ' particularly worried word came live action remake . , producer behind project steven spielberg , anything else , least special effects good . plot revolves flintstones , family set fictious stone age `` town `` bedrock , whose members enjoy lifestyle 1950s middle class america . fred flinstone ( john goodman ) works quarry , one day helps best friend neighbour barney rubble ( rick moranis ) wife betty ( rosie ' donnell ) adopt baby . return favour , barney switches results aptitude test fred , , based , fred gets well - paid job management . , course , sham - corrupt official cliff vandercave ( kyle maclachlan ) sultry secretary sharon stone ( halle berry ) need scape

In [None]:
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
#tags_to_eliminate = ['DT', 'IN', 'CC', 'PRP', '.', ',', ':', 'RP', 'UH', 'CD']
pos_tags_to_keep = ['NN', 'NNS', 'VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ', 'JJ', 'JJR', 'JJS', 'RB', 'RBR', 'RBS']
from nltk import word_tokenize, pos_tag
corpus_pos = []
j=0
for sentence, val in corpus_new: # removing any word other than Noun, Verb, adverb and adjective since these are the most important     
    if(j%100==0):
        print(j)
    j+=1
    new_sent = []
    path = pos_tag(word_tokenize(sentence))
    for word, element in path:
        if(element in pos_tags_to_keep):
            new_sent.append(word)
    new_sent = ' '.join(new_sent)
    corpus_pos.append(tuple([new_sent, val]))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


0
100
200
300


In [107]:
# Initialize a new corpus for stemmed words
new_corpus = []

# Preprocessing step 2: Stemming
for sentence, val in corpus_pos:
    #print(sentence)
    # Tokenize the sentence
    words = word_tokenize(sentence)
    
    # Create a Porter stemmer instance
    stemmer = PorterStemmer()
    
    # Apply stemming to each word in the sentence
    stemmed_words = [stemmer.stem(word) for word in words]
    
    # Join the stemmed words back into a sentence
    stemmed_sentence = ' '.join(stemmed_words)
    
    # Append the stemmed sentence and its label to the new corpus
    new_corpus.append(tuple([stemmed_sentence, val]))

In [108]:
# THEN USING THE SAME CLASSIFIER AS BEFORE

In [109]:
list_sentences = [sentence for sentence,_ in new_corpus]

In [110]:
# Import the necessary library
from sklearn.feature_extraction.text import TfidfVectorizer

# Assuming 'list_sentences' contains your list of sentences and 'new_corpus' contains your labeled data

# Create a copy of the 'list_sentences' (assuming it's a list of sentences)
sentences = list_sentences.copy()

# Initialize a TfidfVectorizer with a maximum of 2000 features (words or terms)
tfidf_vectorizer = TfidfVectorizer(max_features=5000)

# Fit and transform the sentences using the TfidfVectorizer
X = tfidf_vectorizer.fit_transform(sentences)

# Assuming 'new_corpus' is a list of tuples where the second element in each tuple is the label (1 or 0)
# Extract the labels (y) from 'new_corpus'
y = [val for _, val in new_corpus]

# The 'X' variable now contains the TF-IDF vectorized representation of your sentences,
# and 'y' contains the corresponding labels.


In [111]:
# train-test splitting
X_train = X[:1800]
y_train = y[:1800]
X_test = X[1800:]
y_test = y[1800:]

In [112]:
# training with Support Vector Classifier
from sklearn.svm import SVC
svc = SVC()
model = svc.fit(X_train, y_train)
from sklearn.metrics import accuracy_score
#X_val_pca = clf.transform(X_val_tfidf)
y_pred = model.predict(X_test.toarray())
accuracy_score(y_pred, y_test)

0.855

In [113]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
import catboost
# training with CatBoostClassifier
from catboost import CatBoostClassifier
models = {
    'Random Forest': RandomForestClassifier(),
    'Logistic Regression': LogisticRegression(),
    'SVM': SVC(),
    'K-Nearest Neighbors': KNeighborsClassifier(),
    'Naive Bayes': GaussianNB(),
    'Decision Tree': DecisionTreeClassifier(),
    'CatBoost': CatBoostClassifier(verbose=0)
}

trained_models = {}
for model_name, model in models.items():
    model.fit(X_train.toarray(), y_train)
    trained_models[model_name] = model

# Step 3: Make predictions
predictions = {}
for model_name, model in trained_models.items():
    y_pred = model.predict(X_test.toarray())
    predictions[model_name] = y_pred

# Step 4: Evaluate the models
classification_scores = {}
for model_name, y_pred in predictions.items():
    accuracy = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred)
    recall = recall_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred)
    
    classification_scores[model_name] = {
        'Accuracy': accuracy,
        'Precision': precision,
        'Recall': recall,
        'F1-Score': f1
    }

# Print classification scores for each model
for model_name, scores in classification_scores.items():
    print(f"Model: {model_name}")
    for metric, score in scores.items():
        print(f"{metric}: {score:.4f}")
    print()


KeyboardInterrupt: 

In [None]:
# SOME OTHER TRIES

In [267]:
feature_names = tfidf_vectorizer.get_feature_names_out()
unique_dict = {}
for i in range(len(sentences)):
    
    if(i%100==0): print(i)
    for j, word in enumerate(feature_names):
        tfidf_value = X[i, j]
        if(tfidf_value!=0):
            unique_dict[word] = tfidf_value

0
100
200
300
400
500
600
700
800
900
1000
1100
1200
1300
1400
1500
1600
1700
1800
1900


In [100]:
simplified_pos_tags = [[tag[1][0] if tag[1][0] in ['N', 'V', 'J'] else 'Other' for tag in doc] for doc in pos_tagged_documents]

# Join words and POS tags to create combined tokens
list_sent = [sentence for sentence,_ in corpus_pos]
tokenized_documents = [word_tokenize(doc) for doc in list_sent]
pos_tagged_documents = [pos_tag(tokens) for tokens in tokenized_documents]

combined_tokens = [' '.join([word, pos]) for words, poses in zip(tokenized_documents, simplified_pos_tags) for word, pos in zip(words, poses)]

In [80]:
unique_list = []
for key,items in unique_dict_var.items():
    unique_list.append(items)

In [273]:
unique_dict_var = {}
i=0
for key, value in unique_dict.items():
    list_1 = []
    if(i%2==0): 
        print(i)
        i+=1
    list_1.append(value)
    A = [1,3,4,5]
    tag_1 = [dict_to_list[j] for j in A]
    list_1.append([emmision_probability(key, tag) for tag in tag_1])
    unique_dict_var[key] = list_1

0


In [83]:
word_number = {}
n=1
for key,items in unique_dict.items():
    word_number[key] = n
    n+=1

In [None]:
X=[]
for sentence in sentences:
    vector = unique_list.copy()
    for word in sentence:
        if(word in unique_dict.keys() == False):
            vector[word_number[word]] = [0]*(len(final_tags)+1)
    

In [52]:
unique_dict

26022

In [None]:
unique_dict = {}
for i in range(2000):
    

In [32]:
tfidf_values = X.toarray()
feature_names = tfidf_vectorizer.get_feature_names_out()

unique_words_tfidf = {}

# Populate the dictionary with unique words and their TF-IDF values
for j, word in enumerate(feature_names):
    tfidf_values_for_word = tfidf_values[:, j]
    unique_words_tfidf[word] = list(tfidf_values_for_word)


In [None]:
unique_words_tfidf

In [68]:
!pip install xgboost

Collecting xgboost
  Downloading xgboost-2.0.0-py3-none-win_amd64.whl (99.7 MB)
     ---------------------------------------- 99.7/99.7 MB 3.0 MB/s eta 0:00:00
Installing collected packages: xgboost
Successfully installed xgboost-2.0.0


In [31]:
print(train_data[5])

  (0, 2038)	0.46847214607810606
  (0, 3250)	0.4615748873552156
  (0, 3929)	0.29183767244813036
  (0, 2991)	0.3543046043529594
  (0, 4880)	0.5973118883685008


In [66]:
Xpca.shape

(45258, 100)

0.6875

In [68]:
list_sentences_test = [sentence for sentence,_ in val_data]
X_val_tfidf = tfidf_vectorizer.transform(list_sentences_test)

ValueError: not enough values to unpack (expected 2, got 1)

In [76]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

sentences = list_sentences.copy()

tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

model = Word2Vec(tokenized_sentences, vector_size=10, window=5, min_count=1, sg=0)

vector = model.wv['boy']
print("Vector representation of 'document':")
print(vector)

Vector representation of 'document':
[-0.24991283 -1.6413478   0.7961875   1.3395231  -0.01218642 -0.68260735
  1.6818783   2.517201   -3.1405737   0.38644934]


In [None]:
tokens = word_tokenize(sent)
print('Word Tokens: ',tokens)
print()

print('POS Tags: ', pos_tag(tokens))

In [None]:
from nltk.stem import PorterStemmer

nltk.download('omw-1.4')
stemmer = PorterStemmer()

In [704]:
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [sentence for sentence, _ in train_data]

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(sentences)

print("TfidfVectorizer Vocabulary:")
print(tfidf_vectorizer.get_feature_names_out())
print()

print("TfidfVectorizer Vectorized Data:")
print()
# print(X_tfidf.toarray())

print(sentences[0])
print(X_tfidf[0].toarray())
print()

TfidfVectorizer Vocabulary:
['00' '000' '007' ... 'zwigoff' 'zycie' 'zzzzzzz']

TfidfVectorizer Vectorized Data:

it does not need to be seen twice : its every detail and implication can be absorbed in one viewing .
[[0. 0. 0. ... 0. 0. 0.]]



In [5]:
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


True

In [634]:
sent = 'My name is Rounak Mandal. will'

tokens = sent_tokenize(sent)
print('Sentence Tokens: ',tokens)
print()

tokens = word_tokenize(sent)
print('Word Tokens: ',tokens)
print()

print('POS Tags: ', pos_tag(tokens))

Sentence Tokens:  ['My name is Rounak Mandal.', 'will']

Word Tokens:  ['My', 'name', 'is', 'Rounak', 'Mandal', '.', 'will']

POS Tags:  [('My', 'PRP$'), ('name', 'NN'), ('is', 'VBZ'), ('Rounak', 'NNP'), ('Mandal', 'NNP'), ('.', '.'), ('will', 'MD')]


In [37]:
from nltk.stem import PorterStemmer, WordNetLemmatizer

nltk.download('wordnet')
nltk.download('omw-1.4')
stemmer = PorterStemmer()

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\rouna\AppData\Roaming\nltk_data...


In [50]:
sent = 'I am studying Natural Language Processing. I also study nature'
words = word_tokenize(sent)

stemmed_words = [stemmer.stem(word) for word in words]

print(stemmed_words)

['i', 'am', 'studi', 'natur', 'languag', 'process', '.', 'i', 'also', 'studi', 'natur']


In [49]:
lemmatizer = WordNetLemmatizer()

lemmatized_words = [lemmatizer.lemmatize(word, pos='n') for word in words]

print(lemmatized_words)

['I', 'am', 'studying', 'Natural', 'Language', 'Processing', '.', 'I', 'also', 'study', 'nature']


In [51]:
from sklearn.feature_extraction.text import TfidfVectorizer

sentences = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(sentences)

print("TfidfVectorizer Vocabulary:")
print(tfidf_vectorizer.get_feature_names_out())
print()

print("TfidfVectorizer Vectorized Data:")
print()
# print(X_tfidf.toarray())
for i in range(len(sentences)):
    print(sentences[i])
    print(X_tfidf[i].toarray())
    print()

TfidfVectorizer Vocabulary:
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

TfidfVectorizer Vectorized Data:

This is the first document.
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

This document is the second document.
[[0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]]

And this is the third one.
[[0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]]

Is this the first document?
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]



In [54]:
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize

sentences = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?"
]

tokenized_sentences = [word_tokenize(sentence.lower()) for sentence in sentences]

model = Word2Vec(tokenized_sentences, vector_size=10, window=5, min_count=1, sg=0)

vector = model.wv['document']
print("Vector representation of 'document':")
print(vector)

Vector representation of 'document':
[-0.00535778  0.00236474  0.05104287  0.09009314 -0.09300297 -0.07115427
  0.06456409  0.08971071 -0.05015172 -0.03761639]
