## Import relevant libraries

In [1]:
import nltk
# Fetch the NLTK resources used below: tokenizer models, stop-word lists and WordNet.
for resource in ('punkt', 'stopwords'):
    nltk.download(resource)
# Kept as the cell's last expression so its return value is still displayed.
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Oreo\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Oreo\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Oreo\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [10]:
from nltk.corpus import stopwords #=> stop words removal 
from nltk import pos_tag #=> for defining part of speech 
from nltk.stem.porter import PorterStemmer #=> stemming word into its root
from nltk.stem.wordnet import WordNetLemmatizer #=> Lemmatizing variants of word to its original form
from nltk.tokenize import word_tokenize , sent_tokenize #=> for tokenizing sentence

## 1. Normalize 

In [39]:
# Normalisation step: lower-case the raw sentence so later comparisons are case-insensitive.
text = "storing storage to store".lower()
print(text)

storing storage to store


## 2. Tokenize

In [40]:
# Split the normalised string into word-level tokens.
# NOTE: `text` is rebound from a str to a list of tokens here (the next cell reads it),
# so re-run the Normalize cell before re-running this one.
tokens = word_tokenize(text)
text = tokens
print(text)

['storing', 'storage', 'to', 'store']


## 3. Remove Stopwords

In [41]:

#=> Remove stop words
# Load the stop-word list once and keep it in a set: the previous version evaluated
# stopwords.words("english") again for every token in the comprehension.
english_stopwords = set(stopwords.words("english"))
words = [w for w in text if w not in english_stopwords]
print(words)

['storing', 'storage', 'store']


## 4. Stem/Lemmatize

In [42]:
#=> Stemming and Lemmatization
#-> Stemmer: reduces nouns and verbs to their stem
#   branched, branches, branching -> branch
#   (the stem is not always a real word, e.g. 'storag' in the output below)
stemmer = PorterStemmer()  # build once and reuse instead of one instance per word
stemmed = [stemmer.stem(w) for w in words]
print(stemmed)
#-> Lemmatizer: another technique to map a word's variants back to a base form
#   was, were, is -> be (uses a dictionary)
#   It is similar to the stemmer, except that it uses a dictionary and returns a meaningful word.
#   No part-of-speech tag is passed here, and 'storing' comes back unchanged in the output below.
lemmatizer = WordNetLemmatizer()  # build once and reuse instead of one instance per word
lemmatized = [lemmatizer.lemmatize(w) for w in words]
print(lemmatized)

['store', 'storag', 'store']
['storing', 'storage', 'store']


## Spam Classifier by Naive Bayes Part II 

In [11]:
import os 
import pandas as pd 

# NOTE: machine-specific absolute path; change it to wherever the exercise folder lives.
path = "C:\\Users\\Oreo\\Documents\\NLP-Exercises-master\\NLP-Exercises-master\\1.5-spam-classifier"
print(os.listdir(path))

# Build the dataset path from its components instead of concatenating backslashes by hand.
smsspamcollectionpath = os.path.join(path, "smsspamcollection", "SMSSpamCollection")
# List the folder that contains the dataset; dirname() replaces the old magic slice [:-18].
print(os.listdir(os.path.dirname(smsspamcollectionpath)))

# The file is tab-separated with no header row: column 1 is the label, column 2 the message.
df = pd.read_table(smsspamcollectionpath,
                   sep='\t',
                   names=['label', 'sms'])

df.head(5)

['Bayesian_Inference.ipynb', 'Bayesian_Inference_solution.ipynb', 'images', 'smsspamcollection']
['readme', 'SMSSpamCollection']


Unnamed: 0,label,sms
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [12]:
# Encode the labels numerically: ham -> 0, spam -> 1.
# Values that are not keys of the mapping (e.g. labels that are already 0/1) pass through
# unchanged, so re-running this cell no longer turns the whole column into NaN the way a
# plain .map(dict) does on its second run.
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(lambda label: label_codes.get(label, label))
df.head(3)

Unnamed: 0,label,sms
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...


In [51]:
from sklearn.model_selection import train_test_split

# Hold out part of the messages for evaluation; random_state pins the shuffle so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    df['sms'], df['label'], random_state=1
)

# Report how many rows ended up in each subset.
n_total, n_train, n_test = df.shape[0], X_train.shape[0], X_test.shape[0]
print('Number of rows in the total set: {}'.format(n_total))
print('Number of rows in the training set: {}'.format(n_train))
print('Number of rows in the test set: {}'.format(n_test))

Number of rows in the total set: 5572
Number of rows in the training set: 4179
Number of rows in the test set: 1393


In [52]:
# CountVectorizer is otherwise only imported in a later cell of this notebook; import it
# here so this cell also works on a fresh kernel (Restart & Run All).
from sklearn.feature_extraction.text import CountVectorizer

# Instantiate the CountVectorizer (bag-of-words) transformer
count_vector = CountVectorizer()

# Learn the vocabulary from the training messages and return their count matrix
training_data = count_vector.fit_transform(X_train)


# Transform the testing data with the vocabulary learned above and return the matrix.
# Note we are NOT fitting on the testing data, so no information leaks from the test set.
testing_data = count_vector.transform(X_test)


In [53]:
from sklearn.naive_bayes import MultinomialNB
# Train a multinomial Naive Bayes classifier on the word-count matrix.
# fit() returns the estimator itself, so construction and training are chained.
naive_bayes = MultinomialNB().fit(training_data, y_train)

# Predict a label for every held-out message.
predictions = naive_bayes.predict(testing_data)

In [54]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
#=> Accuracy: (tp + tn) / (tp + tn + fp + fn)
accuracy = accuracy_score(y_test, predictions)
#=> Precision: tp / (tp + fp)  -> e.g. 90 real spam flagged / (90 real + 10 wrongly flagged as spam)
#-> false positive: Type 1 error, false negative: Type 2 error
precision = precision_score(y_test, predictions)
#=> Recall: tp / (tp + fn)  -> e.g. 90 spam caught / (90 caught + 30 missed)
recall = recall_score(y_test, predictions)
#=> F1 = 2 * precision * recall / (precision + recall)
f1 = f1_score(y_test, predictions)

print('Accuracy score: ', format(accuracy))
print('Precision score: ', format(precision))
print('Recall score: ', format(recall))
print('F1 score: ', format(f1))

Accuracy score:  0.9885139985642498
Precision score:  0.9720670391061452
Recall score:  0.9405405405405406
F1 score:  0.9560439560439562


In [62]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test,predictions)
# Rows are the actual labels, columns the predicted labels, in label order 0 (ham), 1 (spam).

 
# [[TN FP]
#  [FN TP]]   <- consistent with the scores printed above: TP=174, FP=5, FN=11

array([[1203,    5],
       [  11,  174]], dtype=int64)

##  Another Example

In [27]:
import pprint
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from toolz import pipe

# Tiny toy corpus used to illustrate the bag-of-words representation.
documents = [
    'Hello, how are you!',
    'Win money, win from home.',
    'Call me now.',
    'Hello, Call hello you tomorrow?',
]

# Fresh vectorizer for the toy corpus (rebinds the name used for the SMS data above).
count_vector = CountVectorizer()

def lowercase(sentence):
    """Return ``sentence`` converted to lower case."""
    lowered = sentence.lower()
    return lowered

import string  # was never imported in this notebook, so the function raised NameError

# Translation table that deletes every ASCII punctuation character; built once and reused.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def remove_punctuation(sentence):
    """Return ``sentence`` with all ASCII punctuation characters removed.

    Parameters:
    - sentence (str): The text to clean.

    Returns:
    - str: The text without any character from ``string.punctuation``.
    """
    return sentence.translate(_PUNCTUATION_TABLE)

def remove_whitespace(sentence):
    """Split ``sentence`` into tokens, discarding all whitespace.

    Splitting on any run of whitespace (instead of on a single ' ') means repeated,
    leading or trailing spaces, tabs and newlines no longer produce empty-string
    tokens that would be counted as words downstream.

    Parameters:
    - sentence (str): The text to split.

    Returns:
    - list of str: The non-empty tokens, in order; ``[]`` for blank input.
    """
    return sentence.split()

# Preprocessing steps applied, in order, to every document.
preprocessing_steps = (lowercase, remove_punctuation, remove_whitespace)

preprocessed_documents = []
for document in documents:
    # pipe() feeds the document through each step, passing every result to the next one.
    processed_doc = list(pipe(document, *preprocessing_steps))
    preprocessed_documents.append(processed_doc)

preprocessed_documents

# One Counter per document, mapping each token to its number of occurrences.
frequency_list = [Counter(tokens) for tokens in preprocessed_documents]
pprint.pprint(frequency_list)

[Counter({'hello': 1, 'how': 1, 'are': 1, 'you': 1}),
 Counter({'win': 2, 'money': 1, 'from': 1, 'home': 1}),
 Counter({'call': 1, 'me': 1, 'now': 1}),
 Counter({'hello': 2, 'call': 1, 'you': 1, 'tomorrow': 1})]


In [28]:
# Learn the vocabulary of the toy corpus and show it; fit() returns the vectorizer itself.
count_vector.fit(documents).get_feature_names_out()

array(['are', 'call', 'from', 'hello', 'home', 'how', 'me', 'money',
       'now', 'tomorrow', 'win', 'you'], dtype=object)

In [50]:
# Column labels: the vocabulary learned by the vectorizer.
feature_names = count_vector.get_feature_names_out()

# Dense document-term matrix: one row per document, one column per vocabulary word.
doc_array = count_vector.transform(documents).toarray()
print(doc_array)

# Same matrix as a labelled DataFrame for readability.
frequency_matrix = pd.DataFrame(doc_array, columns=feature_names)
frequency_matrix

[[1 0 0 1 0 1 0 0 0 0 0 1]
 [0 0 1 0 1 0 0 1 0 0 2 0]
 [0 1 0 0 0 0 1 0 1 0 0 0]
 [0 1 0 2 0 0 0 0 0 1 0 1]]


Unnamed: 0,are,call,from,hello,home,how,me,money,now,tomorrow,win,you
0,1,0,0,1,0,1,0,0,0,0,0,1
1,0,0,1,0,1,0,0,1,0,0,2,0
2,0,1,0,0,0,0,1,0,1,0,0,0
3,0,1,0,2,0,0,0,0,0,1,0,1


## BLEU (will not follow this approach)


In [9]:
from nltk.translate.bleu_score import sentence_bleu

# Define a reference (a list of tokenised reference sentences) and a candidate sentence.
# 'quic' was a typo for 'quick'; the candidate has 'fast' in that position, so the
# mismatch - and therefore the score - is the same either way.
reference = [['the', 'quick', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']]
candidate = ['the', 'fast', 'brown', 'fox', 'jumped', 'over', 'the', 'lazy', 'dog']

# Calculate the sentence-level BLEU score of the candidate against the reference
bleu_score = sentence_bleu(reference, candidate)

print("BLEU Score:", bleu_score)


BLEU Score: 0.7506238537503395


## Keyword Matching score

In [31]:
# English stop-word list from NLTK, loaded once into a notebook-level constant
# (relies on the 'stopwords' corpus downloaded in the first cell).
STOP_WORDS = stopwords.words("english")

In [35]:
def keyword_matching_score(student_answer, keywords):
    """
    Calculate the score of a student's answer based on keyword matching.

    Matching is case-insensitive and substring-based: a keyword counts as found
    when it occurs anywhere in the answer, including inside a longer word
    (e.g. "graph" is found in "graphs").

    Parameters:
    - student_answer (str): The student's answer.
    - keywords (list of str): Keywords to match.

    Returns:
    - score (float): Fraction of keywords found in the answer, between 0.0 and 1.0.
      Returns 0.0 when no keywords are given.
    """
    # Lower-case both sides for case-insensitive matching
    answer_lower = student_answer.lower()
    keywords_lower = [keyword.lower() for keyword in keywords]

    # Nothing to match against: return a float, as documented, and avoid dividing by zero
    if not keywords_lower:
        return 0.0

    # Count the keywords that occur in the answer, then normalise by the keyword count
    matched = sum(1 for keyword in keywords_lower if keyword in answer_lower)
    return matched / len(keywords_lower)

# Example: two of the three expected keywords appear in the answer.
keywords = ["dynamic programming", "bfs", "graph"]
student_answer = "By using a dynamic programming approach along with graph ds"

score = keyword_matching_score(student_answer=student_answer, keywords=keywords)
print("Keyword Matching Score:", score)


Keyword Matching Score: 0.6666666666666666


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def tfidf_similarity(student_answer, model_answer, analyzer='char_wb', ngram_range=(1, 1)):
    """
    Calculate the TF-IDF cosine similarity between a student's answer and a model answer.

    The vectorizer is fitted on the model answer only, so the vocabulary and IDF
    weights come from that single document.

    NOTE: the defaults (kept for backward compatibility) are the values this function
    previously hard-coded: the 'char_wb' analyzer with scikit-learn's default n-gram
    range of (1, 1), i.e. single characters. With those settings the score reflects
    how similar the character distributions are and tends to be very high for any
    two English texts. Pass analyzer='word', or a wider ngram_range such as (2, 4),
    for a more discriminative score.

    Parameters:
    - student_answer (str): The student's answer.
    - model_answer (str): The model answer.
    - analyzer (str): Feature type passed to TfidfVectorizer ('word', 'char' or 'char_wb').
    - ngram_range (tuple of int): (min_n, max_n) n-gram sizes passed to TfidfVectorizer.

    Returns:
    - similarity_score (float): The TF-IDF cosine similarity score.
    """
    # Create a TF-IDF vectorizer
    vectorizer = TfidfVectorizer(input='content', analyzer=analyzer,
                                 ngram_range=ngram_range,
                                 decode_error='ignore')

    # Fit the vectorizer on the model answer
    vectorizer.fit([model_answer])

    # Transform both answers into TF-IDF vectors (row 0: student, row 1: model)
    tfidf_matrix = vectorizer.transform([student_answer, model_answer])

    # Pairwise cosine similarities; entry [0, 1] compares the student with the model answer.
    # Converted to a built-in float so the return type matches the documentation.
    similarity_score = float(cosine_similarity(tfidf_matrix)[0, 1])

    return similarity_score

# Example usage:
student_answer = "A data structure is a format for storage , ogranizing and manipulating data. some examples of data structures are trees , graphs and lists"
model_answer = "A data structure is a storage format that defines the way data is stored, organized, and manipulated. Like trees , graphs and lists"

# Compare the student's answer with the model answer.
tfidf_score = tfidf_similarity(student_answer=student_answer, model_answer=model_answer)
print("TF-IDF Similarity Score:", tfidf_score)


TF-IDF Similarity Score: 0.9914506671503621


## Manual Tf-IDF