In [1]:
import math
import os
import pandas as pd
from sklearn.model_selection import train_test_split

Problem 1

In [2]:
df_sst = pd.read_csv("SST-2/train.tsv", delimiter="\t")
labels = df_sst['label']
X_train, X_temp, y_train, y_temp=train_test_split(df_sst, labels, test_size=(100 + 100), random_state=42)
X_validation, X_test, y_validation, y_test=train_test_split(X_temp, y_temp, test_size=100, random_state=42)
prior_prob_positive=y_train.sum()/len(y_train)
prior_prob_negative=1-prior_prob_positive
print("Prior probability of positive class in the training set:", prior_prob_positive)
print("Prior probability of negative class in the training set:", prior_prob_negative)

Prior probability of positive class in the training set: 0.5576702557000104
Prior probability of negative class in the training set: 0.4423297442999896


Problem 2

In [3]:
def tokenizer(sentence):
    retList=['<s>']
    tokenized_sentence=sentence.split()
    for word in tokenized_sentence:
        retList.append(word)
    retList.append('</s>')
    return retList

X_train['tokenized_sentence']=X_train['sentence'].apply(tokenizer)
print("Tokenization of the first sentence in the training set:")
print(type(X_train['tokenized_sentence'].iloc[0]))
vocabulary=set()
for tokens in X_train['tokenized_sentence']:
    vocabulary.update(tokens)

vocabulary_size=len(vocabulary)
print("\nVocabulary size of the training set (including start and end symbols):", vocabulary_size)

Tokenization of the first sentence in the training set:
<class 'list'>

Vocabulary size of the training set (including start and end symbols): 14817


Problem 3

In [4]:
def bigram_counts_organizer(tokenized_sequences):
    bigram_counts={}
    for sequence in tokenized_sequences:
        for i in range(len(sequence)-1):
            wi=sequence[i]
            wj=sequence[i+1]
            if wi not in bigram_counts:
                bigram_counts[wi] ={}
            if wj not in bigram_counts[wi]:
                bigram_counts[wi][wj]=0
            bigram_counts[wi][wj]+=1
    return bigram_counts

bigram_counts = bigram_counts_organizer(X_train['tokenized_sentence'])
start_with_the_count = bigram_counts["<s>"].get("the", 0)

print("The bigram ('<s>', 'the') occurs", start_with_the_count,"times in the training set:")


The bigram ('<s>', 'the') occurs 4451 times in the training set:


Problem 4

In [5]:
def Lidstone_smoothing(curr_word,prev_word,bigram_counts,alpha,vocab_size):
    count_bigram=0
    if prev_word in bigram_counts and curr_word in bigram_counts[prev_word]:
        count_bigram=bigram_counts[prev_word][curr_word]
    count_prev_word=sum(bigram_counts.get(prev_word, {}).values())
    return math.log(((count_bigram+alpha)/(count_prev_word+alpha*vocab_size)))

Lidstone_smoothing("award","academy",bigram_counts,.001,vocabulary_size)#these words are solely based on 

-1.0251860898691059

In [6]:
Lidstone_smoothing("award","academy",bigram_counts,.5,vocabulary_size)

-6.173181082203538

Problem 5

In [7]:
def log_prob_calculator(sentence,bigram_counts,alpha,vocabulary_size):
    sentence=sentence.split()
    log_probability=0
    prev_word="<s>"
    for curr_word in sentence:
        log_probability+=Lidstone_smoothing(curr_word,prev_word,bigram_counts,alpha,vocabulary_size)
        prev_word=curr_word
    return log_probability
natural_English="this was a really great movie but it was a little too long."
unnatural_English="long too little a was it but movie great really a was this."

log_prob_calculator(natural_English,bigram_counts,.0001,vocabulary_size)

-80.27771915479497

In [8]:
log_prob_calculator(unnatural_English,bigram_counts,.0001,vocabulary_size)

-173.9217151851817

Problem 6

In [9]:
X_validation['tokenized_sentence']=X_validation['sentence'].apply(tokenizer)
tokenized_sentence_list=X_validation['tokenized_sentence'].tolist()
tokenized_sentence_list.remove(tokenized_sentence_list[0])
tokenized_sentence_list.remove(tokenized_sentence_list[-1])
tokenized_sentence_list=[word for sentence in tokenized_sentence_list for word in sentence]
tokenized_sentence_list=tokenized_sentence_list[1:-1]
tokenized_sentence=' '.join(tokenized_sentence_list)
selected_alpha=log_prob_calculator(tokenized_sentence,bigram_counts,.001,vocabulary_size)
print(selected_alpha)

-5787.20481731709


In [10]:
log_prob_calculator(tokenized_sentence,bigram_counts,.01,vocabulary_size)

-6322.267898978088

In [11]:
log_prob_calculator(tokenized_sentence,bigram_counts,.1,vocabulary_size)

-7415.913096828978

Problem 7

In [12]:
positive_sentences=X_train[X_train['label'] == 1]['tokenized_sentence'].tolist()
negative_sentences=X_train[X_train['label'] == 0]['tokenized_sentence'].tolist()
positive_vocab_size=len(set(word for sentence in positive_sentences for word in sentence))
negative_vocab_size=len(set(word for sentence in negative_sentences for word in sentence))
positive_bigram_counts=bigram_counts_organizer(positive_sentences)
negative_bigram_counts=bigram_counts_organizer(negative_sentences)
predicted_labels=[]
true_labels=X_test['label'].tolist()
correct_predictions=0
for i, sentence in enumerate(X_test['sentence']):
    log_odds_positive=log_prob_calculator(sentence,positive_bigram_counts,selected_alpha,positive_vocab_size)+math.log(prior_prob_positive)-log_prob_calculator(sentence,negative_bigram_counts,selected_alpha,negative_vocab_size)-math.log(prior_prob_negative)
    if log_odds_positive<0:
        predicted_label=0
    else:
        predicted_label=1
    predicted_labels.append(predicted_label)
    if predicted_label==true_labels[i]:
        correct_predictions += 1
class_distribution={
    "Predicted Positive Sentiment":predicted_labels.count(1),
    "Predicted Negative Sentiment":predicted_labels.count(0)
}
print("Distribution of Predicted Sentiments:",class_distribution)
print("Accuracy:",correct_predictions/len(X_test))#coin flip, 1/2 ull get the right answer,

Distribution of Predicted Sentiments: {'Predicted Positive Sentiment': 45, 'Predicted Negative Sentiment': 55}
Accuracy: 0.52
