In [None]:
# load in data
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import os

# remove stop words and punctuation and html tags and lowercase
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # remove html tags
    text = re.sub('<[^<]+?>', ' ', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # remove stop words and lemmatize
    text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text) if word.lower() not in stop_words])
    return text.lower()

# load in the data
data = pd.DataFrame(columns=['text'])

#navigate to folder
# for file in os.listdir('aclImdb/train/unsup'):
#     if file.endswith('.txt'):
#         with open('aclImdb/train/unsup/' + file, 'r') as f:
#             data.loc[len(data)] = preprocess_text(f.read())

# load in the data
data = pd.DataFrame(columns=["text"])

# navigate to folder
for file in os.listdir("train/pos"):
    if file.endswith(".txt"):
        with open("train/pos/" + file, "r") as f:
            data.loc[len(data)] = preprocess_text(f.read())

for file in os.listdir("train/neg"):
    if file.endswith(".txt"):
        with open("train/neg/" + file, "r") as f:
            data.loc[len(data)] = preprocess_text(f.read())

# shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

print(data.head())

# load in the data
test_data = pd.DataFrame(columns=["text", "sentiment"])

# navigate to folder
for file in os.listdir("test/pos"):
    if file.endswith(".txt"):
        with open("test/pos/" + file, "r") as f:
            test_data.loc[len(data)] = [preprocess_text(f.read()), 1]

for file in os.listdir("test/neg"):
    if file.endswith(".txt"):
        with open("test/neg/" + file, "r") as f:
            test_data.loc[len(data)] = [preprocess_text(f.read()), -1]

# shuffle the data
test_data = test_data.sample(frac=1).reset_index(drop=True)

print(test_data.head())




                                                text
0  remember original series vividly mostly due un...
1  girlfight using wellknown formula someone poin...
2  maybe wasnt good whole second episode first on...
3  two thing haunt throughout lintrus intruder wh...
4  couldnt believe put movie dvd player thought i...


In [23]:
#preprocess the training data
print(data.shape)
print(data.loc[0, 'text'])

# set seed words
pos_seed_words = ['good', 'great', 'excellent', 'amazing', 'awesome', 'fantastic', 'terrific', 'wonderful', 'superb', 'brilliant']
neg_seed_words = ['bad', 'terrible', 'crap', 'useless', 'hate', 'horrible', 'awful', 'worst', 'boring', 'disgusting']

# pos_seed_words = ['bad']
# neg_seed_words = ['great']


num_pos = 0
num_neg = 0

#I have currently set the automatic sentiment analysis to be based on the number of seed words in the review
# if there are 4 more positive seed words than negative seed words, the review is positive and vice versa
# this gives about 1000 seeds and 3 takes it up to 2000
diff_threshold = 4

# # create a target variable
# y = np.zeros(data.shape[0])
# for i in range(data.shape[0]):
#     pos_count = 0
#     neg_count = 0
#     for word in data.loc[i, 'text'].split():
#         if word in pos_seed_words:
#             pos_count += 1
#         if word in neg_seed_words:
#             neg_count += 1
#     if pos_count - neg_count > diff_threshold:
#         y[i] = 1
#         num_pos += 1
#         # print(i, "positive")
#     elif neg_count - pos_count > diff_threshold:
#         y[i] = -1
#         num_neg += 1
#         # print(i, "negative")

#create a target variable
y = np.zeros(data.shape[0])
for i in range(data.shape[0]):
    pos_count = 0
    neg_count = 0
    for word in data.loc[i, 'text'].split():
        if word in pos_seed_words:
            pos_count += 1
        if word in neg_seed_words:
            neg_count += 1
    if pos_count > diff_threshold and neg_count == 0:
        y[i] = 1
        num_pos += 1
        # print(i, "positive")
    elif neg_count > diff_threshold and pos_count == 0:
        y[i] = -1
        num_neg += 1
        # prin

print("total positive reviews:", num_pos)
print("total negative reviews:", num_neg)

# add the target variable to the data
data['sentiment'] = y


(25000, 1)
remember original series vividly mostly due unique blend wry humor macabre subject matter kolchak hardbitten newsman ben hecht school bigcity reporting gritty determination wiseass demeanor made even mundane episode eminently watchable personal fave spanish moss murders due totally original storyline poortroubled cajun youth louisiana bayou country take part sleep research experiment purpose dream analysis something go inexplicably wrong literally dream life swamp creature inhabiting dark folk tale youth malevolent manifestation seek person wronged dreamer conscious state brutally suffocates death kolchak investigates uncovers horrible truth much chagrin police captain joe mad dog siskawonderfully essayed grumpy keenan wynnand head sleep researcher played second city improv founder severn darden droll understated perfection wickedly funny harrowing finale take place chicago sewer system series highlight kolchak never got better timeless
total positive reviews: 577
total nega

In [24]:
# print the first positive review
print(data.loc[y == 1, 'text'].iloc[0])

# test seed words to make sure they preprocess correctly
pos_sentence = ' '.join(word for word in pos_seed_words)
neg_sentence = ' '.join(word for word in neg_seed_words)

print(pos_sentence)
print(neg_sentence)

preprocess_text(pos_sentence)
preprocess_text(neg_sentence)

print(pos_sentence)
print(neg_sentence)

intend write review read default review show movie url felt compelled write rebuttal movie word superlative deserve slanderous review writer written think writer totally missed point movie large extent fact turned excessive show evangelist devotion occupied middle movie large extent however must beg differ reviewer movie end propaganda piece evangelist action think director shown religion enough find answer religion large extent incapable providing answer basic simple question one may ask religion offer sometimes banal platitude one kind another demonstrate value judgment religion remember religion transmuted expressed ordinary mostly well meaning basically good people usually monopoly truth thus religion end provide ultimate answer question life ultimately matter faith take faith thats given faith appreciate show faith given faith show faith tiresome thus time instructive note reviewer reaction movie case director show u one choose accept religious interpretation event answer question

In [27]:
# create decision tree usingt seed sete
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.calibration import CalibratedClassifierCV

vectorizer = CountVectorizer()


# create a decision tree classifier
clf = DecisionTreeClassifier(max_depth=5, class_weight='balanced')

# Wrap the decision tree in a calibrated classifier
calibrated_clf = CalibratedClassifierCV(clf, method='isotonic')

max_iterations = 10
conf_thresh = 0.9

 # separate seed set from unlabelled set
seed_set = data[data['sentiment'] != 0]
unlabelled_set = data[data['sentiment'] == 0]

X = seed_set['text']
y = seed_set['sentiment']

for i in range(max_iterations):

    # vectorize the text
    vector_X = vectorizer.fit_transform(X)
    unlabelled_X = vectorizer.transform(unlabelled_set['text'])

    # train classifier
    clf.fit(vector_X, y)
    calibrated_clf.fit(vector_X, y)

    # predict labels for unlabelled set
    y_pred = calibrated_clf.predict(unlabelled_X)

    # get confidence scores
    conf_scores = calibrated_clf.predict_proba(unlabelled_X)
    print("scores", conf_scores) # this is the confidence score for each class and if it is 0, 1 it is determining the class immediately

    # get indices of high confidence predictions
    high_conf_indices = np.where(np.max(conf_scores, axis=1) > conf_thresh)[0]
    print("Number of high confidence predictions:", len(high_conf_indices))

    if len(high_conf_indices) == 0:
        print("No high confidence predictions left", i)
        break

    # add high confidence predictions to seed set
    X = np.concatenate((X, unlabelled_set.iloc[high_conf_indices]['text']))
    y = np.concatenate((y, y_pred[high_conf_indices]))
    # remove high confidence predictions from unlabelled set
    unlabelled_set = unlabelled_set.drop(unlabelled_set.index[high_conf_indices])

    if (unlabelled_set.shape[0] == 0):
        print("No more unlabelled data left")
        break



scores [[0.01018496 0.98981504]
 [0.01018496 0.98981504]
 [0.01018496 0.98981504]
 ...
 [0.01018496 0.98981504]
 [0.01018496 0.98981504]
 [0.20514294 0.79485706]]
Number of high confidence predictions: 22791
scores [[0.80042283 0.19957717]
 [0.80042283 0.19957717]
 [0.80042283 0.19957717]
 ...
 [0.2        0.8       ]
 [0.80042283 0.19957717]
 [0.2        0.8       ]]
Number of high confidence predictions: 22
scores [[9.99296270e-01 7.03729768e-04]
 [9.99296270e-01 7.03729768e-04]
 [9.99296270e-01 7.03729768e-04]
 ...
 [2.41545894e-04 9.99758454e-01]
 [9.99296270e-01 7.03729768e-04]
 [2.41545894e-04 9.99758454e-01]]
Number of high confidence predictions: 1370
scores [[0.40022995 0.59977005]]
Number of high confidence predictions: 0
No high confidence predictions left 3


In [28]:


# test the classifier
X_test = test_data['text']
y_test = test_data['sentiment']

vector_X_test = vectorizer.transform(X_test)
y_pred = calibrated_clf.predict(vector_X_test)

from sklearn.metrics import accuracy_score
print("Accuracy:", accuracy_score(y_test, y_pred))

from sklearn.metrics import confusion_matrix
print("Confusion matrix:")
print(confusion_matrix(y_test, y_pred))

from sklearn.metrics import classification_report
print("Classification report:")
print(classification_report(y_test, y_pred))





Accuracy: 0.0
Confusion matrix:
[[0 1]
 [0 0]]
Classification report:
              precision    recall  f1-score   support

        -1.0       0.00      0.00      0.00       1.0
         1.0       0.00      0.00      0.00       0.0

    accuracy                           0.00       1.0
   macro avg       0.00      0.00      0.00       1.0
weighted avg       0.00      0.00      0.00       1.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
