In [2]:
# load in data
import pandas as pd
import numpy as np
# import matplotlib.pyplot as plt
import os

# remove stop words and punctuation and html tags and lowercase
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # remove html tags
    text = re.sub('<[^<]+?>', ' ', text)
    # remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # remove stop words and lemmatize
    text = ' '.join([lemmatizer.lemmatize(word) for word in word_tokenize(text) if word.lower() not in stop_words])
    return text.lower()

# load in the data
data = pd.DataFrame(columns=['text'])

#navigate to folder
# for file in os.listdir('aclImdb/train/unsup'):
#     if file.endswith('.txt'):
#         with open('aclImdb/train/unsup/' + file, 'r') as f:
#             data.loc[len(data)] = preprocess_text(f.read())

# load in the data
data = pd.DataFrame(columns=["text"])

# navigate to folder
for file in os.listdir("train/pos"):
    if file.endswith(".txt"):
        with open("train/pos/" + file, "r") as f:
            data.loc[len(data)] = preprocess_text(f.read())

for file in os.listdir("train/neg"):
    if file.endswith(".txt"):
        with open("train/neg/" + file, "r") as f:
            data.loc[len(data)] = preprocess_text(f.read())

# shuffle the data
data = data.sample(frac=1).reset_index(drop=True)

print(data.head())




                                                text
0  moonchild difficult movie categorise easiest t...
1  beginning movie give feeling director trying p...
2  found movie thoughtprovoking ambiguity refresh...
3  thirty year ago author numa sadoul published b...
4  best movie ever seen ive seen movie dutch tele...


In [None]:
#preprocess the training data
print(data.shape)
print(data.loc[0, 'text'])

# set seed words
pos_seed_words = ['good', 'great', 'excellent', 'amazing', 'awesome', 'fantastic', 'terrific', 'wonderful', 'superb', 'brilliant']
neg_seed_words = ['bad', 'terrible', 'crap', 'useless', 'hate', 'horrible', 'awful', 'worst', 'boring', 'disgusting']

num_pos = 0
num_neg = 0

#I have currently set the automatic sentiment analysis to be based on the number of seed words in the review
# if there are 4 more positive seed words than negative seed words, the review is positive and vice versa
# this gives about 1000 seeds and 3 takes it up to 2000
diff_threshold = 4

# create a target variable
y = np.zeros(data.shape[0])
for i in range(data.shape[0]):
    pos_count = 0
    neg_count = 0
    for word in data.loc[i, 'text'].split():
        if word in pos_seed_words:
            pos_count += 1
        if word in neg_seed_words:
            neg_count += 1
    if pos_count - neg_count > diff_threshold:
        y[i] = 1
        num_pos += 1
        # print(i, "positive")
    elif neg_count - pos_count > diff_threshold:
        y[i] = -1
        num_neg += 1
        # print(i, "negative")

print("total positive reviews:", num_pos)
print("total negative reviews:", num_neg)

# add the target variable to the data
data['sentiment'] = y


(25000, 2)
moonchild difficult movie categorise easiest think several snapshot life two central character fact character member street gang set multicultural city near future one vampire preclude moment like people one place movie different anything else ive ever heard doesnt get wrapped fact one main character vampire something dealt like problem way character interact surprisingly realistic embarrassing relative trick meant look cool dont work leaf film lovely sense taking seriously part area really stood language fictional city mallepa contains various cultural group character speak language would expected speak japanese gang member speak japanese chinese talking character chinese descent possibly amusing exchange involves australian conducted english actor four arguably main character three separate mother tongue speak varying level others language quite feat movie made suppose brings lead actor much made fact movie star two japans biggest rockstars gackt hyde well taiwanese supers

In [None]:
# print the first positive review
print(data.loc[y == 1, 'text'].iloc[0])

# test seed words to make sure they preprocess correctly
pos_sentence = ' '.join(word for word in pos_seed_words)
neg_sentence = ' '.join(word for word in neg_seed_words)

print(pos_sentence)
print(neg_sentence)

preprocess_text(pos_sentence)
preprocess_text(neg_sentence)

print(pos_sentence)
print(neg_sentence)

american paris integrated musical meaning song dance blend perfectly story film inspired 1928 orchestral composition george gershwin story film interspersed showstopping dance number choreographed gene kelly set popular gershwin tune songs music include got rhythm wonderful love stay set new standard subgenre known songbook musical dozen gershwin tune buried underscore climax american paris ballet 18 minute dance featuring kelly caron set gershwins american paris featuring impressionistic period daydream style various painter one longest uninterrupted dance sequence hollywood film ballet alone cost half million dollar staggering sum time funny think work art born pool game film producer arthur freed singin rain wizard oz town meet st louis band wagon ira gershwin freeds idea buy title could use film paris gershwins idea would use gershwin music original cast cyd charisse discovered pregnant shooting began major reason gene kelly suggested leslie caron female lead felt movie needed real

In [None]:
# create decision tree usingt seed sete
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()


# create a decision tree classifier
clf = DecisionTreeClassifier(max_depth=5)

max_iterations = 10
conf_thresh = 0.9

 # separate seed set from unlabelled set
seed_set = data[data['sentiment'] != 0]
unlabelled_set = data[data['sentiment'] == 0]

X = seed_set['text']
y = seed_set['sentiment']

for i in range(max_iterations):

    # vectorize the text
    vector_X = vectorizer.fit_transform(X)
    unlabelled_X = vectorizer.transform(unlabelled_set['text'])

    # train classifier
    clf.fit(vector_X, y)

    # predict labels for unlabelled set
    y_pred = clf.predict(unlabelled_X)

    # get confidence scores
    conf_scores = clf.predict_proba(unlabelled_X)
    print("scores", conf_scores) # this is the confidence score for each class and if it is 0, 1 it is determining the class immediately

    # get indices of high confidence predictions
    high_conf_indices = np.where(np.max(conf_scores, axis=1) > conf_thresh)[0]
    print("Number of high confidence predictions:", len(high_conf_indices))

    if len(high_conf_indices) == 0:
        print("No high confidence predictions left", i)
        break

    # add high confidence predictions to seed set
    X = np.concatenate((X, unlabelled_set.iloc[high_conf_indices]['text']))
    y = np.concatenate((y, y_pred[high_conf_indices]))
    # remove high confidence predictions from unlabelled set
    unlabelled_set = unlabelled_set.drop(unlabelled_set.index[high_conf_indices])

    if (unlabelled_set.shape[0] == 0):
        print("No more unlabelled data left")
        break



scores [[0.00926641 0.99073359]
 [1.         0.        ]
 [0.02290076 0.97709924]
 ...
 [0.00926641 0.99073359]
 [0.00926641 0.99073359]
 [1.         0.        ]]
Number of high confidence predictions: 22633
scores [[0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.2 0.8]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.8 0.2]
 [0.2 0.8]
 [0.8 0.2]
 [0.8 0.2]
 [0.