In [9]:
import os
import random
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.util import ngrams
from sentiment_read_subjectivity import readSubjectivity
from nltk.stem import PorterStemmer

lexicon_path = "/Users/subhiksha/Documents/NLP/subjclueslen1-HLTEMNLP05.tff"
data = []
negationwords = ['no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone', 'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor']
negationwords.extend(['ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn', 'weren', 'won', 'wouldn'])
     
def get_wordnet_pos(treebank_tag):
    if treebank_tag.startswith('J'):
        return wordnet.ADJ
    elif treebank_tag.startswith('V'):
        return wordnet.VERB
    elif treebank_tag.startswith('N'):
        return wordnet.NOUN
    elif treebank_tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN

def processkaggle(dirPath,flag):
    os.chdir(dirPath)
    if flag=='train':
        filepath = './train.tsv'
    else:
        filepath = './test.tsv'
    with open(filepath, 'r') as f:   
        phrasedata = []
        for line in f:
            if not line.startswith('Phrase'):
                line = line.strip()
                parts = line.split('\t')           
                if flag == 'train':
                    phrasedata.append((parts[2], parts[3])) 
                else:
                    phrasedata.append(parts[-1])
    if flag=='train':
        samples_per_class = 4000
        balanced_data = []
        labels = ['0', '1', '2', '3', '4']   
        for label in labels:
            class_data = [item for item in phrasedata if item[1] == label]
            if len(class_data) >= samples_per_class:
                class_sample = class_data[:samples_per_class]
            else:
                class_sample = class_data 
            balanced_data.extend(class_sample)
            
        random.shuffle(balanced_data)
        
        phraselist = balanced_data
    elif flag=='test':
        phraselist = phrasedata[:10000]
    
    #nltk.download('stopwords')
    nltkstopwords = nltk.corpus.stopwords.words('english')
    morestopwords = ['could', 'would', 'might', 'must', 'need', 'sha', 'wo', 'y', "'s", "'d", "'ll", "'t", "'m", "'re", "'ve", "n't"]
    punctuation = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    #pos_list, neutral_list, neg_list = readSubjectivity(lexicon_path)
    
    output_lines = []
    for item in phraselist:
        if flag == 'train':
            phrase, label = item
        else:
            phrase = item
        tokens = nltk.word_tokenize(phrase)
        stopwords = set(nltkstopwords+morestopwords) 
        stopwords = [word for word in stopwords if word not in negationwords]
        tagged = nltk.pos_tag(tokens)
        lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged]
        filtered_tokens = [word for word in lemmatized_tokens if word.lower() not in stopwords and not all(char in punctuation for char in word)]
        if flag == 'train':  
            output_lines.append(','.join(filtered_tokens) + ',' + label)
        else:  
            output_lines.append(','.join(filtered_tokens))
        
    return output_lines

  
path= "/Users/subhiksha/Documents/NLP/NLP project/FinalProjectData/kagglemoviereviews/corpus"
train_data = processkaggle(path,'train')
test_data = processkaggle(path,'test')


In [10]:
sldict = readSubjectivity(lexicon_path)
processed_train_data = [(doc[:-2].split(','), doc[-1]) for doc in train_data]

def document_features(document, word_features, negationwords, SL):
    document_words = document
    features = {}
    for word in word_features:
        features['V_{}'.format(word)] = (word in document_words)
    # count variables for the 4 classes of subjectivity
    weakPos = 0
    strongPos = 0
    weakNeg = 0
    strongNeg = 0
    for word in document_words:
        if word in SL:
            strength, posTag, isStemmed, polarity = SL[word]
            if strength == 'weaksubj' and polarity == 'positive':
                weakPos += 1
            if strength == 'strongsubj' and polarity == 'positive':
                strongPos += 1
            if strength == 'weaksubj' and polarity == 'negative':
                weakNeg += 1
            if strength == 'strongsubj' and polarity == 'negative':
                strongNeg += 1
    features['positivecount'] = weakPos + (2 * strongPos)
    features['negativecount'] = weakNeg + (2 * strongNeg)
            
    return features

# all_words_list = [word for document in train_data for word in document]
# new_all_words = nltk.FreqDist(all_words_list)
# new_word_features = list(new_all_words)[:2000] 


# common_words_train = set(word for document in train_data for word in document[0])
# train_featuresets = [(document_features(doc, new_word_features,negationwords,sldict), doc[-1]) for doc in train_data]


# Build a list of all words in all documents
all_words_list = [word for document, label in processed_train_data for word in document]
all_words_freq = nltk.FreqDist(all_words_list)
word_features = list(all_words_freq)[:2000]  # top 2000 words as features

# Generate features for each document
train_featuresets = [(document_features(doc, word_features, negationwords, sldict), label) for doc, label in processed_train_data]


In [11]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC  
from sklearn.ensemble import RandomForestClassifier


def cross_validation_PRF(num_folds, featuresets):
    subset_size = len(featuresets) // num_folds
    accuracy_list = []

    for i in range(num_folds):
        test_this_round = featuresets[i * subset_size:][:subset_size]
        train_this_round = featuresets[:i * subset_size] + featuresets[(i + 1) * subset_size:]

        # logistic_regression = SklearnClassifier(LogisticRegression(random_state=42, max_iter=300))
        # logistic_regression.train(train_this_round)
        # accuracy = nltk.classify.accuracy(logistic_regression, test_this_round)
        
        # classifier = nltk.NaiveBayesClassifier.train(train_this_round)
        # accuracy = nltk.classify.accuracy(classifier, test_this_round)

        rf_classifier = SklearnClassifier(RandomForestClassifier(n_estimators=100, random_state=42))
        rf_classifier.train(train_this_round)
        accuracy = nltk.classify.accuracy(rf_classifier, test_this_round)

        accuracy_list.append(accuracy)
        print(f"Fold {i+1}: Accuracy = {accuracy}")

    avg_accuracy = sum(accuracy_list) / num_folds
    print(f"Average Accuracy across all folds: {avg_accuracy}")

cross_validation_PRF(5,train_featuresets)


Fold 1: Accuracy = 0.64275
Fold 2: Accuracy = 0.6385
Fold 3: Accuracy = 0.62975
Fold 4: Accuracy = 0.6475
Fold 5: Accuracy = 0.65575
Average Accuracy across all folds: 0.6428499999999999


In [4]:
train_featuresets[:1]

[({'V_film': False,
   'V_movie': False,
   'V_make': False,
   'V_one': False,
   'V_not': False,
   'V_like': False,
   'V_bad': False,
   'V_story': False,
   'V_good': False,
   'V_character': False,
   'V_-RRB-': False,
   'V_funny': True,
   'V_-LRB-': False,
   'V_comedy': False,
   'V_time': False,
   'V_performance': False,
   'V_see': False,
   'V_little': False,
   'V_feel': False,
   'V_way': False,
   'V_go': False,
   'V_life': False,
   'V_much': False,
   'V_get': False,
   'V_work': False,
   'V_best': False,
   'V_action': False,
   'V_love': False,
   'V_year': False,
   'V_come': False,
   'V_well': False,
   'V_even': False,
   'V_audience': False,
   'V_no': False,
   'V_': False,
   'V_take': False,
   'V_director': False,
   'V_u': False,
   'V_thing': False,
   'V_enough': False,
   'V_look': False,
   'V_great': False,
   'V_minute': False,
   'V_actor': False,
   'V_give': False,
   'V_ever': False,
   'V_never': False,
   'V_end': False,
   'V_really': False