In [70]:
import os
import random
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.util import ngrams
from sentiment_read_subjectivity import readSubjectivity
from nltk.stem import PorterStemmer

# Subjectivity lexicon file; only referenced by the (currently commented-out)
# readSubjectivity call inside processkaggle.
lexicon_path = "/Users/subhiksha/Documents/NLP/subjclueslen1-HLTEMNLP05.tff"
data = []

# Words that must survive stop-word removal: processkaggle filters these out
# of its stop-word collection so negation cues stay in the token stream.
negationwords = [
    # explicit negators / near-negators
    'no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone',
    'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor',
    # contraction stems (what is left of "didn't", "won't", ... once the
    # apostrophe part is split off) -- presumably mirrors entries in the NLTK
    # stop-word list; verify against nltk.corpus.stopwords
    'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
    'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
    'weren', 'won', 'wouldn',
]
     
def get_wordnet_pos(treebank_tag):
    """Translate a Penn Treebank POS tag into the matching WordNet POS constant.

    The tag is matched on its leading letter: ``J`` -> adjective,
    ``V`` -> verb, ``N`` -> noun, ``R`` -> adverb. Any other tag
    (including an empty string) falls back to noun.
    """
    prefix_table = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_table:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def _read_kaggle_rows(filepath, is_train):
    """Parse one Kaggle TSV file into phrases (test) or (phrase, label) pairs (train)."""
    rows = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            # Skip the header row (it starts with 'Phrase') and blank lines.
            if line.startswith('Phrase') or not line.strip():
                continue
            # Remove only the line terminator. A full strip() would also eat the
            # trailing tab of a row whose last column is blank, shifting the
            # columns so that the sentence id gets picked up as the phrase.
            parts = line.rstrip('\r\n').split('\t')
            if is_train:
                rows.append((parts[2], parts[3].strip()))
            else:
                # The phrase is column 2, same position as in the train file.
                rows.append(parts[2] if len(parts) > 2 else '')
    return rows


def _balance_classes(rows, samples_per_class):
    """Keep the first `samples_per_class` rows (file order) of each label '0'..'4'."""
    labels = ['0', '1', '2', '3', '4']
    buckets = {label: [] for label in labels}
    for item in rows:
        bucket = buckets.get(item[1])
        # Rows with a label outside `labels` are ignored.
        if bucket is not None and (samples_per_class is None or len(bucket) < samples_per_class):
            bucket.append(item)
    return [item for label in labels for item in buckets[label]]


def processkaggle(dirPath, flag, samples_per_class=4000, test_limit=10000, seed=None):
    """Load and preprocess the Kaggle movie-review phrases.

    Reads ``train.tsv`` or ``test.tsv`` from ``dirPath``, tokenizes each phrase,
    lemmatizes the tokens using their POS tags, and drops stop words (negation
    words are kept) and punctuation-only tokens.

    Unlike earlier versions this does NOT ``os.chdir`` into ``dirPath``: the
    file path is built with ``os.path.join``, so the process working directory
    is left untouched and repeated calls with a relative ``dirPath`` work.

    Args:
        dirPath: Directory containing ``train.tsv`` / ``test.tsv``.
        flag: ``'train'`` or ``'test'``.
        samples_per_class: Training only. Maximum rows kept per sentiment label
            (first rows in file order). ``None`` keeps every row.
        test_limit: Test only. Number of leading rows to keep; ``None`` keeps all.
        seed: Training only. When given, the balanced rows are shuffled with a
            dedicated ``random.Random(seed)`` for reproducibility; when ``None``
            the global ``random`` state is used, as before.

    Returns:
        A list of strings. Train: ``'tok1,tok2,...,label'``. Test:
        ``'tok1,tok2,...'`` with no trailing label.
        NOTE(review): a token that itself contains a comma would be
        indistinguishable from a separator in this format -- confirm whether
        the tokenizer can emit such tokens for this data.

    Raises:
        ValueError: If ``flag`` is neither ``'train'`` nor ``'test'``.
    """
    if flag not in ('train', 'test'):
        raise ValueError("flag must be 'train' or 'test', got %r" % (flag,))
    is_train = flag == 'train'

    filepath = os.path.join(dirPath, 'train.tsv' if is_train else 'test.tsv')
    phrasedata = _read_kaggle_rows(filepath, is_train)

    if is_train:
        phraselist = _balance_classes(phrasedata, samples_per_class)
        if seed is None:
            random.shuffle(phraselist)
        else:
            random.Random(seed).shuffle(phraselist)
    else:
        phraselist = phrasedata[:test_limit]

    #nltk.download('stopwords')
    nltkstopwords = nltk.corpus.stopwords.words('english')
    morestopwords = ['could', 'would', 'might', 'must', 'need', 'sha', 'wo', 'y', "'s", "'d", "'ll", "'t", "'m", "'re", "'ve", "n't"]
    # Built once (it does not depend on the phrase) and kept as a set for O(1)
    # membership tests. Negation words are removed so they survive filtering.
    stop_set = set(nltkstopwords + morestopwords) - set(negationwords)
    punctuation = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()
    #pos_list, neutral_list, neg_list = readSubjectivity(lexicon_path)

    output_lines = []
    for item in phraselist:
        if is_train:
            phrase, label = item
        else:
            phrase = item
        tagged = nltk.pos_tag(nltk.word_tokenize(phrase))
        lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged]
        # Drop stop words (case-insensitively) and tokens made only of punctuation.
        filtered_tokens = [
            word for word in lemmatized_tokens
            if word.lower() not in stop_set
            and not all(char in punctuation for char in word)
        ]
        joined = ','.join(filtered_tokens)
        output_lines.append(joined + ',' + label if is_train else joined)

    return output_lines

  
# Directory holding the Kaggle train.tsv / test.tsv files read by processkaggle.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
path= "/Users/subhiksha/Documents/NLP/NLP project/FinalProjectData/kagglemoviereviews/corpus"
# Preprocessed training rows: comma-joined tokens followed by the label.
train_data = processkaggle(path,'train')
# Preprocessed test rows: comma-joined tokens only (no label column).
test_data = processkaggle(path,'test')


In [71]:
# !pip install xgboost
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
import numpy as np

def load_glove_embeddings(path):
    """Load a whitespace-separated GloVe text file into a dict.

    Each non-blank line is expected to be ``word v1 v2 ... vN``.

    Args:
        path: Path to the GloVe ``.txt`` file (read as UTF-8).

    Returns:
        dict mapping each word to a 1-D ``np.float32`` array of its vector.
        If a word occurs more than once, the last occurrence wins.

    Raises:
        ValueError: If a vector component cannot be parsed as a float.
    """
    glove_model = {}
    with open(path, 'r', encoding='utf-8') as file:
        for line in file:
            split_line = line.split()
            # A blank line (e.g. a trailing newline at end of file) has no
            # word to index; skip it instead of raising IndexError.
            if not split_line:
                continue
            word = split_line[0]
            embedding = np.array([float(val) for val in split_line[1:]], dtype=np.float32)
            glove_model[word] = embedding
    print(f"Loaded {len(glove_model)} words.")
    return glove_model

# NOTE(review): absolute, machine-specific path; the file name suggests
# 100-dimensional GloVe vectors -- confirm if a different file is swapped in.
glove_path = '/Users/subhiksha/Downloads/glove.6B/glove.6B.100d.txt'
glove_model = load_glove_embeddings(glove_path)

# Each train_data line is comma-joined tokens followed by the label:
# everything before the last comma becomes a space-separated document ...
documents = [' '.join(line.split(',')[:-1]) for line in train_data]
# ... and the last field is the integer class label.
labels = [int(line.split(',')[-1]) for line in train_data]

def document_vector_glove(document, embeddings):
    """Represent a document as the mean of its word vectors.

    Args:
        document: Whitespace-separated string of tokens.
        embeddings: Mapping from word to a 1-D vector; all vectors are assumed
            to share one length.

    Returns:
        1-D array holding the element-wise mean over all tokens. Words missing
        from ``embeddings`` contribute a zero vector (they still count towards
        the mean). A document with no tokens yields an all-zero vector.
    """
    # Take the vector length from the table itself rather than assuming 100,
    # so that tables of other dimensionalities work; an empty table keeps the
    # historical default of 100.
    dim = len(next(iter(embeddings.values()))) if embeddings else 100
    oov_vector = np.zeros(dim)
    word_vectors = [embeddings.get(word, oov_vector) for word in document.split()]
    if not word_vectors:
        return np.zeros(dim)
    return np.mean(word_vectors, axis=0)

# One averaged GloVe vector per training document.
X_glove = np.array([document_vector_glove(doc, glove_model) for doc in documents])

# This model, fitted on ALL training rows, is the one a later cell uses to
# predict the test set.
classifier = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
classifier.fit(X_glove, labels) 

# 5-fold shuffled cross-validation over the same features and labels.
# NOTE(review): per the scikit-learn docs cross_val_score fits clones of the
# estimator, so the fit above should be unaffected -- confirm.
kf = KFold(n_splits=5, random_state=42, shuffle=True)  
scores = cross_val_score(classifier, X_glove, labels, cv=kf)

print("Cross-Validation Accuracy Scores:", scores)
print("Mean CV Accuracy:", scores.mean())


Loaded 400000 words.
Cross-Validation Accuracy Scores: [0.63875 0.65825 0.643   0.64575 0.6515 ]
Mean CV Accuracy: 0.64745


In [75]:
import csv

# Lines produced by processkaggle(..., 'test') contain tokens only -- there is
# no trailing label field -- so every comma-separated field is kept. Dropping
# the last field here (as is done for the labelled training lines) would
# silently discard the final token of every test phrase.
test_documents = [' '.join(line.split(',')) for line in test_data]

X_test = np.array([document_vector_glove(doc, glove_model) for doc in test_documents])
y_pred = classifier.predict(X_test)

output_csv_path = '/Users/subhiksha/Documents/NLP/final_test_predictions.csv'
with open(output_csv_path, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Phrase', 'Prediction']) 
    # 'Phrase' holds the preprocessed (space-joined token) form of each phrase.
    for phrase, prediction in zip(test_documents, y_pred):
        writer.writerow([phrase, prediction])

print("Predictions have been saved to:", output_csv_path)

Predictions have been saved to: /Users/subhiksha/Documents/NLP/final_test_predictions.csv


In [49]:
# import fasttext
# import warnings
# from sklearn.exceptions import ConvergenceWarning
# import numpy as np
# from sklearn.model_selection import cross_val_score, KFold
# from sklearn.linear_model import LogisticRegression
# from sklearn.naive_bayes import GaussianNB
# from sklearn.ensemble import RandomForestClassifier



# # Train a FastText model
# model = fasttext.train_unsupervised('/Users/subhiksha/Documents/NLP/train_data.txt', model='skipgram')

# # Function to convert document to mean of word vectors
# def document_vector(doc):
#     word_vectors = [model.get_word_vector(word) for word in doc]
#     return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model.get_dimension())

# # Extract documents and labels

# documents = [line.split(',')[:-1] for line in train_data]  
# labels = [int(line.split(',')[-1]) for line in train_data]  

# X = np.array([document_vector(doc) for doc in documents])
# y = np.array(labels)

# #classifier = RandomForestClassifier(n_estimators=100, max_depth=None, random_state=42)
# with warnings.catch_warnings():
#     warnings.filterwarnings("ignore", category=ConvergenceWarning,
#                             module="sklearn")
#     classifier = LogisticRegression(max_iter=1000, solver='lbfgs', random_state=42)
#     classifier.fit(X_train, y_train)


# kf = KFold(n_splits=5, random_state=42, shuffle=True)  

# scores = cross_val_score(classifier, X, y, cv=kf)

# print("Cross-validation scores:", scores)
# print("Average accuracy:", np.mean(scores))


In [73]:
# Sanity check: the feature matrix and the label list must have the same length.
print("Length of X_glove:", len(X_glove))
print("Length of labels (y):", len(labels))

Length of X_glove: 20000
Length of labels (y): 20000


In [None]:
# model = fasttext.train_unsupervised('/Users/subhiksha/Documents/NLP/train_data.txt', model='skipgram')

# def document_vector(doc):
#     word_vectors = [model.get_word_vector(word) for word in doc]
#     return np.mean(word_vectors, axis=0) if word_vectors else np.zeros(model.get_dimension())


# X_fasttext = np.array([document_vector(doc) for doc in documents])
# y = np.array(labels)

In [80]:
# Peek at the first few preprocessed training rows (comma-joined tokens, then label).
train_data[:5]

['not,mention,inappropriate,wildly,undeserved,0',
 'terrific,special,effect,4',
 'sensational,true-crime,4',
 'make,movie,depth,man,lack,1',
 'cinematic,shell,game,2']

In [81]:
# Destination for the preprocessed training rows.
filename = "/Users/subhiksha/Documents/NLP/train.csv"  

# Write one two-column CSV row per training line.
# NOTE(review): the second column is headed 'Prediction' but holds the label
# taken from train_data -- confirm whether downstream readers rely on that name.
with open(filename, mode='w', newline='') as file:
    writer = csv.writer(file)
    writer.writerow(['Phrase', 'Prediction']) 
    # rpartition splits on the LAST comma: everything before it is the
    # (still comma-joined) token string, everything after it is the label.
    writer.writerows(
        (tokens, label)
        for tokens, _, label in (row.rpartition(',') for row in train_data)
    )

print(f"Data has been written to {filename}")

Data has been written to /Users/subhiksha/Documents/NLP/train.csv
