In [1]:
import os
import random
import nltk
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.util import ngrams
from sentiment_read_subjectivity import readSubjectivity
from nltk.stem import PorterStemmer

# Path to the subjectivity lexicon file.
lexicon_path = "/Users/subhiksha/Documents/NLP/subjclueslen1-HLTEMNLP05.tff"
data = list()

# Words that signal negation.  processkaggle() keeps these in the text even
# when they also appear in the stop-word list.  The second group holds the
# stems left over when contractions such as "isn't" are split.
negationwords = [
    'no', 'not', 'never', 'none', 'nowhere', 'nothing', 'noone',
    'rather', 'hardly', 'scarcely', 'rarely', 'seldom', 'neither', 'nor',
] + [
    'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn', 'hasn', 'haven',
    'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan', 'shouldn', 'wasn',
    'weren', 'won', 'wouldn',
]
     
def get_wordnet_pos(treebank_tag):
    """Translate a POS tag into the matching WordNet part-of-speech constant.

    The tag is matched on its leading letter: 'J' -> adjective, 'V' -> verb,
    'N' -> noun, 'R' -> adverb.  Any other tag (including an empty string)
    falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, wordnet_pos in prefix_to_pos:
        if treebank_tag.startswith(prefix):
            return wordnet_pos
    # Unrecognised tags are treated as nouns.
    return wordnet.NOUN

def _read_phrases(filepath, flag):
    """Read the phrase column (plus the label column for training data) from a TSV file.

    Lines starting with 'Phrase' are treated as the header row and skipped.
    For ``flag == 'train'`` each item is a ``(phrase, label)`` tuple taken from
    the third and fourth tab-separated fields; rows with fewer than four fields
    are skipped.  Otherwise each item is the last field of the line.
    """
    phrasedata = []
    with open(filepath, 'r') as f:
        for line in f:
            if line.startswith('Phrase'):
                continue
            # Drop only the line terminator.  A bare strip() would also remove
            # a trailing tab, so a row whose last field is empty would lose
            # that field and the preceding column would be read in its place.
            parts = line.rstrip('\r\n').split('\t')
            if flag == 'train':
                if len(parts) < 4:
                    # Blank or truncated row: there is no label to train on.
                    continue
                phrasedata.append((parts[2], parts[3].strip()))
            else:
                phrasedata.append(parts[-1])
    return phrasedata


def processkaggle(dirPath, flag, samples_per_class=4000, test_limit=10000, seed=None):
    """Load and preprocess phrases from ``train.tsv`` or ``test.tsv`` in ``dirPath``.

    Each phrase is tokenized, POS-tagged and lemmatized with NLTK.  Tokens that
    are stop words (except the negation words in ``negationwords``) or consist
    only of punctuation are dropped.  The NLTK data used by the tokenizer,
    tagger, stop-word list and WordNet lemmatizer must be installed
    (e.g. ``nltk.download('stopwords')``).

    The working directory is left untouched; the file is opened via
    ``os.path.join(dirPath, ...)`` so relative ``dirPath`` values keep working
    across repeated calls.

    Args:
        dirPath: Directory containing ``train.tsv`` / ``test.tsv``.
        flag: ``'train'`` or ``'test'``.
        samples_per_class: For training data, keep at most this many phrases
            (the first ones in file order) for each label '0'..'4'.
        test_limit: For test data, keep at most this many phrases.
        seed: Optional seed for the training shuffle.  ``None`` (the default)
            shuffles with the global ``random`` state, as before.

    Returns:
        A list of strings.  Training entries are ``"tok1,tok2,...,label"``
        (``",label"`` when no token survives); test entries are
        ``"tok1,tok2,..."``.

    Raises:
        ValueError: If ``flag`` is neither ``'train'`` nor ``'test'``.
    """
    if flag not in ('train', 'test'):
        raise ValueError("flag must be 'train' or 'test', got %r" % (flag,))

    filename = 'train.tsv' if flag == 'train' else 'test.tsv'
    phrasedata = _read_phrases(os.path.join(dirPath, filename), flag)

    if flag == 'train':
        # Balance the classes: take the first `samples_per_class` phrases of
        # each label (slicing already copes with smaller classes), then shuffle.
        labels = ['0', '1', '2', '3', '4']
        phraselist = []
        for label in labels:
            class_data = [item for item in phrasedata if item[1] == label]
            phraselist.extend(class_data[:samples_per_class])
        if seed is None:
            random.shuffle(phraselist)
        else:
            # A private generator keeps the global random state untouched.
            random.Random(seed).shuffle(phraselist)
    else:
        phraselist = phrasedata[:test_limit]

    nltkstopwords = nltk.corpus.stopwords.words('english')
    morestopwords = ['could', 'would', 'might', 'must', 'need', 'sha', 'wo', 'y', "'s", "'d", "'ll", "'t", "'m", "'re", "'ve", "n't"]
    # Built once (it does not depend on the phrase) and kept as a set so the
    # per-token membership test is O(1).  Negation words are never filtered.
    stop_set = set(nltkstopwords + morestopwords) - set(negationwords)
    punctuation = set(string.punctuation)
    lemmatizer = WordNetLemmatizer()

    output_lines = []
    for item in phraselist:
        phrase, label = item if flag == 'train' else (item, None)
        tagged = nltk.pos_tag(nltk.word_tokenize(phrase))
        lemmatized_tokens = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in tagged]
        filtered_tokens = [
            word for word in lemmatized_tokens
            if word.lower() not in stop_set
            and not all(char in punctuation for char in word)
        ]
        line = ','.join(filtered_tokens)
        if flag == 'train':
            line = line + ',' + label
        output_lines.append(line)

    return output_lines

  
# Directory that holds train.tsv and test.tsv.
path = "/Users/subhiksha/Documents/NLP/NLP project/FinalProjectData/kagglemoviereviews/corpus"

# Keep only training lines whose first comma-separated field is non-blank;
# this drops entries such as ",3" where no token survived preprocessing.
train_data = [
    entry
    for entry in processkaggle(path, 'train')
    if ',' in entry and entry.partition(',')[0].strip()
]

test_data = processkaggle(path, 'test')


In [12]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score


# Each preprocessed line is "tok1,tok2,...,label": the last field is the
# class label and everything before it is the token sequence.
split_rows = [line.split(',') for line in train_data]
train_documents = [' '.join(fields[:-1]) for fields in split_rows]
train_labels = [int(fields[-1]) for fields in split_rows]

# Unigram + bigram count features.
vectorizer = CountVectorizer(ngram_range=(1, 2))
train_vect = vectorizer.fit_transform(train_documents)

# 5-fold cross-validated score of a logistic-regression baseline.
model = LogisticRegression(random_state=42, max_iter=1000)
cv_scores = cross_val_score(model, train_vect, train_labels, cv=5)

print("CV Scores:", cv_scores)
print("Average CV Score:", cv_scores.mean())



CV Scores: [0.6969697  0.68863636 0.68888889 0.68982066 0.68931548]
Average CV Score: 0.6907262189972471


In [4]:
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

# Split each "tok1,...,label" line into its document text and integer label.
train_documents, train_labels = [], []
for row in train_data:
    *row_tokens, row_label = row.split(',')
    train_documents.append(' '.join(row_tokens))
    train_labels.append(int(row_label))

# Unigram + bigram count features.
vectorizer = CountVectorizer(ngram_range=(1, 2))
train_vect = vectorizer.fit_transform(train_documents)

# Seven log-spaced values of C from 1e-3 to 1e3, crossed with five solvers.
param_grid = dict(
    C=np.logspace(-3, 3, 7),
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
)
model = GridSearchCV(
    estimator=LogisticRegression(max_iter=5000),
    param_grid=param_grid,
    scoring='accuracy',
    cv=5,
)
model.fit(train_vect, train_labels)
print("Best parameters:", model.best_params_)
print("Best CV score:", model.best_score_)


Best parameters: {'C': 1.0, 'solver': 'liblinear'}
Best CV score: 0.6911306165979064


In [6]:
# import csv

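# test_documents = [line for line in test_data]
# NOTE: test_data lines carry no trailing label (only training lines do), so the
# [:-1] slice below would drop the last token of every test phrase.
# test_documents = [' '.join(line.split(',')[:-1]) for line in test_data]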

# X_test_vect = vectorizer.transform(test_documents)  # Only transform test data
# y_pred = model.predict(X_test_vect)
# print("Test Predictions:", y_pred)


# output_csv_path = '/Users/subhiksha/Documents/NLP/final_test_predictions.csv'
# with open(output_csv_path, mode='w', newline='') as file:
#     writer = csv.writer(file)
#     writer.writerow(['Phrase', 'Prediction']) 
#     for phrase, prediction in zip(test_documents, y_pred):
#         writer.writerow([phrase, prediction])

# print("Predictions have been saved to:", output_csv_path)