In [2]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load positive and negative reviews
with open('corpus/positive-reviews.txt', 'r') as f:
    positive_reviews = f.readlines()

with open('corpus/negative-reviews.txt', 'r') as f:
    negative_reviews = f.readlines()



In [3]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

# Initialize tools
stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def preprocess_text(text, use_stemming=True, use_lemmatization=False):
    """
    Preprocess a single string of text.

    Args:
    text (str): Raw text input.
    use_stemming (bool): Whether to apply stemming.
    use_lemmatization (bool): Whether to apply lemmatization.

    Returns:
    str: Cleaned and preprocessed text.
    """
    # 1. Lowercase
    text = text.lower()

    # 2. Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)

    # 3. Tokenize
    tokens = word_tokenize(text)

    # 4. Remove stopwords
    tokens = [word for word in tokens if word not in stop_words]

    # 5. Apply stemming or lemmatization
    if use_stemming:
        tokens = [stemmer.stem(word) for word in tokens]
    elif use_lemmatization:
        tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # 6. Rejoin tokens
    processed_text = " ".join(tokens)
    return processed_text

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/rattanak/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/rattanak/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/rattanak/nltk_data...


In [4]:
#positive_reviews = preprocess_text(positive_reviews)

import nltk
nltk.download('punkt_tab')
#negative_reviews = preprocess_text(negative_reviews)

# Apply the preprocessing to each review within the lists
positive_reviews = [preprocess_text(review) for review in positive_reviews]
negative_reviews = [preprocess_text(review) for review in negative_reviews]
positive_reviews
negative_reviews

[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/rattanak/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


['imag qualiti good brand cheap feel bodi',
 'noth',
 'black text could better ink run kind fast',
 'thing wont work',
 'display graini small keypad text messag poor vibrat function',
 'unfriendli user interfac alot featur qwest servic',
 'serial connect optic zoom bulki',
 'nois fact broke',
 'flip button difficult press time',
 'voic qualiti patchi web servic patchi',
 'digit camera buy recharg batteri adaptor',
 'size price stylu error',
 'view panel',
 'hassl hassl hassl hassl',
 'screen color phone look best',
 'fabric fade wheel squeak doesnt go bump easili',
 'use alot ink cartridg expensivenot fast print',
 'lag connect batteri life',
 'readi carri floppi',
 'get good red eye touchup con',
 'ink run fast ink cartridg pricey',
 'realli need second batteri',
 'poor poor qualiti flimsi loos flip buggi softwar',
 'overpr get flimsi antenna',
 'price high',
 'limit disk space',
 '8m memori stick hold still use moon light shot',
 'power hog micro size fumbl',
 'what',
 'hard travel',

In [5]:
# Split data into training and test sets
pos_train, pos_test = train_test_split(positive_reviews, test_size=0.2)
neg_train, neg_test = train_test_split(negative_reviews, test_size=0.2)

In [6]:
print(len(pos_train))
print(len(pos_test))
print(len(neg_train))
print(len(neg_test))

16000
4000
16000
4000


In [7]:
pos_train[0].split()

['great', 'camera']

In [8]:
train_data = [(review.strip(), 1) for review in pos_train] + [(review.strip(), 0) for review in neg_train]
test_data = [(review.strip(), 1) for review in pos_test] + [(review.strip(), 0) for review in neg_test]

In [9]:
print(len(train_data))
print(len(test_data))

32000
8000


In [10]:
# Load positive and negative words
with open('corpus/positive-words.txt', 'r') as f:
    positive_words = set(word.strip() for word in f if word.strip() and not word.startswith(';'))

with open('corpus/negative-words.txt', 'r') as f:
    negative_words = set(word.strip() for word in f if word.strip() and not word.startswith(';'))

In [11]:
# prompt: Preprocess positive and negative words

# Preprocess positive and negative words
positive_words = [preprocess_text(word) for word in positive_words]
negative_words = [preprocess_text(word) for word in negative_words]

In [12]:
print(len(positive_words))
print(len(negative_words))

2006
4780


In [13]:
# Feature extraction
from sklearn import preprocessing
def extract_features(review):

    words = review.split()
    #words = preprocess_text(words)

    positive_count = sum(1 for word in words if word in positive_words)
    negative_count = sum(1 for word in words if word in negative_words)

    #positive_count = sum(1 for word in words if word in lemmatized_positive_words)
    #negative_count = sum(1 for word in words if word in lemmatized_negative_words)

    feature_vector = {
        'positive_word_count': sum(1 for word in words if word in positive_words),
        'negative_word_count': sum(1 for word in words if word in negative_words),
        #'positive_word_count': sum(1 for word in words if word in lemmatized_positive_words),
        #'negative_word_count': sum(1 for word in words if word in lemmatized_negative_words),
        'contains_no': 1 if 'no' in review else 0,
        'pronoun_count': sum(1 for word in words if word.lower() in {'i', 'me', 'my', 'you', 'your'}),
        'contains_exclamation': 1 if '!' in review else 0,
        'log_review_length': math.log(len(words) + 1),  # Adding 1 to avoid log(0)
        'average_word_length': sum(len(word) for word in words) / len(words) if words else 0, #additional 1
        #'uppercase_word_count': sum(1 for word in words if word.isupper()), #additional 2
    }
    return feature_vector

In [14]:
# Convert data to feature matrices
train_X = pd.DataFrame([extract_features(review) for review, _ in train_data])
train_y = np.array([label for _, label in train_data])

test_X = pd.DataFrame([extract_features(review) for review, _ in test_data])
test_y = np.array([label for _, label in test_data])

In [14]:
### logging
train_X

Unnamed: 0,positive_word_count,negative_word_count,contains_no,pronoun_count,contains_exclamation,log_review_length,average_word_length
0,1,0,0,0,0,1.609438,5.750000
1,1,1,0,0,0,1.609438,6.250000
2,2,0,0,0,0,1.609438,6.500000
3,0,0,0,0,0,1.609438,6.750000
4,2,0,0,0,0,1.945910,4.166667
...,...,...,...,...,...,...,...
31995,0,0,0,0,0,1.098612,4.000000
31996,1,0,1,0,0,1.098612,6.000000
31997,0,0,0,0,0,2.197225,5.250000
31998,0,1,0,0,0,1.098612,6.500000


In [15]:
# Train multiple classifiers
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

In [16]:
results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(train_X, train_y)

    # Test the model
    predictions = model.predict(test_X)
    accuracy = accuracy_score(test_y, predictions)
    results[model_name] = accuracy

# Display results
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")

# Save results to a report
report = """
Text Classification Report

Features:
- Count of positive words
- Count of negative words
- Binary feature: whether the review contains the word 'no'
- Count of 1st and 2nd pronouns ('I', 'me', 'my', 'you', 'your')
- Binary feature: whether the review contains '!'
- Logarithm of the review length

Models Implemented:
- Logistic Regression
- Naive Bayes
- Random Forest

Results:
"""
for model_name, accuracy in results.items():
    report += f"{model_name}: {accuracy:.4f}\n"

with open('classification_report.txt', 'w') as f:
    f.write(report)

Logistic Regression: 0.8031
Naive Bayes: 0.8019
Random Forest: 0.7963


In [17]:
# Predict for an outside review
def predict_review(review, model_name='Logistic Regression'):
    # Extract features from the input review
    review_features = pd.DataFrame([extract_features(review)])

    # Get the selected model
    model = models.get(model_name)
    if not model:
        raise ValueError(f"Model {model_name} not found.")

    # Predict sentiment (1 = Positive, 0 = Negative)
    prediction = model.predict(review_features)[0]
    return "Positive" if prediction == 1 else "Negative"

# Example usage
outside_review = "The product is amazing and works perfectly!"
predicted_sentiment = predict_review(outside_review)
print(f"Review: {outside_review}\nPredicted Sentiment: {predicted_sentiment}")

Review: The product is amazing and works perfectly!
Predicted Sentiment: Positive


In [18]:
# Predict for an outside review
def predict_review(review, model_name='Logistic Regression'):
    # Extract features from the input review
    review_features = pd.DataFrame([extract_features(review)])

    # Get the selected model
    model = models.get(model_name)
    if not model:
        raise ValueError(f"Model {model_name} not found.")

    # Predict sentiment (1 = Positive, 0 = Negative)
    prediction = model.predict(review_features)[0]
    return "Positive" if prediction == 1 else "Negative"

# Example usage
outside_review = "You are not his father"
predicted_sentiment = predict_review(outside_review)
print(f"Review: {outside_review}\nPredicted Sentiment: {predicted_sentiment}")

Review: You are not his father
Predicted Sentiment: Negative


In [19]:
# Predict sentiment for challenge data
with open('corpus/challenge_data.txt', 'r') as f:
    challenge_reviews = f.readlines()

# challenge_reviews
selected_model = 'Random Forest'  # Choose the best model
# predictions = [str(predict_review(review.strip(), selected_model)) for review in challenge_reviews]
predictions = [str(1) if predict_review(review.strip(), selected_model) == "Positive" else str(0) for review in challenge_reviews]

# Create the output file
output = ''.join(predictions)
with open('random_forest_predictions.txt', 'w') as f:
    f.write(output)

print("Predictions saved to predictions.txt")



Predictions saved to predictions.txt


In [22]:
# Predict sentiment for challenge data
with open('corpus/challenge_data.txt', 'r') as f:
    challenge_reviews = f.readlines()

# challenge_reviews
selected_model = 'Random Forest'  # Choose the best model
# predictions = [str(predict_review(review.strip(), selected_model)) for review in challenge_reviews]
predictions = [str(1) if predict_review(review.strip(), selected_model) == "Positive" else str(0) for review in challenge_reviews]

# Create the output file
output = ''.join(predictions)
with open('random_forest_predictions.txt', 'w') as f:
    f.write(output)

print("Predictions saved to predictions.txt")

Predictions saved to predictions.txt
