In [1]:
import pandas as pd
import numpy as np
import math
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load positive and negative reviews
with open('corpus/positive-reviews.txt', 'r') as f:
    positive_reviews = f.readlines()

with open('corpus/negative-reviews.txt', 'r') as f:
    negative_reviews = f.readlines()

# Split data into training and test sets
pos_train, pos_test = train_test_split(positive_reviews, test_size=0.2)
neg_train, neg_test = train_test_split(negative_reviews, test_size=0.2)

In [2]:
pos_train[0].split()

['many', 'partnerships']

In [3]:
train_data = [(review.strip(), 1) for review in pos_train] + [(review.strip(), 0) for review in neg_train]
test_data = [(review.strip(), 1) for review in pos_test] + [(review.strip(), 0) for review in neg_test]

In [4]:
train_data

[('many partnerships', 1),
 ('Amazing print quality', 1),
 ('Super', 1),
 ('Simple to set up and use; easy to change cartridges; versatile', 1),
 ('Good design; cool extra features like speakerphone, USB connectivity,   AIM',
  1),
 ('great photo reproduction', 1),
 ('Easy to use, high-quality photos', 1),
 ('price  amp; style', 1),
 ('Lightweight, durable, compact', 1),
 ('lightweight, maneuverability', 1),
 ('great text prints, decent color, fairly fast', 1),
 ('5 point harness, adjustable handles, compact, light, versatile, durable.',
  1),
 ('Value,style,convenience', 1),
 ('Low price for a VERY GOOD QUALITY', 1),
 ('Fast printing Balck and Color alike; Lots of options; Great Camera Prints',
  1),
 ('Very sturdy. Comes with a infant car seat and has a compartment for parents.',
  1),
 ('Very low price, good quality color, fast and quiet operation', 1),
 ('Small, cute faceplate and luminiscent backlight. Volume adjusts during calls.',
  1),
 ('Quick to print', 1),
 ('No special atta

In [5]:
# Load positive and negative words
with open('corpus/positive-words.txt', 'r') as f:
    positive_words = set(word.strip() for word in f if word.strip() and not word.startswith(';'))

with open('corpus/negative-words.txt', 'r') as f:
    negative_words = set(word.strip() for word in f if word.strip() and not word.startswith(';'))

In [6]:
# Feature extraction
"""def extract_features(review):
    words = review.split()
    feature_vector = {
        'positive_word_count': sum(1 for word in words if word in positive_words),
        'negative_word_count': sum(1 for word in words if word in negative_words),
        'contains_no': 1 if 'no' in review else 0,
        'pronoun_count': sum(1 for word in words if word.lower() in {'i', 'me', 'my', 'you', 'your'}),
        'contains_exclamation': 1 if '!' in review else 0,
        'log_review_length': math.log(len(words) + 1)  # Adding 1 to avoid log(0)
    }
    return feature_vector"""

# Feature extraction
def extract_features(review):
    words = review.split()

    # positive_count = sum(1 for word in words if word in positive_words)
    # negative_count = sum(1 for word in words if word in negative_words)

    feature_vector = {
        'positive_word_count': sum(1 for word in words if word in positive_words),
        'negative_word_count': sum(1 for word in words if word in negative_words),
        'contains_no': 1 if 'no' in review else 0,
        'pronoun_count': sum(1 for word in words if word.lower() in {'i', 'me', 'my', 'you', 'your'}),
        'contains_exclamation': 1 if '!' in review else 0,
        'log_review_length': math.log(len(words) + 1),  # Adding 1 to avoid log(0)
        'average_word_length': sum(len(word) for word in words) / len(words) if words else 0, #additional 1
        'uppercase_word_count': sum(1 for word in words if word.isupper()), #additional 2
    }
    return feature_vector

In [7]:
# Convert data to feature matrices
train_X = pd.DataFrame([extract_features(review) for review, _ in train_data])
train_y = np.array([label for _, label in train_data])

test_X = pd.DataFrame([extract_features(review) for review, _ in test_data])
test_y = np.array([label for _, label in test_data])

In [8]:
### logging
test_X

Unnamed: 0,positive_word_count,negative_word_count,contains_no,pronoun_count,contains_exclamation,log_review_length,average_word_length,uppercase_word_count
0,2,0,0,0,0,2.079442,5.428571,0
1,0,0,0,0,1,1.386294,5.666667,3
2,1,0,0,0,0,2.484907,5.363636,2
3,0,0,0,0,0,1.098612,11.000000,0
4,0,0,0,0,0,1.098612,5.500000,0
...,...,...,...,...,...,...,...,...
7995,0,0,1,0,0,2.302585,4.888889,0
7996,0,1,0,0,0,1.098612,9.000000,0
7997,1,0,0,0,0,2.302585,4.000000,0
7998,0,0,1,0,0,1.098612,3.500000,0


### Train multiple classifiers

In [9]:
# Train multiple classifiers
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42)
}

In [None]:
results = {}
for model_name, model in models.items():
    # Train the model
    model.fit(train_X, train_y)
    
    # Test the model
    predictions = model.predict(test_X)
    accuracy = accuracy_score(test_y, predictions)
    results[model_name] = accuracy

# Display results
for model_name, accuracy in results.items():
    print(f"{model_name}: {accuracy:.4f}")

# Save results to a report
report = """
Text Classification Report

Features:
- Count of positive words
- Count of negative words
- Binary feature: whether the review contains the word 'no'
- Count of 1st and 2nd pronouns ('I', 'me', 'my', 'you', 'your')
- Binary feature: whether the review contains '!'
- Logarithm of the review length
- Average word length
- Uppercase word count

Models Implemented:
- Logistic Regression
- Naive Bayes
- Random Forest

Results:
"""
for model_name, accuracy in results.items():
    report += f"{model_name}: {accuracy:.4f}\n"

with open('classification_report.txt', 'w') as f:
    f.write(report)

Logistic Regression: 0.7566
Naive Bayes: 0.7064
Random Forest: 0.7402


In [11]:
# Predict for an outside review
def predict_review(review, model_name='Logistic Regression'):
    # Extract features from the input review
    review_features = pd.DataFrame([extract_features(review)])
    
    # Get the selected model
    model = models.get(model_name)
    if not model:
        raise ValueError(f"Model {model_name} not found.")
    
    # Predict sentiment (1 = Positive, 0 = Negative)
    prediction = model.predict(review_features)[0]
    return "Positive" if prediction == 1 else "Negative"

# Example usage
outside_review = "The product is amazing and works perfectly!"
predicted_sentiment = predict_review(outside_review)
print(f"Review: {outside_review}\nPredicted Sentiment: {predicted_sentiment}")

Review: The product is amazing and works perfectly!
Predicted Sentiment: Positive


### Predict sentiment for challenge data

In [12]:
# Predict sentiment for challenge data
with open('corpus/challenge_data.txt', 'r') as f:
    challenge_reviews = f.readlines()

# challenge_reviews
selected_model = 'Logistic Regression'  # Choose the best model
# predictions = [str(predict_review(review.strip(), selected_model)) for review in challenge_reviews]
predictions = [str(1) if predict_review(review.strip(), selected_model) == "Positive" else str(0) for review in challenge_reviews]

# Create the output file
output = ''.join(predictions)
with open('logistic_regression_predictions.txt', 'w') as f:
    f.write(output)

print("Predictions saved to predictions.txt")


Predictions saved to predictions.txt


In [13]:
# Predict sentiment for challenge data
with open('corpus/challenge_data.txt', 'r') as f:
    challenge_reviews = f.readlines()

# challenge_reviews
selected_model = 'Random Forest'  # Choose the best model
# predictions = [str(predict_review(review.strip(), selected_model)) for review in challenge_reviews]
predictions = [str(1) if predict_review(review.strip(), selected_model) == "Positive" else str(0) for review in challenge_reviews]

# Create the output file
output = ''.join(predictions)
with open('random_forest_predictions.txt', 'w') as f:
    f.write(output)

print("Predictions saved to predictions.txt")

Predictions saved to predictions.txt
