In [7]:
import json
import re
import statistics

# Part (a): data loading and preprocessing
def load(file_path):
    """Read a JSON Lines file of reviews into a list of dicts.

    Every non-blank line is parsed as one JSON object. Only two fields are
    kept from each object: ``reviewText`` (``''`` when absent) and
    ``overall`` (``0`` when absent), the latter stored under ``'rating'``.

    Args:
        file_path: Path of the JSON Lines file to read.

    Returns:
        A list of ``{'reviewText': ..., 'rating': ...}`` dicts in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    reviews = []
    # Decode explicitly as UTF-8 instead of relying on the platform default
    # codec, which differs between operating systems.
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            # A blank line (e.g. a stray newline at end of file) is not a
            # record; skip it rather than letting json.loads raise.
            if not line.strip():
                continue
            entry = json.loads(line)
            reviews.append({
                'reviewText': entry.get('reviewText', ''),
                'rating': entry.get('overall', 0),
            })
    return reviews

def explore_dataset(reviews, sample_index=56):
    """Print the number of reviews and one sample review.

    Args:
        reviews: Sequence of review records.
        sample_index: Position of the review to print. Defaults to 56, the
            index this function previously had hard-coded.
    """
    print("Number of reviews:", len(reviews))
    try:
        sample = reviews[sample_index]
    except IndexError:
        # Small or empty datasets have no element at that position; report
        # it instead of aborting the whole run.
        print(f"Sample review: <no review at index {sample_index}>")
    else:
        print("Sample review:", sample)

def text_preprocessing(review):
    """Return *review* with punctuation removed and letters lower-cased."""
    # Anything that is neither a word character nor whitespace is treated
    # as punctuation and deleted (nothing is inserted in its place).
    punctuation = re.compile(r'[^\w\s]')
    without_punctuation = punctuation.sub('', review)
    return without_punctuation.lower()

# Stop-word lexicon, built once at import time instead of on every call.
_STOP_WORDS = frozenset({
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you',
    'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his',
    'himself', 'she', 'her', 'hers', 'herself', 'it', 'its', 'itself',
    'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which',
    'who', 'whom', 'this', 'that', 'these', 'those', 'am', 'is', 'are',
    'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having',
    'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if',
    'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for',
    'with', 'about', 'against', 'between', 'into', 'through', 'during',
    'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in',
    'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then',
    'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any',
    'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no',
    'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's',
    't', 'can', 'will', 'just', 'don', 'should', 'now', 'd', 'll', 'm',
    'o', 're', 've', 'y', 'ain', 'aren', 'couldn', 'didn', 'doesn', 'hadn',
    'hasn', 'haven', 'isn', 'ma', 'mightn', 'mustn', 'needn', 'shan',
    'shouldn', 'wasn', 'weren', 'won', 'wouldn',
})


def remove_stop_words(review):
    """Drop stop words from *review*.

    The text is split on whitespace, tokens found in the stop-word lexicon
    are discarded, and the survivors are re-joined with single spaces.
    Matching is exact and case-sensitive, so the input is expected to be
    lower-cased already.

    Args:
        review: Review text.

    Returns:
        The review without stop words; ``''`` if nothing remains.
    """
    return ' '.join(word for word in review.split() if word not in _STOP_WORDS)

# Part (b): thematic analysis

# Lexicon of words counted as positive by the thematic analysis.
positive_words = {
    'good', 'amazing', 'excellent', 'awesome', 'fantastic', 'outstanding',
    'superb', 'wonderful', 'brilliant', 'perfect', 'terrific', 'impressive',
    'pleasing', 'exceptional', 'splendid', 'great', 'positive', 'satisfying',
    'super', 'top-notch', 'marvelous', 'exemplary', 'stellar', 'remarkable',
    'incredible', 'delightful', 'perfectly', 'best', 'extraordinary',
    'first-rate', 'fabulous', 'premium', 'superior', 'fine', 'noteworthy',
    'commendable', 'ideal', 'pleasurable', 'enjoyable', 'lovely', 'admirable',
    'skilful', 'praiseworthy', 'sensational', 'skillful', 'satisfactory',
    'astounding', 'happy', 'love',
}

# Lexicon of words counted as negative by the thematic analysis.
negative_words = {
    'bad', 'poor', 'terrible', 'awful', 'horrible', 'dreadful', 'inferior',
    'subpar', 'unsatisfactory', 'lousy', 'disappointing', 'unpleasant',
    'displeasing', 'deficient', 'faulty', 'imperfect', 'substandard',
    'low-quality', 'below-average', 'mediocre', 'flawed', 'unsound',
    'unacceptable', 'inadequate', 'lame', 'poorly', 'unimpressive',
    'disagreeable', 'unfavorable', 'unfortunate', 'negative', 'not good',
    'regrettable', 'miserable', 'abysmal', 'crummy', 'deplorable',
    'detestable', 'execrable', 'hopeless', 'horrid', 'lamentable',
    'laughable', 'repugnant', 'wretched', 'atrocious', 'sad', 'hate',
}

def thematic_analysis(reviews, positive_lexicon=None, negative_lexicon=None):
    """Count positive and negative lexicon hits across all reviews.

    Each review's ``'reviewText'`` is split on whitespace. Every token is
    lower-cased and stripped of leading/trailing non-word characters before
    lookup, so ``"Great!"`` and ``"(bad)"`` are counted; interior characters
    such as the hyphen in ``top-notch`` are left alone. A token is checked
    against the positive lexicon first and counts at most once.

    Args:
        reviews: Iterable of dicts that each have a ``'reviewText'`` string.
        positive_lexicon: Collection of positive words; defaults to the
            module-level ``positive_words``.
        negative_lexicon: Collection of negative words; defaults to the
            module-level ``negative_words``.

    Returns:
        A ``(positive_counts, negative_counts)`` tuple of ints.
    """
    if positive_lexicon is None:
        positive_lexicon = positive_words
    if negative_lexicon is None:
        negative_lexicon = negative_words

    # Matches runs of non-word characters at either end of a token.
    edge_punctuation = re.compile(r'^\W+|\W+$')

    positive_counts = 0
    negative_counts = 0
    for review in reviews:
        for raw_token in review['reviewText'].split():
            # The text here is raw review text: normalise case and trim
            # attached punctuation so lexicon entries can actually match.
            token = edge_punctuation.sub('', raw_token.lower())
            if token in positive_lexicon:
                positive_counts += 1
            elif token in negative_lexicon:
                negative_counts += 1

    return positive_counts, negative_counts

# Part (c): sentiment analysis
# Base sentiment weight per lexicon term. Each key appears exactly once; where
# the earlier literal listed a key twice with different values ('bad',
# 'unfortunate'), the value that was actually in effect (the later one) is kept.
_SENTIMENT_WEIGHTS = {
    # positive terms
    'good': 0.8, 'amazing': 0.9, 'excellent': 0.85, 'awesome': 0.9, 'fantastic': 0.85,
    'outstanding': 0.9, 'superb': 0.85, 'wonderful': 0.9, 'brilliant': 0.9, 'perfect': 0.9,
    'terrific': 0.9, 'impressive': 0.85, 'pleasing': 0.8, 'exceptional': 0.9, 'splendid': 0.8,
    'great': 0.8, 'positive': 0.75, 'satisfying': 0.7, 'super': 0.75, 'top-notch': 0.85,
    'marvelous': 0.9, 'exemplary': 0.9, 'stellar': 0.9, 'remarkable': 0.85, 'incredible': 0.9,
    'delightful': 0.8, 'perfectly': 0.9, 'best': 0.85, 'extraordinary': 0.9, 'first-rate': 0.9,
    'satisfied': 0.85, 'premium': 0.9, 'superior': 0.9, 'fine': 0.8, 'noteworthy': 0.8,
    'commendable': 0.85, 'ideal': 0.9, 'pleasurable': 0.8, 'enjoyable': 0.8, 'lovely': 0.8,
    'admirable': 0.9, 'skilful': 0.85, 'praiseworthy': 0.9, 'sensational': 0.9, 'skillful': 0.85,
    'satisfactory': 0.75, 'astounding': 0.9, 'happy': 0.8, 'love': 0.9,
    # negative terms
    'sad': -0.7,
    'bad': -0.75, 'poor': -0.65, 'terrible': -0.75, 'awful': -0.8, 'horrible': -0.8,
    'dreadful': -0.8, 'inferior': -0.7, 'subpar': -0.7, 'unsatisfactory': -0.7, 'lousy': -0.75,
    'disappointing': -0.7, 'unpleasant': -0.7, 'displeasing': -0.7, 'deficient': -0.7, 'faulty': -0.8,
    'imperfect': -0.75, 'substandard': -0.7, 'low-quality': -0.7, 'below-average': -0.7, 'mediocre': -0.7,
    'flawed': -0.75, 'unsound': -0.7, 'unacceptable': -0.8, 'inadequate': -0.7, 'lame': -0.7, 'poorly': -0.7,
    'unimpressive': -0.7, 'disagreeable': -0.7, 'disappointed': -0.7, 'unfortunate': -0.8, 'negative': -0.7,
    'not good': -0.7, 'regrettable': -0.7, 'miserable': -0.8, 'abysmal': -0.8, 'waste': -0.8,
    'detestable': -0.8, 'execrable': -0.8, 'cheap': -0.8, 'didnt work': -0.8, 'stopped working': -0.8,
    'damaged': -0.8, 'worst': -0.8, 'stopped completely': -0.8, 'atrocious': -0.8,
}

# Longest lexicon entry, in whitespace-separated tokens (phrases such as
# 'stopped working' span more than one token).
_MAX_PHRASE_TOKENS = max(len(term.split()) for term in _SENTIMENT_WEIGHTS)


def rule_based_sentiment_analysis(review, positive_counts, negative_counts, *,
                                  threshold=0.5, positive_lexicon=None,
                                  negative_lexicon=None):
    """Classify *review* as ``'positive'``, ``'negative'`` or ``'neutral'``.

    The review is split on whitespace and scanned left to right. At each
    position the longest lexicon entry starting there is matched (so the
    multi-word entries such as ``'stopped working'`` can match), its weight
    is added to the score, and the scan resumes after the matched tokens.
    Tokens that start no lexicon entry contribute nothing.

    A matched term's base weight is shifted by the thematic-analysis counts:
    ``+ 0.1 * positive_counts`` if the term is in the positive lexicon and
    ``- 0.1 * negative_counts`` if it is in the negative lexicon. Terms in
    neither lexicon keep their base weight.

    Matching is exact, so entries only fire if the caller's preprocessing
    left them intact: ``'not good'`` cannot match once ``'not'`` has been
    removed as a stop word, and hyphenated entries cannot match once
    punctuation has been stripped.

    Args:
        review: Review text to score.
        positive_counts: Number of positive lexicon hits from the thematic
            analysis.
        negative_counts: Number of negative lexicon hits from the thematic
            analysis.
        threshold: Score magnitude that must be exceeded for a non-neutral
            label. Defaults to 0.5.
        positive_lexicon: Terms that receive the positive adjustment;
            defaults to the module-level ``positive_words``.
        negative_lexicon: Terms that receive the negative adjustment;
            defaults to the module-level ``negative_words``.

    Returns:
        ``'positive'`` if the score is above ``threshold``, ``'negative'`` if
        it is below ``-threshold``, otherwise ``'neutral'``.
    """
    if positive_lexicon is None:
        positive_lexicon = positive_words
    if negative_lexicon is None:
        negative_lexicon = negative_words

    # Adjust sentiment weights based on thematic analysis results.
    adjusted_positive_weight = 0.1 * positive_counts
    adjusted_negative_weight = 0.1 * negative_counts

    tokens = review.split()
    contributions = []
    position = 0
    while position < len(tokens):
        longest = min(_MAX_PHRASE_TOKENS, len(tokens) - position)
        # Try the longest candidate first so a phrase wins over its own
        # first word (e.g. 'not good' over a later 'good').
        for size in range(longest, 0, -1):
            term = ' '.join(tokens[position:position + size])
            weight = _SENTIMENT_WEIGHTS.get(term)
            if weight is None:
                continue
            if term in positive_lexicon:
                weight = weight + adjusted_positive_weight
            if term in negative_lexicon:
                weight = weight - adjusted_negative_weight
            contributions.append(weight)
            position += size
            break
        else:
            # Nothing in the lexicon starts at this token.
            position += 1

    sentiment_score = sum(contributions)

    if sentiment_score > threshold:
        return 'positive'
    if sentiment_score < -threshold:
        return 'negative'
    return 'neutral'

# Part (d): saving results
def save_results(reviews_with_sentiment, output_file):
    """Write one ``"<reviewText> - <sentiment>"`` line per review.

    Args:
        reviews_with_sentiment: Iterable of dicts with ``'reviewText'`` and
            ``'sentiment'`` keys.
        output_file: Path of the text file to create or overwrite.

    Raises:
        OSError: If the file cannot be opened for writing.
    """
    # Write as UTF-8 explicitly: review text may contain characters that a
    # legacy platform default codec cannot encode.
    with open(output_file, 'w', encoding='utf-8') as file:
        file.writelines(
            f"{entry['reviewText']} - {entry['sentiment']}\n"
            for entry in reviews_with_sentiment
        )

# Locations of the input dataset and of the report to write
file_path = "F:\\Studies\\FAST\\Semester 4\\Fundamentals of Big Data Analytics\\Theory\\Assignments\\A0\\Cell_Phones_and_Accessories_5.json"
output_file = "F:\\Studies\\FAST\\Semester 4\\Fundamentals of Big Data Analytics\\Theory\\Assignments\\A0\\output.txt"

# Read every review and print a quick summary of the dataset
reviews = load(file_path)
explore_dataset(reviews)

# Corpus-wide lexicon hit counts; they are passed to the classifier below
positive_counts, negative_counts = thematic_analysis(reviews)

# Classify each review on its cleaned text, keeping the original text for the report
reviews_with_sentiment = []
for review_entry in reviews:
    review_text = remove_stop_words(text_preprocessing(review_entry['reviewText']))
    sentiment = rule_based_sentiment_analysis(review_text, positive_counts, negative_counts)
    reviews_with_sentiment.append(
        dict(reviewText=review_entry['reviewText'], sentiment=sentiment)
    )

save_results(reviews_with_sentiment, output_file)


Number of reviews: 194439
Sample review: {'reviewText': 'I was worried about this order because the picture on the description page kept changing to a less-desireable green/orange cover.  But alas, it came a day earlier than expected, and the beautiful pink product expected!  The "rubberized" cover feels a little greasy (like it was armor-all\'d), but the design is gorgeous and a little 3D looking!', 'rating': 5.0}
