# In [7]:
import json
import re
import statistics

# Step 1: Data Loading & Preprocessing
def load_data(file_path):
    """Load records from a JSON Lines file (one JSON document per line).

    Args:
        file_path: Path of the file to read.

    Returns:
        A list holding one decoded JSON value per non-blank line, in file
        order. An empty file yields an empty list.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Decode explicitly as UTF-8: the platform default (e.g. cp1252 on
    # Windows) can mis-decode or reject otherwise valid JSON text.
    with open(file_path, 'r', encoding='utf-8') as file:
        # Iterate the file lazily instead of readlines() so the raw text is
        # not held in memory alongside the parsed records, and skip blank
        # lines (such as a trailing empty line) that json.loads would reject.
        return [json.loads(line) for line in file if line.strip()]

def preprocess_text(text):
    """Lowercase *text* and delete every character that is neither a word
    character nor whitespace.

    Underscores count as word characters for the pattern used here, so they
    are kept. Whitespace is left untouched.
    """
    lowered = text.lower()
    # [^\w\s] matches anything outside the word/whitespace classes; each
    # match is replaced by the empty string (deleted, not spaced out).
    cleaned = re.sub(r'[^\w\s]', '', lowered)
    return cleaned

def remove_stopwords(text, stopwords):
    """Return *text* with every whitespace-separated token found in
    *stopwords* dropped.

    The surviving tokens are re-joined with single spaces, so runs of
    whitespace in the input are collapsed.
    """
    kept = []
    for token in text.split():
        if token in stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)

def preprocess_reviews(reviews):
    """Clean the text of each review and strip common English stopwords.

    Args:
        reviews: Iterable of review records (dicts). The text is read from
            each record's 'reviewText' entry.

    Returns:
        A list of cleaned strings, one per input record and in the same
        order, so it stays aligned with any parallel list (e.g. ratings).
        A record whose 'reviewText' is missing or null yields ''.
    """
    stopwords = {
        "i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours",
        "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself",
        "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which",
        "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be",
        "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an",
        "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for",
        "with", "about", "against", "between", "into", "through", "during", "before", "after", "above",
        "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again",
        "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any",
        "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only",
        "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should",
        "now",
    }

    preprocessed_reviews = []
    for review in reviews:
        # Records without review text (key absent or null) are treated as
        # empty instead of raising, which keeps the output the same length
        # as the input.
        text = review.get('reviewText') or ''
        text = preprocess_text(text)
        text = remove_stopwords(text, stopwords)
        preprocessed_reviews.append(text)
    return preprocessed_reviews

# Step 2: Thematic Analysis
def thematic_analysis(reviews, ratings):
    """Count word occurrences separately for high- and low-rated reviews.

    Args:
        reviews: Iterable of review strings; each is split on whitespace.
        ratings: Iterable of numeric ratings, paired positionally with
            *reviews*. Pairing stops at the shorter of the two.

    Returns:
        A ``(positive_words, negative_words)`` tuple of dicts mapping each
        word to its occurrence count. Words from reviews rated >= 4 are
        counted in the first dict, words from reviews rated <= 2 in the
        second; reviews rated in between are not counted at all.
    """
    positive_words = {}
    negative_words = {}

    for review, rating in zip(reviews, ratings):
        # The rating is the same for every word of a review, so pick the
        # destination table once instead of re-testing it per word.
        if rating >= 4:
            counts = positive_words
        elif rating <= 2:
            counts = negative_words
        else:
            continue

        for word in review.split():
            counts[word] = counts.get(word, 0) + 1

    return positive_words, negative_words

# Step 3: Sentiment Analysis
def calculate_sentiment(review, positive_words, negative_words):
    """Score a review as (total positive weight) - (total negative weight).

    Each whitespace-separated token contributes its value from
    *positive_words* and from *negative_words*; tokens absent from a table
    contribute 0 for that table.
    """
    tokens = review.split()

    def table_total(table):
        # Sum the table's value for every token, repeated tokens included.
        return sum(table.get(token, 0) for token in tokens)

    return table_total(positive_words) - table_total(negative_words)

def categorize_sentiment(score, threshold=250000, negative_threshold=100000):
    """Map a numeric sentiment score to a category label.

    Args:
        score: The sentiment score to classify.
        threshold: Scores strictly greater than this are 'positive'.
        negative_threshold: Scores strictly less than this (and not already
            'positive') are 'negative'. Previously fixed at 100000 inside
            the function; exposed so both cutoffs can be adjusted together.

    Returns:
        'positive', 'negative', or 'neutral'. A score equal to either
        cutoff is 'neutral'.

    Note:
        The positive check runs first, so if *negative_threshold* is set
        above *threshold* the overlap is classified as 'positive'.
        NOTE(review): the default cutoffs look tuned to raw word-frequency
        totals from one particular dataset -- confirm before reusing them.
    """
    if score > threshold:
        return 'positive'
    if score < negative_threshold:
        return 'negative'
    return 'neutral'

# Step 4: Storage
def save_results(reviews, sentiments, scores, output_file):
    """Write one result entry per review to a text file.

    Each entry has the form::

        Sentiment: <sentiment>, Score: <score>
        Review: <review>
        <blank line>

    Args:
        reviews: Iterable of reviews; each is rendered with ``str()``
            formatting, whatever its type.
        sentiments: Iterable of sentiment labels, paired with *reviews*.
        scores: Iterable of scores, paired with *reviews*.
        output_file: Path of the file to create or overwrite.

    The three iterables are paired positionally; output stops at the
    shortest one.

    Raises:
        OSError: If the output file cannot be opened for writing.
    """
    # Write as UTF-8 explicitly: with the platform default encoding (e.g.
    # cp1252 on Windows) review text containing characters outside that
    # code page makes write() raise UnicodeEncodeError.
    with open(output_file, 'w', encoding='utf-8') as file:
        for review, sentiment, score in zip(reviews, sentiments, scores):
            file.write(f"Sentiment: {sentiment}, Score: {score}\nReview: {review}\n\n")

if __name__ == "__main__":
    # Step 1: load the raw records and derive the parallel rating/text lists.
    file_path = r"C:\Users\Muhammad Omer Hafeez\Desktop\22i1859_Assignment 1\Cell_Phones_and_Accessories_5.json"
    data = load_data(file_path)
    reviews = list(data)
    ratings = [int(record['overall']) for record in data]
    preprocessed_reviews = preprocess_reviews(reviews)

    # Step 2: build the word-frequency tables from high- and low-rated reviews.
    positive_words, negative_words = thematic_analysis(preprocessed_reviews, ratings)

    # Step 3: score every cleaned review, then label each score.
    scores = [
        calculate_sentiment(cleaned, positive_words, negative_words)
        for cleaned in preprocessed_reviews
    ]
    sentiments = [categorize_sentiment(value) for value in scores]

    # Step 4: persist the results. The raw records (not the cleaned text)
    # are what gets passed as the reviews to write.
    output_file = "result.txt"
    save_results(reviews, sentiments, scores, output_file)

