In [1]:
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.metrics import classification_report
from scipy.sparse import coo_matrix

# Load the two CSV files into separate DataFrames
posts_df = pd.read_csv('Top_posts.csv')
comments_df = pd.read_csv('Top_posts_comments.csv')

# Drop the 'flair_text' column
posts_df = posts_df.drop('flair_text', axis=1)

# Merge the two DataFrames on the post ID column
merged_df = pd.merge(posts_df, comments_df, on='post_id')

merged_df.dropna(subset=['comment'], inplace=True)
# we drop the missing values 

In [2]:
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Define the preprocessing function
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Tokenize the text
    tokens = word_tokenize(text.lower())
    
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    
    # Join the tokens back into a single string
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

# Apply the preprocessing function to the 'comment' column of your DataFrame
merged_df['preprocessed_comment'] = merged_df['comment'].apply(preprocess_text)




[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
merged_df['preprocessed_comment'] = merged_df['comment'].apply(preprocess_text)


In [4]:
def create_vocabulary(texts, min_count=1):
    # Create an empty Counter object
    vocabulary = Counter()

    # Loop over each text and count the frequency of each word
    for text in texts:
        for word in text.split():
            vocabulary[word] += 1

    # Return the vocabulary as a list of words, sorted by frequency
    return [word for word, freq in vocabulary.most_common() if freq >= min_count]



In [5]:
# Create a vocabulary from the preprocessed comments
vocabulary = create_vocabulary(merged_df['preprocessed_comment'])


In [6]:
print(vocabulary[:10])
#first 10 words


['.', ',', ')', '(', '?', "'s", "n't", ':', '*', 'data']


In [7]:
import numpy as np
from collections import Counter

def create_bag_of_words(text, vocabulary):
    bag_of_words = Counter(text.split())
    for word in set(text.split()):
        if word not in vocabulary:
            bag_of_words.pop(word)
    return np.array([bag_of_words.get(word, 0) for word in vocabulary])

# Create a list of all unique words in your dataset
vocabulary = set()
for comment in merged_df['preprocessed_comment']:
    for word in comment.split():
        vocabulary.add(word)

# Create a bag of words for each preprocessed comment
merged_df['bag_of_words'] = merged_df['preprocessed_comment'].apply(lambda x: create_bag_of_words(x, vocabulary))



KeyboardInterrupt: 

In [11]:
import pandas as pd

data = pd.DataFrame(data, columns=['post_id', 'post_title', 'subreddit', 'post_url', 'flair_text', 'score',
       'comments', 'upvote_ratio', 'date-time', 'year', 'comment',
       'preprocessed_comment'])

data['bag_of_words'] = data['preprocessed_comment'].apply(lambda x: create_bag_of_words(x, vocabulary))


NameError: name 'data' is not defined

In [None]:
from scipy.sparse import coo_matrix

# Convert the list of bag of words to a sparse matrix
rows = []
cols = []
data = []
for i, row in enumerate(bag_of_words_list):
    for j, value in enumerate(row):
        if value != 0:
            rows.append(i)
            cols.append(j)
            data.append(value)
bag_of_words_matrix = coo_matrix((data, (rows, cols)), shape=(len(bag_of_words_list), len(vocabulary)))

In [None]:
# Split the data into training and test sets
train_size = int(len(merged_df) * 0.8)
train_data = merged_df[:train_size]
test_data = merged_df[train_size:]


In [None]:
# Define a function to train the Naive Bayes classifier
def train_naive_bayes(train_data):
    # Count the number of comments in each class
    class_counts = Counter(train_data['score'])
    
    # Calculate the total number of comments
    total_comments = len(train_data)
    
    # Create an empty dictionary to store the conditional probabilities
    cond_probs = {}
    
    # Loop over each class
    for class_label in class_counts.keys():
        # Get the subset of comments with the current class label
        class_data = train_data[train_data['score'] == class_label]
        
        # Count the number of comments in the current class
        class_comment_count = len(class_data)
        
        # Calculate the prior probability of the current class
        prior_prob = class_counts[class_label] / total_comments
        
        # Create an empty dictionary to store the conditional probabilities for the current class
        cond_probs_for_class = {}
        
        # Loop over each word in the vocabulary
        for word in vocabulary:
            # Count the number of comments in the current class that contain the current word
            word_count = sum(class_data['bag_of_words'].apply(lambda x: 1 if x[vocabulary.index(word)] > 0 else 0))
            
            # Calculate the conditional probability of the current word given the current class
            cond_prob = (word_count + 1) / (class_comment_count + len(vocabulary))
            
            # Add the conditional probability to the dictionary
            cond_probs_for_class[word] = cond_prob
        
        # Add the dictionary of conditional probabilities for the current class to the overall dictionary
        cond_probs[class_label] = cond_probs_for_class
    
    # Return the prior probabilities and conditional probabilities
    return class_counts, prior_prob, cond_probs


In [None]:
# Define a function to classify a comment using the Naive Bayes classifier
def classify_comment(comment, class_counts, prior_probs, cond_probs):
    # Create an empty dictionary to store the log probabilities for each class
    log_probs = {}
    
    # Loop over each class
    for class_label in class_counts.keys():
        # Initialize the log probability with the logarithm of the prior probability
        log_prob = np.log(prior_probs[class_label])
        
        # Loop over each word in the comment
        for word in comment:
            # Check if the word is in the vocabulary
            if word in vocabulary:
                # Add the logarithm of the conditional probability to the log probability
                log_prob += np.log(cond_probs[class_label][word])
        
        # Add the log probability to the dictionary
        log_probs[class_label] = log_prob
    
    # Return the class label with the highest log probability
    return max(log_probs, key=log_probs.get)


In [None]:
print(train_data.columns)

In [None]:
# Train the Naive Bayes classifier
class_counts, prior_probs, cond_probs = train_naive_bayes(train_data)

In [None]:
# Classify the test comments
test_data['predicted_score'] = test_data['preprocessed_comment'].apply(lambda x: classify_comment(x, class_counts, prior_probs, cond_probs))

In [10]:
# Print the classification report
print(classification_report(test_data['score'], test_data['predicted_score']))

NameError: name 'test_data' is not defined

In [9]:
def classify_comments(test_data, class_counts, prior_prob, cond_probs):
    # Create an empty list to store the predicted class labels
    predicted_labels = []
    
    # Loop over each comment in the test data
    for comment in test_data['preprocessed_comment']:
        # Create an empty dictionary to store the posterior probabilities for each class
        posteriors = {}
        
        # Loop over each class
        for class_label in class_counts.keys():
            # Get the prior probability for the current class
            prior = prior_prob[class_label]
            
            # Get the dictionary of conditional probabilities for the current class
            cond_probs_for_class = cond_probs[class_label]
            
            # Calculate the likelihood of the comment given the current class
            likelihood = 1
            for word in comment.split():
                likelihood *= cond_probs_for_class.get(word, 1)
            
            # Calculate the posterior probability of the current class
            posterior = prior * likelihood
            
            # Add the posterior probability to the dictionary
            posteriors[class_label] = posterior
        
        # Determine the predicted class label for the current comment
        predicted_label = max(posteriors, key=posteriors.get)
        
        # Add the predicted class label to the list
        predicted_labels.append(predicted_label)
    
    # Return the list of predicted class labels
    return predicted_labels
