In [2]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score

In [3]:
import numpy as np

class LogisticRegression:
    def __init__(self, lr=0.01, num_iters=10000, fit_intercept=True):
        self.lr = lr
        self.num_iters = num_iters
        self.fit_intercept = fit_intercept
    
    def add_intercept(self, X):
        # Check if the input array has 2 dimensions
        if X.ndim == 1:
            X = X.reshape(-1, 1)
    
        # Add an intercept column to the input array
        intercept = np.ones((X.shape[0], 1))
        return np.concatenate((intercept, X), axis=1)

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))
    
    def loss(self, h, y):
        return (-y * np.log(h) - (1 - y) * np.log(1 - h)).mean()
    
    def initialize(self, X):
        if self.fit_intercept:
            X = self.add_intercept(X)
        self.weights = np.zeros((X.shape[1], 1))
        return self.weights, X
    
    def fit(self,X,y,alpha=0.001,iter=100):
        params,X = self.initialize(X)
        cost_list = np.zeros(iter,)
        for i in range(iter):
            params = params - alpha * dot(X.T, self.sigmoid(dot(X,params)) - np.reshape(y,(len(y),1)))
            cost_list[i] = cost(params)
        self.params = params
        return cost_list
    
    def predict(self,X):
        z = dot(self.initialize(X)[1],self.weights)
        lis = []
        for i in self.sigmoid(z):
            if i>0.5:
                lis.append(1)
            else:
                lis.append(0)
        return lis


In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Define a function to preprocess the text
def preprocess_text(text):
    # Convert the text to lowercase
    text = text.lower()
    
    # Tokenize the text into individual words
    tokens = nltk.word_tokenize(text)
    
    # Remove stopwords from the text
    stopwords_list = stopwords.words('english')
    filtered_tokens = [token for token in tokens if token not in stopwords_list]
    
    # Apply stemming or lemmatization to the tokens
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]
    
    # Return the preprocessed text as a string
    return ' '.join(lemmatized_tokens)


In [6]:
# Load the two CSV files into separate DataFrames
posts_df = pd.read_csv('Top_posts.csv')
comments_df = pd.read_csv('Top_posts_comments.csv')


In [7]:
# Drop the 'flair_text' column
posts_df = posts_df.drop('flair_text', axis=1)

In [8]:
# Merge the two DataFrames on the post ID column
merged_df = pd.merge(posts_df, comments_df, on='post_id')

In [9]:
# Remove missing values
merged_df.dropna(subset=['comment'], inplace=True)

In [10]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(merged_df['comment'], merged_df['subreddit'], test_size=0.2, random_state=42)



In [11]:
import nltk
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Preprocess the training and testing data
vectorizer = TfidfVectorizer(tokenizer=preprocess_text)
X_tr = vectorizer.fit_transform(X_train)
X_test_processed = vectorizer.transform(X_test)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [12]:
# Create and train the logistic regression model
log_reg_model = LogisticRegression(lr=0.1, num_iters=1000)
log_reg_model.fit(X_train_processed, y_train)

NameError: name 'X_train_processed' is not defined

In [None]:
# Use the trained model to predict on the test data
y_pred = log_reg_model.predict(X_test_processed)

In [None]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)