In [1]:
import numpy as np
import pandas 
import os
import nltk
nltk.download('vader_lexicon')
nltk.download('stopwords')

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/yeetusonthefetus/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yeetusonthefetus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#  Load the data from combined_df.csv
combined_df = pandas.read_csv('../combined_df.csv')

In [3]:
# Tokenize and clean text data (you can use regular expressions for more advanced cleaning)
combined_df['post'] = combined_df['post'].apply(lambda x: x.lower())  # Convert to lowercase

In [4]:
nltk.download('punkt') 
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/yeetusonthefetus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/yeetusonthefetus/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

def preprocess_text(text):
    # Tokenization and removing punctuation
    words = nltk.word_tokenize(text)
    words = [word for word in words if word.isalnum()]
    
    # Removing stop words and lemmatization
    stop_words = set(stopwords.words("english"))
    words = [WordNetLemmatizer().lemmatize(word) for word in words if word not in stop_words]
    
    return " ".join(words)
    
combined_df['cleaned_posts'] = combined_df['post'].apply(preprocess_text)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/yeetusonthefetus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from gensim import corpora, models


documents = combined_df['cleaned_posts'].str.split()  # Split the preprocessed text into words

# Create a dictionary mapping words to IDs
dictionary = corpora.Dictionary(documents)

# Create a bag of words corpus
corpus = [dictionary.doc2bow(doc) for doc in documents]

# Train the LDA model
lda_model = models.LdaModel(corpus, num_topics=10, id2word=dictionary, passes=15)  # Adjust num_topics as needed

In [7]:
topic_distributions = [lda_model[doc] for doc in corpus]

In [8]:
num_topics = max(len(topic) for topic in topic_distributions)
topic_features = np.zeros((len(topic_distributions), num_topics))  # num_topics is the number of topics in your LDA model

for i, doc_topics in enumerate(topic_distributions):
    for topic, weight in doc_topics:
        topic_features[i, topic] = weight

In [9]:
shape_topic = topic_features.shape  # Shape of the topic modeling output

# Create an empty array to accommodate the topic modeling data
X_combined = np.empty((shape_topic[0], shape_topic[1]))

# Copy data from the original topic modeling array to the combined array
X_combined[:, :shape_topic[1]] = topic_features


In [10]:
from sklearn.model_selection import train_test_split

X = combined_df.drop(['subreddit', 'post'],  axis=1)  # Features
y = combined_df['subreddit']  # Target

In [11]:
# from sklearn.preprocessing import LabelEncoder
# label_encoder = LabelEncoder()
# y_encoded = label_encoder.fit_transform(y)
X_train, X_test, y_train, y_test = train_test_split(X_combined  , y, test_size=0.2, random_state=42)

In [12]:


print(X_train.shape)
print(y_train.shape)

(299599, 10)
(299599,)


In [14]:
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense
from sklearn.metrics import classification_report
from sklearn.preprocessing import LabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping  # Import EarlyStopping

# Convert y_train and y_test to one-hot encoded format
label_binarizer = LabelBinarizer()
y_train_onehot = label_binarizer.fit_transform(y_train)
y_test_onehot = label_binarizer.transform(y_test)

# Define the CNN model
model = Sequential()
model.add(Embedding(input_dim=topic_features.shape[1], output_dim=512, input_length=topic_features.shape[1]))
model.add(Conv1D(512, 5, activation='relu'))
model.add(MaxPooling1D(5))
model.add(Flatten())
model.add(Dense(512, activation='relu'))
model.add(Dense(len(label_binarizer.classes_), activation='softmax'))  # Number of classes for multi-class

# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)
history = model.fit(X_train, y_train_onehot, epochs=10, batch_size=16, validation_split=0.2, callbacks=[early_stopping])

# Evaluate the model on the test set
y_pred = model.predict(X_test)
y_pred_labels = label_binarizer.inverse_transform(y_pred)

# Generate a classification report
report = classification_report(y_test, y_pred)
print(report)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets