In [1]:
import pandas as pd

# Read the two CSV exports (posts and their comments, going by the file
# names) into their own DataFrames
posts_df = pd.read_csv('Top_posts.csv')
comments_df = pd.read_csv('Top_posts_comments.csv')

# Join the two frames on the shared post_id key; how='inner' is pd.merge's
# default and is spelled out here so the join type is explicit
merged_df = pd.merge(left=posts_df, right=comments_df, on='post_id', how='inner')


In [2]:
# Print the merged frame; pandas elides the middle rows and wraps wide columns
print(merged_df)

       post_id                                         post_title  \
0       gh1dj9  [Project] From books to presentations in 10s w...   
1       gh1dj9  [Project] From books to presentations in 10s w...   
2       gh1dj9  [Project] From books to presentations in 10s w...   
3       gh1dj9  [Project] From books to presentations in 10s w...   
4       gh1dj9  [Project] From books to presentations in 10s w...   
...        ...                                                ...   
223163  efk5n3  Tesla's Neural Net can now identify red and gr...   
223164  efk5n3  Tesla's Neural Net can now identify red and gr...   
223165  efk5n3  Tesla's Neural Net can now identify red and gr...   
223166  efk5n3  Tesla's Neural Net can now identify red and gr...   
223167  efk5n3  Tesla's Neural Net can now identify red and gr...   

              subreddit                                           post_url  \
0       MachineLearning                    https://v.redd.it/v492uoheuxx41   
1       Machine

In [3]:
# Dimensions of the merged frame as (rows, columns)
print(merged_df.shape)


(223168, 11)


In [4]:
# Preview the first five rows of the merged frame
print(merged_df.head())


  post_id                                         post_title        subreddit  \
0  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   
1  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   
2  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   
3  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   
4  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   

                          post_url flair_text  score  comments  upvote_ratio  \
0  https://v.redd.it/v492uoheuxx41    Project   7798       186          0.99   
1  https://v.redd.it/v492uoheuxx41    Project   7798       186          0.99   
2  https://v.redd.it/v492uoheuxx41    Project   7798       186          0.99   
3  https://v.redd.it/v492uoheuxx41    Project   7798       186          0.99   
4  https://v.redd.it/v492uoheuxx41    Project   7798       186          0.99   

             date-time  year  \


In [6]:
# Rows per subreddit; this column is later used as the classification label,
# so the counts show how balanced the classes are
print(merged_df['subreddit'].value_counts())


datascience        107156
MachineLearning     95702
artificial          20310
Name: subreddit, dtype: int64


In [7]:
# Count the missing values in each column
print(merged_df.isnull().sum())


post_id             0
post_title          0
subreddit           0
post_url            0
flair_text      24738
score               0
comments            0
upvote_ratio        0
date-time           0
year                0
comment             9
dtype: int64


In [8]:
# flair_text is the only column with a large number of missing values
# (24,738 rows), so the whole column is dropped
merged_df = merged_df.drop(columns='flair_text')

In [9]:
# Preview the frame and list its column names after dropping flair_text
print(merged_df.head())
print(merged_df.columns)


  post_id                                         post_title        subreddit  \
0  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   
1  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   
2  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   
3  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   
4  gh1dj9  [Project] From books to presentations in 10s w...  MachineLearning   

                          post_url  score  comments  upvote_ratio  \
0  https://v.redd.it/v492uoheuxx41   7798       186          0.99   
1  https://v.redd.it/v492uoheuxx41   7798       186          0.99   
2  https://v.redd.it/v492uoheuxx41   7798       186          0.99   
3  https://v.redd.it/v492uoheuxx41   7798       186          0.99   
4  https://v.redd.it/v492uoheuxx41   7798       186          0.99   

             date-time  year  \
0  2020-05-10 13:19:54  2020   
1  2020-05-10 13:19:54  2020   
2 

In [13]:
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer

# Fetch the NLTK data packages used below (tokenizer models, stopword lists,
# WordNet); packages that are already installed are reported as up-to-date
for resource in ('punkt', 'stopwords', 'wordnet'):
    nltk.download(resource)

# Create the stemmer and lemmatizer once so every call can reuse them.
# `stemmer` stays available for experimentation; preprocess_text itself
# only lemmatizes.
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

# Build the English stopword lookup once. A frozenset gives constant-time
# membership tests instead of re-reading the corpus list and scanning it
# linearly for every token of every comment.
ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))


# Define a function to preprocess the text
def preprocess_text(text):
    """Normalize one comment for bag-of-words modelling.

    Steps: lowercase, tokenize with ``nltk.word_tokenize``, drop English
    stopwords, lemmatize each remaining token with WordNet, and re-join
    the tokens with single spaces. Punctuation tokens are kept.

    Parameters
    ----------
    text : str or missing value
        Raw comment text. Missing values (``None``/``NaN``) are accepted
        because the 'comment' column contains a few nulls.

    Returns
    -------
    str
        Space-separated lemmatized tokens; ``''`` for a missing comment.
    """
    if not isinstance(text, str):
        # A missing comment arrives as float NaN, which has no .lower();
        # treat it as empty text instead of raising AttributeError.
        if pd.isna(text):
            return ''
        text = str(text)

    # Lowercase first so tokens match the lowercase stopword entries
    tokens = nltk.word_tokenize(text.lower())

    # Only the lemmatized form is returned, so no stemming pass is run
    # (the stemmed tokens used to be computed here and then discarded).
    return ' '.join(
        lemmatizer.lemmatize(token)
        for token in tokens
        if token not in ENGLISH_STOPWORDS
    )

# Run preprocess_text on every value of the 'comment' column and store the
# results in a new 'preprocessed_comment' column
merged_df['preprocessed_comment'] = merged_df['comment'].apply(preprocess_text)


[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Rahul\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [11]:
# Number of rows whose 'comment' value is missing (9 in this dataset)
merged_df['comment'].isnull().sum()

9

In [12]:
# Discard the rows whose comment text is missing. The result is rebound to
# the name instead of using inplace=True, so the step reads as a plain
# assignment and can be chained with other DataFrame methods.
merged_df = merged_df.dropna(subset=['comment'])

In [14]:
# Print the first rows of the raw comments, then of their preprocessed
# counterparts, so the effect of preprocess_text can be compared
for column in ('comment', 'preprocessed_comment'):
    print(merged_df[column].head())


0    Twitter thread: [https://twitter.com/cyrildiag...
1                                         The future 🤯
2    Simple yet very useful. Thank you for sharing ...
3    Almost guaranteed, Apple will copy your idea i...
4    Ohh the nightmare of making this into a stable...
Name: comment, dtype: object
0    twitter thread : [ http : //twitter.com/cyrild...
1                                             future 🤯
2             simple yet useful . thank sharing code .
3    almost guaranteed , apple copy idea 3 , 2 , 1 ...
4    ohh nightmare making stable product ... enough...
Name: preprocessed_comment, dtype: object


In [19]:
from sklearn.model_selection import train_test_split

# First carve off 20% of the rows as the held-out test set
X_trainval, X_test, y_trainval, y_test = train_test_split(
    merged_df['preprocessed_comment'],
    merged_df['subreddit'],
    test_size=0.2,
    random_state=42,
)

# Then split the remaining 80% again (80/20) into training and validation
# sets, i.e. roughly 64% / 16% / 20% of the data overall
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.2, random_state=42
)


In [20]:
# Inspect the training split of the preprocessed comments
print(X_train)

185231    airflow allows write data pipeline python . ma...
146600    hi prof. hinton , 'd like thank introduction m...
90000     > 's inappropriate try benefit also trying exe...
25033                                  france popular app .
152800                prefer word `` differently abled '' .
                                ...                        
181937    ingest : load postgresql via psql , transform ...
156967    project work algorithm research generative art...
62417                                                      
12413                        one task left jira : fix human
130565    unfortunately post feed onto current `` china ...
Name: preprocessed_comment, Length: 142821, dtype: object


In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report

# Learn the TF-IDF vocabulary and weights from the training comments only,
# then map the validation and test comments into that same feature space
vectorizer = TfidfVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized, X_test_vectorized = (
    vectorizer.transform(split) for split in (X_val, X_test)
)

# Fit a Multinomial Naive Bayes classifier (fit() returns the estimator itself)
mnb = MultinomialNB().fit(X_train_vectorized, y_train)

# Validation-set predictions and the per-class precision / recall / F1 report
y_pred_val = mnb.predict(X_val_vectorized)
print('Classification report for validation set:',
      classification_report(y_val, y_pred_val), sep='\n')

# Same report for the held-out test set
y_pred_test = mnb.predict(X_test_vectorized)
print('Classification report for test set:',
      classification_report(y_test, y_pred_test), sep='\n')


Classification report for validation set:
                 precision    recall  f1-score   support

MachineLearning       0.68      0.71      0.69     15214
     artificial       0.79      0.00      0.01      3251
    datascience       0.73      0.84      0.78     17241

       accuracy                           0.71     35706
      macro avg       0.73      0.52      0.49     35706
   weighted avg       0.71      0.71      0.67     35706

Classification report for test set:
                 precision    recall  f1-score   support

MachineLearning       0.69      0.70      0.69     19193
     artificial       0.92      0.00      0.01      4044
    datascience       0.72      0.84      0.78     21395

       accuracy                           0.71     44632
      macro avg       0.78      0.52      0.49     44632
   weighted avg       0.72      0.71      0.67     44632



In [24]:
# Predict a subreddit for every comment in the test set
y_pred = mnb.predict(X_test_vectorized)

# Put the true and predicted labels side by side; the frame keeps y_test's
# index, so each row can be traced back to its row in merged_df
results_df = pd.DataFrame(
    {'original_subreddit': y_test, 'predicted_subreddit': y_pred}
)

# Print the first 50 rows of the results DataFrame
print(results_df.head(50))


       original_subreddit predicted_subreddit
202182        datascience         datascience
162932         artificial     MachineLearning
43317         datascience         datascience
198271        datascience         datascience
135856        datascience         datascience
202604        datascience         datascience
176299        datascience         datascience
155230    MachineLearning         datascience
215428         artificial         datascience
159986    MachineLearning     MachineLearning
183599        datascience         datascience
8931      MachineLearning     MachineLearning
112367        datascience     MachineLearning
208699         artificial     MachineLearning
174199        datascience         datascience
186700        datascience     MachineLearning
118183    MachineLearning     MachineLearning
40354     MachineLearning     MachineLearning
47130     MachineLearning     MachineLearning
59464         datascience         datascience
59917     MachineLearning     Mach

In [25]:
from sklearn.metrics import accuracy_score

# Predict a subreddit for every comment in the test set
y_pred = mnb.predict(X_test_vectorized)

# Fraction of test comments whose predicted subreddit equals the true one
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print("Accuracy:", accuracy)

Accuracy: 0.7058388600107546
