In [1]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [2]:
# Fetch the NLTK stop-word corpus; stopwords.words('english') is read from it
# later when the STOPWORDS set is built.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91971\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
# Read the dataset (data scraped from Reddit) from a path relative to this notebook

data_df = pd.read_csv(filepath_or_buffer='../data/new.csv')

In [3]:
# Preview the first five rows of the raw frame

data_df.head(n=5)

Unnamed: 0,title,url,comms_num,flair,score,id,created,author,body
0,AskScience Panel of Scientists XXVIII,https://www.reddit.com/r/askscience/comments/1...,31,Panel Applications,359,10e3nfa,1673932275,AskScienceModerator,**Please read this entire post carefully and f...
1,AskScience’s concerns regarding Reddit’s API c...,https://www.reddit.com/r/askscience/comments/1...,1,,40805,147epb8,1686543618,AskScienceModerator,"In April 2023, [Reddit announced they would be..."
2,"How can this substance ""Allkaline pyrogulic ac...",https://www.reddit.com/r/askscience/comments/1...,39,Chemistry,503,14wpwv1,1689078214,Ok-Sleep8655,
3,After the universe reaches maximum entropy and...,https://www.reddit.com/r/askscience/comments/1...,295,Physics,857,14vyevw,1689005667,Torpaskor,"I've thought about this before, but im nowhere..."
4,AskScience AMA Series: We are hear to discuss ...,https://www.reddit.com/r/askscience/comments/1...,99,Neuroscience,447,14vqstn,1688986829,AskScienceModerator,Hello Reddit! We are researchers at the Nation...


In [4]:
# Print the number of missing values per column, then display the number of
# non-null entries per column as the cell result

print(data_df.isna().sum())
data_df.count()

title           0
url             0
comms_num       0
flair         224
score           0
id              0
created         0
author        420
body         3192
dtype: int64


title        13888
url          13888
comms_num    13888
flair        13664
score        13888
id           13888
created      13888
author       13468
body         10696
dtype: int64

In [5]:
# The counts above show missing values in the flair, author and body columns.
# Options for dealing with them:
# 1. Fill nulls with a placeholder category such as "na" (nulls become their own category)
# 2. Drop every row containing a null (may make the dataset less diverse)
# 3. Replace nulls with the mean/median value (may introduce skew)
# 4. Replace nulls with predicted values (less noise, but a lot more processing time)
# Option 1 is applied here: every null in the frame becomes the string "na".
# NOTE: this includes the flair column, so "na" also becomes one of the
# target labels the classifier is trained on.

data_df = data_df.fillna(value="na")

In [6]:
# Verify the fill: null counts should all be zero and every column should
# report the same number of non-null entries

print(data_df.isna().sum(), data_df.count(), sep='\n')

title        0
url          0
comms_num    0
flair        0
score        0
id           0
created      0
author       0
body         0
dtype: int64
title        13888
url          13888
comms_num    13888
flair        13888
score        13888
id           13888
created      13888
author       13888
body         13888
dtype: int64


In [7]:
# Preview the first five rows after the null fill ("na" replaces missing cells)
data_df.head(n=5)

Unnamed: 0,title,url,comms_num,flair,score,id,created,author,body
0,AskScience Panel of Scientists XXVIII,https://www.reddit.com/r/askscience/comments/1...,31,Panel Applications,359,10e3nfa,1673932275,AskScienceModerator,**Please read this entire post carefully and f...
1,AskScience’s concerns regarding Reddit’s API c...,https://www.reddit.com/r/askscience/comments/1...,1,na,40805,147epb8,1686543618,AskScienceModerator,"In April 2023, [Reddit announced they would be..."
2,"How can this substance ""Allkaline pyrogulic ac...",https://www.reddit.com/r/askscience/comments/1...,39,Chemistry,503,14wpwv1,1689078214,Ok-Sleep8655,na
3,After the universe reaches maximum entropy and...,https://www.reddit.com/r/askscience/comments/1...,295,Physics,857,14vyevw,1689005667,Torpaskor,"I've thought about this before, but im nowhere..."
4,AskScience AMA Series: We are hear to discuss ...,https://www.reddit.com/r/askscience/comments/1...,99,Neuroscience,447,14vqstn,1688986829,AskScienceModerator,Hello Reddit! We are researchers at the Nation...


In [8]:
# Text-cleaning setup.
# The cleaning step lower-cases the text, turns punctuation that usually
# separates tokens into spaces, drops every remaining character outside
# [0-9a-z #+_], and removes NLTK English stop words (commonly used words such
# as "the", "a", "an", "in").
# NOTE: no stemming (Porter or otherwise) is applied by the cleaning code.

# Raw strings keep the regex escapes (\[ \] \|) away from Python's own string
# escape processing; as plain literals they are invalid escape sequences and
# trigger a DeprecationWarning / SyntaxWarning on recent Python versions.
# The compiled patterns are unchanged.
REPLACE_BY_SPACE = re.compile(r'[/(){}\[\]\|@,;]')
REPLACE_SYMBOL = re.compile(r'[^0-9a-z #+_]')
# English stop words from NLTK, stored as a set; clean_data tests each token
# for membership in it.
STOPWORDS = set(stopwords.words('english'))

In [9]:
# Text-cleaning helper

def clean_data(text):
    """Return a normalised, stop-word-free version of ``text``.

    Steps, in order:
      1. parse ``text`` with BeautifulSoup's "lxml" parser and keep only the text,
      2. lower-case it,
      3. replace every REPLACE_BY_SPACE match with a single space,
      4. delete every REPLACE_SYMBOL match,
      5. split on whitespace, drop tokens found in STOPWORDS, and re-join the
         remaining tokens with single spaces.

    NOTE(review): the recorded cell output shows a warning raised from the
    BeautifulSoup call - presumably because some titles look like a URL or
    file name rather than markup; confirm before silencing it.
    """
    lowered = BeautifulSoup(text, "lxml").text.lower()
    spaced = REPLACE_BY_SPACE.sub(' ', lowered)
    stripped = REPLACE_SYMBOL.sub('', spaced)
    kept_tokens = [token for token in stripped.split() if token not in STOPWORDS]
    return ' '.join(kept_tokens)

# String-coercion helper

def stringConvert(text):
    """Return ``text`` converted to ``str`` via the built-in ``str()``."""
    as_string = str(text)
    return as_string

In [10]:
# Clean the title column in place: coerce each value to str first (presumably
# to guard clean_data against non-string cells), then run the cleaning helper.

data_df['title'] = data_df['title'].apply(stringConvert).apply(clean_data)

  text = BeautifulSoup(text, "lxml").text


In [11]:
# Keep only the two columns used for modelling: the cleaned title (input)
# and the flair (label).
data = data_df.loc[:, ['title', 'flair']]

In [12]:
# Preview the first five rows of the title/flair frame
data.head(n=5)

Unnamed: 0,title,flair
0,askscience panel scientists xxviii,Panel Applications
1,asksciences concerns regarding reddits api cha...,na
2,substance allkaline pyrogulic acid acidic basi...,Chemistry
3,universe reaches maximum entropy completes hea...,Physics
4,askscience ama series hear discuss link viral ...,Neuroscience


In [13]:
# Features (cleaned titles) and target labels (flairs)
x, y = data['title'], data['flair']

In [14]:
# Display the feature series (cleaned titles)
x

0                       askscience panel scientists xxviii
1        asksciences concerns regarding reddits api cha...
2        substance allkaline pyrogulic acid acidic basi...
3        universe reaches maximum entropy completes hea...
4        askscience ama series hear discuss link viral ...
                               ...                        
13883    incidence tobacco related illnesses among indi...
13884    temperature determine sex certain egg laying a...
13885         past tipping point insect population decline
13886    percentage prey animals die without becoming prey
13887        lepton flavor conserved oscillating neutrinos
Name: title, Length: 13888, dtype: object

In [15]:
# Display the target series (flair labels, including the "na" placeholder)
y

0        Panel Applications
1                        na
2                 Chemistry
3                   Physics
4              Neuroscience
                ...        
13883              Medicine
13884               Biology
13885               Biology
13886               Biology
13887               Physics
Name: flair, Length: 13888, dtype: object

In [16]:
# Hold out 25% of the rows for testing; random_state is fixed so the split is
# reproducible.
# NOTE(review): nothing before this point de-duplicates the scraped rows; if
# the CSV contains repeated posts they could land on both sides of the split
# and inflate the test scores - confirm.

X_train, X_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.25,
    random_state=0,
)

In [17]:
# Convert titles to TF-IDF features. The vectorizer is fitted on the training
# titles only and then reused, unchanged, to transform the test titles.
# NOTE: X_train / X_test are rebound here from raw text to TF-IDF matrices, so
# re-running this cell without re-running the split cell first would feed
# already-vectorized data back into the vectorizer.

vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(raw_documents=X_train)
X_test = vectorizer.transform(raw_documents=X_test)

In [18]:
# Instantiate a multinomial Naive Bayes classifier with default settings and
# fit it on the TF-IDF training features and their flair labels

model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [19]:
# Predict a flair for every held-out title

y_pred = model.predict(X=X_test)

In [20]:
# Evaluate the model on the held-out set.
# Precision, recall and F1 use average='weighted'. zero_division=0 is passed
# to all three so that any 0/0 case is scored as 0 under the same convention
# for every metric and none of them emits an UndefinedMetricWarning
# (previously only precision_score received it). The values are the same as
# with the default, which also scores such cases as 0 but warns.

accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted', zero_division=0)
recall = recall_score(y_test, y_pred, average='weighted', zero_division=0)
f1 = f1_score(y_test, y_pred, average='weighted', zero_division=0)

print(f'Accuracy: {accuracy}')
print(f'Precision: {precision}')
print(f'Recall: {recall}')
print(f'F1-score: {f1}')

Accuracy: 0.9815668202764977
Precision: 0.9781924019181403
Recall: 0.9815668202764977
F1-score: 0.9787892473925156
