# Reddit Submission Classification

In [1]:
import pandas as pd

In [2]:
posts = pd.read_csv('C:/Users/Tejaswini/Documents/MSIS_COURSE/Coursework/Summer_2022/Self_Practice/redditnlp_0na.csv',index_col=0)

In [3]:
posts.head()

Unnamed: 0,id,title,subreddit,body,num_comments,all_text
0,wopny7,Weekly Entering & Transitioning - Thread 15 Au...,datascience,\n\nWelcome to this week's entering & transit...,50,Weekly Entering & Transitioning - Thread 15 Au...
1,wqp4qm,When you are invited to a ‘town hall’ amidst a...,datascience,,85,When you are invited to a ‘town hall’ amidst a...
2,wrd1s7,Resume critique (trying to get an internship o...,datascience,,13,Resume critique (trying to get an internship o...
3,wrc8ad,Advice - Looking for a Data Scientist,datascience,Been trying to find a solid Data Scientist for...,51,Advice - Looking for a Data Scientist Been try...
4,wriszy,Recommendations for Udemy class on deploying a...,datascience,I use Python mostly. Trying to move from analy...,0,Recommendations for Udemy class on deploying a...


### Finding if any missing values

In [4]:
posts.isna().any()

id              False
title           False
subreddit       False
body             True
num_comments    False
all_text        False
dtype: bool

##### Body seems to be having missing values. Using fillna() to fill it with empty space

In [5]:
posts.body.fillna(' ',axis = 0, inplace=True)

In [6]:
posts['all_text'] = posts['title'] + ' '+posts['body']

In [7]:
posts.head()

Unnamed: 0,id,title,subreddit,body,num_comments,all_text
0,wopny7,Weekly Entering & Transitioning - Thread 15 Au...,datascience,\n\nWelcome to this week's entering & transit...,50,Weekly Entering & Transitioning - Thread 15 Au...
1,wqp4qm,When you are invited to a ‘town hall’ amidst a...,datascience,,85,When you are invited to a ‘town hall’ amidst a...
2,wrd1s7,Resume critique (trying to get an internship o...,datascience,,13,Resume critique (trying to get an internship o...
3,wrc8ad,Advice - Looking for a Data Scientist,datascience,Been trying to find a solid Data Scientist for...,51,Advice - Looking for a Data Scientist Been try...
4,wriszy,Recommendations for Udemy class on deploying a...,datascience,I use Python mostly. Trying to move from analy...,0,Recommendations for Udemy class on deploying a...


In [8]:
posts.isna().sum()

id              0
title           0
subreddit       0
body            0
num_comments    0
all_text        0
dtype: int64

#### Dropping the columns title and body

In [9]:
posts.drop(columns=['title','body'],axis=0,inplace = True)

In [10]:
posts.shape

(5729, 4)

#### Shuffling the records 

In [11]:
shuffled = posts.sample(frac=1).reset_index().drop(columns='index')

In [12]:
shuffled.head()

Unnamed: 0,id,subreddit,num_comments,all_text
0,w8nqot,artificial,1,How NASA AI Robot Assists Astronauts On Intern...
1,vjlhee,MachineLearning,4,[D] Publishing two papers at the same time Let...
2,vgmu9c,MachineLearning,4,[P] Colab Themes: A Chrome Extension to Custom...
3,iorbjg,datascience,86,Experience/Advice from a 10+ year data scienti...
4,wei6yb,datascience,0,Looking for: ESG / Impact / Sustainability dat...


In [13]:
shuffled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5729 entries, 0 to 5728
Data columns (total 4 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   id            5729 non-null   object
 1   subreddit     5729 non-null   object
 2   num_comments  5729 non-null   int64 
 3   all_text      5729 non-null   object
dtypes: int64(1), object(3)
memory usage: 179.2+ KB


In [14]:
shuffled.isna().sum()

id              0
subreddit       0
num_comments    0
all_text        0
dtype: int64

shuffled.dropna(axis = 0,how = 'any',inplace=True)

In [15]:
shuffled.shape

(5729, 4)

#### Removing links from the corpus

In [16]:
import regex as re

In [17]:
def remove_links(corpus):
    new_corpus = []
    for i in corpus:
        new_corpus.append(re.sub(r'http\S+', '', i))
    return new_corpus

In [18]:
shuffled['all_text_1']= pd.Series(remove_links(shuffled.all_text))

In [19]:
shuffled.sample(5)

Unnamed: 0,id,subreddit,num_comments,all_text,all_text_1
2824,cbnftu,MachineLearning,84,[News] DeepMind’s StarCraft II Agent AlphaStar...,[News] DeepMind’s StarCraft II Agent AlphaStar...
2286,q9hhqt,MachineLearning,61,[P] YoHa: A practical hand tracking engine.,[P] YoHa: A practical hand tracking engine.
2466,skc72q,datascience,137,What's a sign somebody's unusually good at SQL...,What's a sign somebody's unusually good at SQL...
2020,nino7x,datascience,88,"Need to go back to the basics, what's your fav...","Need to go back to the basics, what's your fav..."
2660,w59y4o,datascience,5,Missing observations at household level but ha...,Missing observations at household level but ha...


In [20]:
shuffled.isna().sum()

id              0
subreddit       0
num_comments    0
all_text        0
all_text_1      0
dtype: int64

### Clean Function

In [21]:
from gensim.utils import tokenize
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

def clean_text(text):
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
    tokens = list(tokenize(text))
    #res = ' '.join([stemmer.stem(t.lower()) for t in tokens if t.lower() not in stop_words]) 
    res = ' '.join([lemmatizer.lemmatize(t.lower()) for t in tokens if t.lower() not in stop_words]) 
    if len(res) == 0:
        return ' '
    else:
        return res

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Tejaswini\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Tejaswini\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Train-Test-Split

In [22]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(shuffled.all_text_1,shuffled.subreddit, random_state=3, test_size= 0.3)

In [23]:
X_train.shape

(4010,)

In [24]:
X_test.shape

(1719,)

In [25]:
y_train.shape

(4010,)

In [26]:
y_test.shape

(1719,)

In [27]:
X_train

3536    [P] Cosplayer Faces generate by Nvidia StyleGA...
2733    [D] Train-Valid-Test split and featurizer desi...
4218    Data Analyst Research Oppurtunity for High Sch...
5284    This Tumblr user had a neural net generate and...
4585    In the next five years, computer programs that...
                              ...                        
789     [R] The Annotated Diffusion Model From hugging...
968     Poisson reg Hi, I am currently trying to test ...
1667    It's crazy how effective it's to include "Data...
3321    [D] Have you ever been asked to work on a soft...
1688    [Project] I've compiled weather/climate date f...
Name: all_text_1, Length: 4010, dtype: object

In [28]:
y_train

3536    MachineLearning
2733    MachineLearning
4218        datascience
5284         artificial
4585         artificial
             ...       
789     MachineLearning
968         datascience
1667        datascience
3321    MachineLearning
1688    MachineLearning
Name: subreddit, Length: 4010, dtype: object

## Make Document-Term Matrices(Using CountVectorizer)

### Build Vectorizer

In [29]:
from sklearn.feature_extraction.text import CountVectorizer 
count_vect = CountVectorizer(preprocessor=clean_text, ngram_range=(1,1)) 
count_vect.fit(X_train) 

CountVectorizer(preprocessor=<function clean_text at 0x000002BD15144790>)

In [30]:
count_vect.get_feature_names()[10:25]

['_by',
 '_causal',
 '_check',
 '_clip_gradients',
 '_cpu',
 '_creator',
 '_cv',
 '_data',
 '_debug',
 '_detection',
 '_diffusion',
 '_dims',
 '_dir',
 '_e',
 '_estimators']

### Build Document-Term matrices

In [31]:
X_train_mat = count_vect.transform(X_train)

In [32]:
X_train_mat.shape

(4010, 15809)

In [33]:
X_test_mat = count_vect.transform(X_test)

In [34]:
X_test_mat.shape

(1719, 15809)

In [35]:
y_train.shape

(4010,)

In [36]:
y_train.isna().sum()

0

### Train with Naive Bayes

In [37]:
from sklearn.naive_bayes import MultinomialNB
cl = MultinomialNB()
cl.fit(X_train_mat,y_train)

MultinomialNB()

### Predict

In [38]:
y_pred = cl.predict(X_test_mat)
y_pred

array(['MachineLearning', 'artificial', 'MachineLearning', ...,
       'MachineLearning', 'artificial', 'MachineLearning'], dtype='<U15')

In [39]:
cl.predict_proba(X_test_mat)

array([[1.00000000e+000, 3.22157517e-301, 0.00000000e+000],
       [7.82914997e-002, 9.19756338e-001, 1.95216271e-003],
       [5.38977772e-001, 4.40373152e-001, 2.06490763e-002],
       ...,
       [9.99962472e-001, 2.02778121e-007, 3.73251068e-005],
       [1.22384514e-001, 8.41808988e-001, 3.58064978e-002],
       [8.48862566e-001, 1.29770957e-001, 2.13664765e-002]])

In [40]:
import numpy as np
ind = np.abs(cl.coef_[0]).argsort()



In [41]:
ind

array([ 8815,  7847,  9958, ...,  5195, 11505,  4325], dtype=int64)

In [42]:
ind[-1]

4325

In [43]:
count_vect.get_feature_names()[ind[-1]]

'ed'

In [44]:
features = count_vect.get_feature_names()

In [46]:
for i in range(20):
    index = ind[i]
    print(f'{i}: {features[index]}')

0: model
1: learning
2: paper
3: data
4: like
5: ai
6: machine
7: one
8: research
9: time
10: ml
11: using
12: use
13: work
14: would
15: training
16: image
17: network
18: also
19: deep


### Analyze Performance

In [48]:
from sklearn.metrics import classification_report
print(classification_report(y_test,y_pred))

                 precision    recall  f1-score   support

MachineLearning       0.72      0.80      0.75       598
     artificial       0.83      0.66      0.73       561
    datascience       0.80      0.88      0.84       560

       accuracy                           0.78      1719
      macro avg       0.78      0.78      0.78      1719
   weighted avg       0.78      0.78      0.78      1719



#### Classification report for ngrams ranging from 1 to 4

In [49]:
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
for i in range(1,5):
    count_vect = CountVectorizer(preprocessor=clean_text, ngram_range=(1,i)) 
    count_vect.fit(X_train) 
    #Build Document -term matrices
    X_train_mat = count_vect.transform(X_train)
    print(f'Shape of X_train_mat for {i} ngram/ngrams: '+ str(X_train_mat.shape))
    X_test_mat = count_vect.transform(X_test)
    print(f'Shape of X_test_mat for {i}  ngram/ngrams: '+ str(X_test_mat.shape))
    print(f'Shape of y_train for {i}  ngram/ngrams: '+ str(y_train.shape))
    print('------------------------------------------------------------------------------------------------------')
    cl = MultinomialNB()
    cl.fit(X_train_mat,y_train)
    y_pred = cl.predict(X_test_mat)
    print(f'y_pred values: '+ str(list(y_pred)[:10]))
    print('------------------------------------------------------------------------------------------------------')
    print(f'prediction probability: '+ str(cl.predict_proba(X_test_mat)[:10]))
    print('------------------------------------------------------------------------------------------------------')
    print(f'Classification Report for {i} ngram/ngrams: \n'+classification_report(y_test,y_pred))
    print('------------------------------------------------------------------------------------------------------')

Shape of X_train_mat for 1 ngram/ngrams: (4010, 15809)
Shape of X_test_mat for 1  ngram/ngrams: (1719, 15809)
Shape of y_train for 1  ngram/ngrams: (4010,)
------------------------------------------------------------------------------------------------------
y_pred values: ['MachineLearning', 'artificial', 'MachineLearning', 'artificial', 'MachineLearning', 'artificial', 'MachineLearning', 'datascience', 'datascience', 'datascience']
------------------------------------------------------------------------------------------------------
prediction probability: [[1.00000000e+000 3.22157517e-301 0.00000000e+000]
 [7.82914997e-002 9.19756338e-001 1.95216271e-003]
 [5.38977772e-001 4.40373152e-001 2.06490763e-002]
 [2.19783802e-002 9.77884104e-001 1.37516159e-004]
 [5.86144876e-001 4.08324160e-001 5.53096345e-003]
 [5.80538566e-002 9.38445722e-001 3.50042164e-003]
 [5.47286020e-001 3.71310571e-001 8.14034088e-002]
 [1.97471505e-005 3.77696883e-007 9.99979875e-001]
 [3.07210236e-002 9.0300852

The unigrams had better precision and F1 score than any other n gram range

## Make Document-Term Matrix (Using TFIDF vectorizer)

To analyze the prediction for the words that are not common among the 3 classes 

### Build Vectorizer

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(preprocessor= clean_text,ngram_range=(1,1))
tfidf.fit(X_train)

TfidfVectorizer(preprocessor=<function clean_text at 0x000002BD15144790>)

### Build Document-Term Matrices 

In [51]:
X_train_mat1 = tfidf.transform(X_train)

In [52]:
X_train_mat1.shape

(4010, 15809)

In [53]:
X_test_mat1 = tfidf.transform(X_test)

In [54]:
X_test_mat1.shape

(1719, 15809)

### Train with Naive Bayes 

In [55]:
from sklearn.naive_bayes import MultinomialNB
cl = MultinomialNB()

In [56]:
cl.fit(X_train_mat1,y_train)

MultinomialNB()

### Predict

In [57]:
y_pred1 = cl.predict(X_test_mat1)
y_pred1

array(['MachineLearning', 'artificial', 'MachineLearning', ...,
       'MachineLearning', 'artificial', 'MachineLearning'], dtype='<U15')

In [58]:
cl.predict_proba(X_test_mat1)

array([[0.89067176, 0.02341545, 0.08591279],
       [0.29615529, 0.49097407, 0.21287064],
       [0.43729809, 0.33734631, 0.2253556 ],
       ...,
       [0.62900399, 0.07409674, 0.29689927],
       [0.26651137, 0.48454461, 0.24894402],
       [0.51993187, 0.31824092, 0.1618272 ]])

In [60]:
ind = np.abs(cl.coef_[0]).argsort()

In [61]:
ind

array([ 8815,  7847,  9958, ..., 10583,  3748, 11168], dtype=int64)

In [62]:
features = tfidf.get_feature_names()

In [63]:
for i in range(20):
    index = ind[-1-i]
    print(f'{i}: {features[index]}')

0: quotation
1: diagnose
2: practises
3: pragmatic
4: dgp
5: pramukh
6: praying
7: prebuilt
8: dfs
9: dfphd
10: df_no_nulls_norm
11: precleaned
12: preconceived
13: predefine
14: df_no_nulls
15: predetermined
16: df_metrics
17: df
18: predicted_df
19: predicted_variable_name


In [64]:
for i in range(20):
    index = ind[i]
    print(f'{i}: {features[index]}')

0: model
1: learning
2: paper
3: machine
4: image
5: deep
6: ml
7: network
8: research
9: neural
10: using
11: data
12: ai
13: training
14: like
15: video
16: code
17: one
18: would
19: time


### Analyze Performance

In [65]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred1))

                 precision    recall  f1-score   support

MachineLearning       0.69      0.79      0.74       598
     artificial       0.89      0.55      0.68       561
    datascience       0.74      0.90      0.81       560

       accuracy                           0.75      1719
      macro avg       0.77      0.75      0.74      1719
   weighted avg       0.77      0.75      0.74      1719

