In [1]:
import pandas as pd

In [2]:
#import data reddit
reddit = pd.read_csv('reddit.csv')
reddit.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12


In [3]:
#import data reddit comments
comments = pd.read_csv('comments.csv')
comments.head(3)

Unnamed: 0,ID,COMMENTS,SUBREDDIT
0,ekkvox,If you have questions our [Discord Server](htt...,SuggestALaptop
1,ekkvox,Can anyone help me out with purchasing a lapto...,SuggestALaptop
2,fru7bw,The[ **MOTILE Laptop**](https://goto.walmart.c...,SuggestALaptop


In [4]:
#check data dimensions
print(reddit.shape)
print(comments.shape)

(5975, 9)
(14641, 3)


In [5]:
#columns that need cleaning in each dataframe:
#1) reddit = columns TITLE and BODY
#2) comments = column COMMENTS

In [6]:
#convert columns' type to string
reddit[['TITLE', 'BODY']] = reddit[['TITLE', 'BODY']].astype(str)
comments['COMMENTS'] = comments['COMMENTS'].astype(str)

In [7]:
#check datatype
print(reddit.dtypes)
print(comments.dtypes)

TITLE            object
SCORE             int64
ID               object
SUBREDDIT        object
URL              object
NUM_COMMENTS      int64
BODY             object
CREATED         float64
TIMESTAMP        object
dtype: object
ID           object
COMMENTS     object
SUBREDDIT    object
dtype: object


In [8]:
#convert character to ASCII
import unicodedata

reddit = pd.DataFrame.from_dict(reddit, orient='columns')
print(reddit['TITLE'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode()))
print(reddit['BODY'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode()))

comments = pd.DataFrame.from_dict(comments, orient='columns')
print(comments['COMMENTS'].apply(lambda val: unicodedata.normalize('NFKD', val).encode('ascii', 'ignore').decode()))

0       READ THIS BEFORE POSTING OR COMMMENTING. RULES...
1       /R/SuggestALaptop Stress Test Project! Submit ...
2       I, a 13 year old needs a cheap laptop that can...
3       Need something good for Wacom tablet use, and ...
4                                Advice in Gaming Laptops
5                            Good laptop around 500 (GER)
6                   Looking for gaming laptop for ~700 eu
7       A laptop which can run M&B: Bannerlord efficie...
8       Searching for laptop (<1000) for sound/video e...
9                                  FHD+ 14"-16" USA ~$700
10      Looking for a laptop for college work, sound +...
11      Looking for a laptop that's got better heating...
12      NEW 357F9 Laptop Battery Compatible with Inspi...
13           Looking for a High-Quality gaming ultrabook.
14                       Budget-$300 or less, Central U.S
15      Is the HP Omen 15 (i5 8300h rtx 2060 ) good/wo...
16                            USA, <$750, >15 inch screen
17            

## Normalization

In [9]:
import re

In [10]:
def  clean_text(df, text_field, new_text_field_name):
    df[new_text_field_name] = df[text_field].str.lower()
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"(@[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", elem))  
    # remove numbers
    df[new_text_field_name] = df[new_text_field_name].apply(lambda elem: re.sub(r"\d+", "", elem))
    
    return df

In [11]:
reddit_clean = clean_text(reddit, 'TITLE', 'TITLE_CLEAN')
reddit_clean.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP,TITLE_CLEAN
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20,read this before posting or commmenting rules ...
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07,rsuggestalaptop stress test project submit and...
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12,i a year old needs a cheap laptop that can ru...


In [12]:
reddit_clean = clean_text(reddit_clean, 'BODY', 'BODY_CLEAN')
reddit_clean.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP,TITLE_CLEAN,BODY_CLEAN
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20,read this before posting or commmenting rules ...,
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07,rsuggestalaptop stress test project submit and...,laptops this gen often have thermal or tdp thr...
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12,i a year old needs a cheap laptop that can ru...,i need a laptop that can run those games smoot...


In [13]:
comments_clean = clean_text(comments, 'COMMENTS', 'COMMENTS_CLEAN')
comments_clean.head(3)

Unnamed: 0,ID,COMMENTS,SUBREDDIT,COMMENTS_CLEAN
0,ekkvox,If you have questions our [Discord Server](htt...,SuggestALaptop,if you have questions our discord server is th...
1,ekkvox,Can anyone help me out with purchasing a lapto...,SuggestALaptop,can anyone help me out with purchasing a lapto...
2,fru7bw,The[ **MOTILE Laptop**](https://goto.walmart.c...,SuggestALaptop,the motile laptop would get you all that you n...


## Stop words removal

In [14]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/sidratulmuntaha/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [15]:
stop = stopwords.words('english')

reddit_clean['TITLE_CLEAN'] = reddit_clean['TITLE_CLEAN'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
reddit_clean.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP,TITLE_CLEAN,BODY_CLEAN
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20,read posting commmenting rules forms affiliate...,
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07,rsuggestalaptop stress test project submit vie...,laptops this gen often have thermal or tdp thr...
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12,year old needs cheap laptop run coolmathgames ...,i need a laptop that can run those games smoot...


In [16]:
reddit_clean['BODY_CLEAN'] = reddit_clean['BODY_CLEAN'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
reddit_clean.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP,TITLE_CLEAN,BODY_CLEAN
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20,read posting commmenting rules forms affiliate...,
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07,rsuggestalaptop stress test project submit vie...,laptops gen often thermal tdp throttling issue...
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12,year old needs cheap laptop run coolmathgames ...,need laptop run games smoothly cause damn love...


In [17]:
comments_clean['COMMENTS_CLEAN'] = comments_clean['COMMENTS_CLEAN'].apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
comments_clean.head(3)

Unnamed: 0,ID,COMMENTS,SUBREDDIT,COMMENTS_CLEAN
0,ekkvox,If you have questions our [Discord Server](htt...,SuggestALaptop,questions discord server best way get answers ...
1,ekkvox,Can anyone help me out with purchasing a lapto...,SuggestALaptop,anyone help purchasing laptop audio production...
2,fru7bw,The[ **MOTILE Laptop**](https://goto.walmart.c...,SuggestALaptop,motile laptop would get need budget fhd amd ry...


## Stemming

In [18]:
#tokenizing
import nltk 
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/sidratulmuntaha/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


In [19]:
reddit_clean['TITLE_TOKENS'] = reddit_clean['TITLE_CLEAN'].apply(lambda x: word_tokenize(x))
reddit_clean.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP,TITLE_CLEAN,BODY_CLEAN,TITLE_TOKENS
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20,read posting commmenting rules forms affiliate...,,"[read, posting, commmenting, rules, forms, aff..."
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07,rsuggestalaptop stress test project submit vie...,laptops gen often thermal tdp throttling issue...,"[rsuggestalaptop, stress, test, project, submi..."
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12,year old needs cheap laptop run coolmathgames ...,need laptop run games smoothly cause damn love...,"[year, old, needs, cheap, laptop, run, coolmat..."


In [20]:
reddit_clean['BODY_TOKENS'] = reddit_clean['BODY_CLEAN'].apply(lambda x: word_tokenize(x))
reddit_clean.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP,TITLE_CLEAN,BODY_CLEAN,TITLE_TOKENS,BODY_TOKENS
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20,read posting commmenting rules forms affiliate...,,"[read, posting, commmenting, rules, forms, aff...",[nan]
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07,rsuggestalaptop stress test project submit vie...,laptops gen often thermal tdp throttling issue...,"[rsuggestalaptop, stress, test, project, submi...","[laptops, gen, often, thermal, tdp, throttling..."
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12,year old needs cheap laptop run coolmathgames ...,need laptop run games smoothly cause damn love...,"[year, old, needs, cheap, laptop, run, coolmat...","[need, laptop, run, games, smoothly, cause, da..."


In [21]:
comments_clean['COMMENTS_TOKENS'] = comments_clean['COMMENTS_CLEAN'].apply(lambda x: word_tokenize(x))
comments_clean.head(3)

Unnamed: 0,ID,COMMENTS,SUBREDDIT,COMMENTS_CLEAN,COMMENTS_TOKENS
0,ekkvox,If you have questions our [Discord Server](htt...,SuggestALaptop,questions discord server best way get answers ...,"[questions, discord, server, best, way, get, a..."
1,ekkvox,Can anyone help me out with purchasing a lapto...,SuggestALaptop,anyone help purchasing laptop audio production...,"[anyone, help, purchasing, laptop, audio, prod..."
2,fru7bw,The[ **MOTILE Laptop**](https://goto.walmart.c...,SuggestALaptop,motile laptop would get need budget fhd amd ry...,"[motile, laptop, would, get, need, budget, fhd..."


In [22]:
#stemming
from nltk.stem import PorterStemmer

def word_stemmer(text):
    stem_text = [PorterStemmer().stem(i) for i in text]
    return stem_text

In [23]:
reddit_clean['TITLE_TOKENS_STEM'] = reddit_clean['TITLE_TOKENS'].apply(lambda x: word_stemmer(x))
reddit_clean.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP,TITLE_CLEAN,BODY_CLEAN,TITLE_TOKENS,BODY_TOKENS,TITLE_TOKENS_STEM
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20,read posting commmenting rules forms affiliate...,,"[read, posting, commmenting, rules, forms, aff...",[nan],"[read, post, commment, rule, form, affili, lin..."
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07,rsuggestalaptop stress test project submit vie...,laptops gen often thermal tdp throttling issue...,"[rsuggestalaptop, stress, test, project, submi...","[laptops, gen, often, thermal, tdp, throttling...","[rsuggestalaptop, stress, test, project, submi..."
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12,year old needs cheap laptop run coolmathgames ...,need laptop run games smoothly cause damn love...,"[year, old, needs, cheap, laptop, run, coolmat...","[need, laptop, run, games, smoothly, cause, da...","[year, old, need, cheap, laptop, run, coolmath..."


In [24]:
reddit_clean['BODY_TOKENS_STEM'] = reddit_clean['BODY_TOKENS'].apply(lambda x: word_stemmer(x))
reddit_clean.head(3)

Unnamed: 0,TITLE,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,BODY,CREATED,TIMESTAMP,TITLE_CLEAN,BODY_CLEAN,TITLE_TOKENS,BODY_TOKENS,TITLE_TOKENS_STEM,BODY_TOKENS_STEM
0,READ THIS BEFORE POSTING OR COMMMENTING. RULES...,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,,1496041000.0,2017-05-29 14:49:20,read posting commmenting rules forms affiliate...,,"[read, posting, commmenting, rules, forms, aff...",[nan],"[read, post, commment, rule, form, affili, lin...",[nan]
1,/R/SuggestALaptop Stress Test Project! Submit ...,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,Laptops this gen often have thermal or TDP thr...,1578295000.0,2020-01-06 15:23:07,rsuggestalaptop stress test project submit vie...,laptops gen often thermal tdp throttling issue...,"[rsuggestalaptop, stress, test, project, submi...","[laptops, gen, often, thermal, tdp, throttling...","[rsuggestalaptop, stress, test, project, submi...","[laptop, gen, often, thermal, tdp, throttl, is..."
2,"I, a 13 year old needs a cheap laptop that can...",71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,I need a laptop that can run those games smoot...,1585615000.0,2020-03-31 08:41:12,year old needs cheap laptop run coolmathgames ...,need laptop run games smoothly cause damn love...,"[year, old, needs, cheap, laptop, run, coolmat...","[need, laptop, run, games, smoothly, cause, da...","[year, old, need, cheap, laptop, run, coolmath...","[need, laptop, run, game, smoothli, caus, damn..."


In [25]:
comments_clean['COMMENTS_TOKENS_STEM'] = comments_clean['COMMENTS_TOKENS'].apply(lambda x: word_stemmer(x))
comments_clean.head(3)

Unnamed: 0,ID,COMMENTS,SUBREDDIT,COMMENTS_CLEAN,COMMENTS_TOKENS,COMMENTS_TOKENS_STEM
0,ekkvox,If you have questions our [Discord Server](htt...,SuggestALaptop,questions discord server best way get answers ...,"[questions, discord, server, best, way, get, a...","[question, discord, server, best, way, get, an..."
1,ekkvox,Can anyone help me out with purchasing a lapto...,SuggestALaptop,anyone help purchasing laptop audio production...,"[anyone, help, purchasing, laptop, audio, prod...","[anyon, help, purchas, laptop, audio, product,..."
2,fru7bw,The[ **MOTILE Laptop**](https://goto.walmart.c...,SuggestALaptop,motile laptop would get need budget fhd amd ry...,"[motile, laptop, would, get, need, budget, fhd...","[motil, laptop, would, get, need, budget, fhd,..."


In [26]:
#drop unwanted columns
reddit_new = reddit_clean.drop(['TITLE', 'BODY', 'TITLE_CLEAN', 'BODY_CLEAN', 'TITLE_TOKENS', 'BODY_TOKENS'], axis=1)
print(reddit_new.shape)
reddit_new.head(3)

(5975, 9)


Unnamed: 0,SCORE,ID,SUBREDDIT,URL,NUM_COMMENTS,CREATED,TIMESTAMP,TITLE_TOKENS_STEM,BODY_TOKENS_STEM
0,44,6dwp6l,SuggestALaptop,https://www.reddit.com/r/suggestalaptop/wiki/r...,0,1496041000.0,2017-05-29 14:49:20,"[read, post, commment, rule, form, affili, lin...",[nan]
1,22,ekkvox,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,2,1578295000.0,2020-01-06 15:23:07,"[rsuggestalaptop, stress, test, project, submi...","[laptop, gen, often, thermal, tdp, throttl, is..."
2,71,fru7bw,SuggestALaptop,https://www.reddit.com/r/SuggestALaptop/commen...,25,1585615000.0,2020-03-31 08:41:12,"[year, old, need, cheap, laptop, run, coolmath...","[need, laptop, run, game, smoothli, caus, damn..."


In [27]:
comments_new = comments_clean.drop(['COMMENTS', 'COMMENTS_CLEAN', 'COMMENTS_TOKENS'], axis=1)
print(comments_new.shape)
comments_new.head(3)

(14641, 3)


Unnamed: 0,ID,SUBREDDIT,COMMENTS_TOKENS_STEM
0,ekkvox,SuggestALaptop,"[question, discord, server, best, way, get, an..."
1,ekkvox,SuggestALaptop,"[anyon, help, purchas, laptop, audio, product,..."
2,fru7bw,SuggestALaptop,"[motil, laptop, would, get, need, budget, fhd,..."


In [28]:
#check for missing values
reddit_new.isnull().sum()

SCORE                0
ID                   0
SUBREDDIT            0
URL                  0
NUM_COMMENTS         0
CREATED              0
TIMESTAMP            0
TITLE_TOKENS_STEM    0
BODY_TOKENS_STEM     0
dtype: int64

In [29]:
comments_new.isnull().sum()

ID                      0
SUBREDDIT               0
COMMENTS_TOKENS_STEM    0
dtype: int64