In [1]:
import re
from pathlib import Path

import nltk
import pandas as pd
import pymongo
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from pymongo import MongoClient

#### Establish connection with MongoDB using pymongo

In [2]:

# Connect to the MongoDB server on localhost, then take handles to the
# "Reddit_India_2" database and its "posts" collection.
client = MongoClient('mongodb://localhost:27017/')
db = client["Reddit_India_2"]
posts = db["posts"]

#### Function to remove symbols and stopwords from text

In [3]:
# Separator symbols: replaced by a space so the words around them do not fuse.
# Raw strings keep the backslash escapes from being read as (invalid) string escapes.
replace_with_space = re.compile(r'[/(){}\[\]\|@,;]')
# Anything other than a digit, lowercase ASCII letter, space, '#', '+' or '_' is deleted.
deleteSymbol = re.compile(r'[^0-9a-z #+_]')
# Bound to its own name so the imported `stopwords` corpus reader is not
# overwritten; rebinding it made this cell fail when executed a second time.
STOP_WORDS = frozenset(stopwords.words('english'))

def cleanText(text):
    """Normalise a piece of raw text into space-separated tokens.

    Steps, in order: strip HTML markup, lowercase, replace separator symbols
    and any run of whitespace with a single space, delete every remaining
    character outside ``[0-9a-z #+_]``, and drop English stopwords.

    Parameters
    ----------
    text : str
        Raw text, possibly containing HTML.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces (may be empty).
    """
    text = BeautifulSoup(text, "lxml").text  # HTML decoding
    text = text.lower()
    text = replace_with_space.sub(' ', text)
    # Newlines and tabs are not in deleteSymbol's keep-list, so turn them into
    # plain spaces first; otherwise words on adjacent lines would be fused.
    text = ' '.join(text.split())
    text = deleteSymbol.sub('', text)
    return ' '.join(word for word in text.split() if word not in STOP_WORDS)

Store the data in a pandas DataFrame

In [4]:
collection = db.posts
# Exclude MongoDB's internal `_id` field in the query itself rather than
# deleting the column afterwards: `del posts['_id']` raised KeyError when the
# collection was empty, because the resulting frame had no columns at all.
posts = pd.DataFrame(list(collection.find({}, projection={'_id': False})))


In [5]:
# Preview the first rows of the loaded frame to check its columns and values.
posts.head()

Unnamed: 0,title,score,id,url,comms_num,created,body,author,flair,over_18,comments,authors
0,"Lost my Job, Sick Mother and Paralysed Dad, In...",1053,g014wc,https://www.reddit.com/r/india/comments/g014wc...,134,1586742148.0,Hi....It's really tough time for everyone. I r...,sanand_satwik,AskIndia,False,I'm a freelancer. Don't listen to the idiots ...,hashedram diabapp xataari Aashayrao sarcrasti...
1,Why does the government come with a begging bo...,649,fxofyu,https://www.reddit.com/r/india/comments/fxofyu...,204,1586448261.0,"We have floods, terrorist attacks, famines due...",TWO-WHEELER-MAFIA,AskIndia,False,I don't understand why they don't use money f...,Kinky-Monk ak32009 fools_eye None DwncstSheep...
2,Mother's condition is going worse due to hepat...,762,g0zlly,https://www.reddit.com/r/india/comments/g0zlly...,94,1586870771.0,"Hi folks, I really appreciate the warm respons...",sanand_satwik,AskIndia,False,If anyone knows who is influential on Twitter...,AlternativeDrop6 TheRobotsHaveCome lanky32 pl...
3,People stuck with their family during the lock...,159,g4lrhm,https://www.reddit.com/r/india/comments/g4lrhm...,117,1587383829.0,I don't think we've spend so much time with fa...,GauGau24,AskIndia,False,yesterday we had a major fight. (me and my wi...,Best-Economist Srthak_ ppccbba tb33296 damnji...
4,How prominent is the caste system in India now...,111,g6tldd,https://www.reddit.com/r/india/comments/g6tldd...,107,1587700435.0,Does caste still exist in India? Do people sti...,Oomada9,AskIndia,False,Very much intact. I know a girl from UP who w...,Cierno Vpee26 ppccbba merlin318 nou_kar Buns4...


Clean the text in the title, body and comments columns of the DataFrame

In [6]:
# Coerce the three free-text columns to str. Missing values are replaced with
# '' first: a bare str() would turn them into the literal words "nan"/"None",
# which would then survive as tokens in the text.
for text_column in ('title', 'body', 'comments'):
    posts[text_column] = posts[text_column].fillna('').apply(str)

In [7]:

# Run the cleaning routine over each free-text column in turn.
for text_column in ('title', 'body', 'comments'):
    posts[text_column] = posts[text_column].apply(cleanText)


Introduce combined features and save the data to a CSV file.

In [8]:
def _join_columns(frame, *names):
    """Concatenate the named columns of `frame` row-wise, separated by single spaces."""
    joined = frame[names[0]]
    for name in names[1:]:
        joined = joined + " " + frame[name]
    return joined

title_comments = _join_columns(posts, "title", "comments")
title_body = _join_columns(posts, "title", "body")
# Note: the comments text comes first here, despite the variable's name.
body_comments = _join_columns(posts, "comments", "body")
title_comments_body = _join_columns(posts, "title", "comments", "body")
title_comments_url = _join_columns(posts, "title", "comments", "url")
all_features = _join_columns(posts, "title", "comments", "body", "url")

In [9]:

# Attach all six combined-text columns in a single call; the keyword order
# below is the order in which the new columns are appended.
posts = posts.assign(
    title_comments_body=title_comments_body,
    title_comments_url=title_comments_url,
    all_features=all_features,
    title_comments=title_comments,
    title_body=title_body,
    body_comments=body_comments,
)

# Save the CSV file. The path is built with pathlib because the previous
# hard-coded backslash is only a directory separator on Windows; the target
# folder is created first so the write does not fail when it is missing.
output_path = Path('csv_data') / 'redditIndia2.csv'
output_path.parent.mkdir(parents=True, exist_ok=True)
posts.to_csv(output_path, index=False)