# Exploring titles

This notebook explores the `controversial-*.csv` and `top-*.csv` datasets of Reddit article titles for the month of September 2020.

What signs indicate an article is controversial? 

Is the title alone enough to make this determination? After all, it's a running Reddit joke that Redditors read only the title, not the article.

Later I'll scrape and process the articles themselves and see what can be done.

In [1]:
# first we'll run through the standard bag of words. Get some counts. 
import numpy as np
import pandas as pd
import gensim
import sklearn as sk

In [9]:
# File-name suffixes shared by both listing types. The order fixes which file
# ends up in df_c0..df_c6 and df_t0..df_t6, so do not reorder it.
csv_suffixes = ('all', 'day', 'hour', 'month', 'today', 'week', 'year')

# Load every controversial dataset, one frame per suffix.
df_c0, df_c1, df_c2, df_c3, df_c4, df_c5, df_c6 = [
    pd.read_csv(f'controversial-{suffix}.csv') for suffix in csv_suffixes
]

# Load every top dataset, one frame per suffix.
df_t0, df_t1, df_t2, df_t3, df_t4, df_t5, df_t6 = [
    pd.read_csv(f'top-{suffix}.csv') for suffix in csv_suffixes
]

In [14]:
# Merge controversial: stack the seven controversial frames into a single frame
# with a fresh 0..n-1 index.
# pd.concat replaces the chained DataFrame.append calls, which were deprecated
# in pandas 1.4 and removed in pandas 2.0.
dfc = pd.concat(
    [df_c0, df_c1, df_c2, df_c3, df_c4, df_c5, df_c6],
    ignore_index=True,
)
dfc.shape

(3675, 9)

In [15]:
# Preview the first rows of the merged controversial frame.
dfc.head()

Unnamed: 0,title,score,upvote_ratio,id,url,comms_num,created,body,timestamp
0,Elon Musk's promised ventilators never deliver...,53,0.5,g2edod,https://www.foxcarolina.com/elon-musks-promise...,396,1587071000.0,,2020-04-16 21:55:55
1,ISIS orders all women and girls in Mosul to un...,342,0.56,2bl8d7,http://www.theguardian.com/world/2014/jul/24/i...,1110,1406234000.0,,2014-07-24 21:36:44
2,Justin Bieber Arrested for Drag Racing / DUI (...,730,0.71,1vxp05,http://www.nbcmiami.com/news/Justin-Bieber-Arr...,3612,1390507000.0,,2014-01-23 19:48:28
3,Kanye West halts Sydney concert after two fans...,7435,0.62,2gd9kr,http://www.independent.co.uk/arts-entertainmen...,2700,1410732000.0,,2014-09-14 22:52:35
4,Actress Ellen Page has come out as a lesbian,1080,0.73,1xyegt,http://www.hrc.org/,2300,1392464000.0,,2014-02-15 11:26:10


In [1]:
# Merge top: stack the seven top frames into a single frame with a fresh
# 0..n-1 index. Requires df_t0..df_t6 from the loading cell to exist, so run
# that cell first (a fresh kernel raises NameError otherwise).
# pd.concat replaces the chained DataFrame.append calls, which were deprecated
# in pandas 1.4 and removed in pandas 2.0.
dft = pd.concat(
    [df_t0, df_t1, df_t2, df_t3, df_t4, df_t5, df_t6],
    ignore_index=True,
)
dft.shape

NameError: name 'df_t0' is not defined

In [11]:
# Preview the first rows of the merged top frame.
dft.head()

Unnamed: 0,title,score,upvote_ratio,id,url,comms_num,created,body,timestamp
0,Blizzard Employees Staged a Walkout After the ...,226331,0.97,dfn3yi,https://www.thedailybeast.com/blizzard-employe...,9609,1570683000.0,,2019-10-10 05:45:17
1,Kobe Bryant killed in helicopter crash in Cali...,213688,0.91,eubjfc,https://www.fox5dc.com/news/kobe-bryant-killed...,20666,1580096000.0,,2020-01-27 03:37:58
2,Scientist Stephen Hawking has died aged 76,188182,0.92,84aebi,http://news.sky.com/story/scientist-stephen-ha...,6914,1521028000.0,,2018-03-14 11:45:28
3,Jeffrey Epstein's autopsy more consistent with...,186242,0.95,dp5lr1,https://www.foxnews.com/us/forensic-pathologis...,10048,1572465000.0,,2019-10-30 19:47:27
4,F.C.C. Announces Plan to Repeal Net Neutrality,177999,0.93,7ej943,https://www.nytimes.com/2017/11/21/technology/...,10848,1511312000.0,,2017-11-22 00:56:33


In [20]:
# Label every row with its source listing: True for rows that came from the
# controversial files, False for rows that came from the top files.
dfc = dfc.assign(controversial=True)
dft = dft.assign(controversial=False)

In [18]:
# Preview the controversial frame again to check the `controversial` column.
dfc.head()

Unnamed: 0,title,score,upvote_ratio,id,url,comms_num,created,body,timestamp,controversial
0,Elon Musk's promised ventilators never deliver...,53,0.5,g2edod,https://www.foxcarolina.com/elon-musks-promise...,396,1587071000.0,,2020-04-16 21:55:55,True
1,ISIS orders all women and girls in Mosul to un...,342,0.56,2bl8d7,http://www.theguardian.com/world/2014/jul/24/i...,1110,1406234000.0,,2014-07-24 21:36:44,True
2,Justin Bieber Arrested for Drag Racing / DUI (...,730,0.71,1vxp05,http://www.nbcmiami.com/news/Justin-Bieber-Arr...,3612,1390507000.0,,2014-01-23 19:48:28,True
3,Kanye West halts Sydney concert after two fans...,7435,0.62,2gd9kr,http://www.independent.co.uk/arts-entertainmen...,2700,1410732000.0,,2014-09-14 22:52:35,True
4,Actress Ellen Page has come out as a lesbian,1080,0.73,1xyegt,http://www.hrc.org/,2300,1392464000.0,,2014-02-15 11:26:10,True


In [24]:
# Drop rows that are exact duplicates across every column (no subset is given).
# NOTE(review): a post that appears in several listing files with a different
# score or comment count would survive this; confirm whether subset="id" is intended.
dfc = dfc.drop_duplicates()

In [25]:
# Drop rows that are exact duplicates across every column (no subset is given).
# NOTE(review): a post that appears in several listing files with a different
# score or comment count would survive this; confirm whether subset="id" is intended.
dft = dft.drop_duplicates()

In [32]:
# Collect the post ids present in the controversial frame.
controversial_ids = dfc["id"].tolist()
print(len(controversial_ids))
# Keep only the top rows whose id is absent from the controversial set, so that
# no post ends up carrying both labels.
dft = dft.loc[~dft["id"].isin(controversial_ids)]
print(dft.shape)

3603
(2931, 10)


In [34]:
# Combine the labelled controversial and top rows into one frame with a fresh
# 0..n-1 index. pd.concat replaces DataFrame.append, which was deprecated in
# pandas 1.4 and removed in pandas 2.0.
df = pd.concat([dfc, dft], ignore_index=True)
df.shape

(6534, 10)

In [35]:
# Persist the combined, labelled dataset. With to_csv's default index=True the
# row index is written as an unnamed first column of the file.
df.to_csv('top_and_controversial_lg.csv')