In [3]:
import csv
import os
from collections import defaultdict
import pandas as pd
from nltk.corpus import stopwords
from textblob import TextBlob, Word
from gensim.scripts.glove2word2vec import glove2word2vec
from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from gensim.models import KeyedVectors # load the Stanford GloVe model
import ftfy
import string
from nltk.tokenize import word_tokenize
import pickle

#nltk.download()

In [4]:
#reading csv
train = pd.read_csv('articles_small.csv', encoding='ISO-8859-1',low_memory=False)
#train

In [5]:
train = train[train.notnull()]
#train

In [6]:
train = train.dropna(how='any') 
#train

In [7]:
heads = train['title']
#heads

In [8]:
descs = train['content']
#descs

In [9]:
title_list = []
for i in heads:
    title = ftfy.fix_text(i)
    title_list.append(title)  

In [10]:
title_list

['House Republicans Fret About Winning Their Health Care Suit - The New York Times',
 'Rift Between Officers and Residents as Killings Persist in South Bronx - The New York Times',
 "Tyrus Wong, 'Bambi' Artist Thwarted by Racial Bias, Dies at 106 - The New York Times",
 'Among Deaths in 2016, a Heavy Toll in Pop Music - The New York Times',
 'Kim Jong-un Says North Korea Is Preparing to Test Long-Range Missile - The New York Times',
 "Sick With a Cold, Queen Elizabeth Misses New Year's Service - The New York Times",
 "Taiwan's President Accuses China of Renewed Intimidation - The New York Times",
 "After 'The Biggest Loser,' Their Bodies Fought to Regain Weight - The New York Times",
 'First, a Mixtape. Then a Romance. - The New York Times',
 'Calling on Angels While Enduring the Trials of Job - The New York Times',
 "Weak Federal Powers Could Limit Trump's Climate-Policy Rollback - The New York Times",
 'Can Carbon Capture Technology Prosper Under Trump? - The New York Times',
 'Mar-a

In [11]:
content_list = []
for i in descs:
    descs = ftfy.fix_text(i)
    content_list.append(descs)

In [12]:
content_list[0]

'WASHINGTON  —   Congressional Republicans have a new fear when it comes to their    health care lawsuit against the Obama administration: They might win. The incoming Trump administration could choose to no longer defend the executive branch against the suit, which challenges the administration\'s authority to spend billions of dollars on health insurance subsidies for   and   Americans, handing House Republicans a big victory on    issues. But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode, leaving millions of people without access to health insurance before Republicans have prepared a replacement. That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government. To stave off that outcome, Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law, angering conservative voters who have been

In [13]:

title_list = [''.join(c for c in s if c not in string.punctuation) for s in title_list]


In [14]:
content_list = [''.join(c for c in s if c not in string.punctuation) for s in content_list]


In [15]:
tokenized_title = [word_tokenize(i) for i in title_list]

In [16]:
tokenized_title

[['House',
  'Republicans',
  'Fret',
  'About',
  'Winning',
  'Their',
  'Health',
  'Care',
  'Suit',
  'The',
  'New',
  'York',
  'Times'],
 ['Rift',
  'Between',
  'Officers',
  'and',
  'Residents',
  'as',
  'Killings',
  'Persist',
  'in',
  'South',
  'Bronx',
  'The',
  'New',
  'York',
  'Times'],
 ['Tyrus',
  'Wong',
  'Bambi',
  'Artist',
  'Thwarted',
  'by',
  'Racial',
  'Bias',
  'Dies',
  'at',
  '106',
  'The',
  'New',
  'York',
  'Times'],
 ['Among',
  'Deaths',
  'in',
  '2016',
  'a',
  'Heavy',
  'Toll',
  'in',
  'Pop',
  'Music',
  'The',
  'New',
  'York',
  'Times'],
 ['Kim',
  'Jongun',
  'Says',
  'North',
  'Korea',
  'Is',
  'Preparing',
  'to',
  'Test',
  'LongRange',
  'Missile',
  'The',
  'New',
  'York',
  'Times'],
 ['Sick',
  'With',
  'a',
  'Cold',
  'Queen',
  'Elizabeth',
  'Misses',
  'New',
  'Years',
  'Service',
  'The',
  'New',
  'York',
  'Times'],
 ['Taiwans',
  'President',
  'Accuses',
  'China',
  'of',
  'Renewed',
  'Intimidatio

In [17]:
tokenized_content = [word_tokenize(i) for i in content_list]

In [18]:
tokenized_content[0]

['WASHINGTON',
 '—',
 'Congressional',
 'Republicans',
 'have',
 'a',
 'new',
 'fear',
 'when',
 'it',
 'comes',
 'to',
 'their',
 'health',
 'care',
 'lawsuit',
 'against',
 'the',
 'Obama',
 'administration',
 'They',
 'might',
 'win',
 'The',
 'incoming',
 'Trump',
 'administration',
 'could',
 'choose',
 'to',
 'no',
 'longer',
 'defend',
 'the',
 'executive',
 'branch',
 'against',
 'the',
 'suit',
 'which',
 'challenges',
 'the',
 'administrations',
 'authority',
 'to',
 'spend',
 'billions',
 'of',
 'dollars',
 'on',
 'health',
 'insurance',
 'subsidies',
 'for',
 'and',
 'Americans',
 'handing',
 'House',
 'Republicans',
 'a',
 'big',
 'victory',
 'on',
 'issues',
 'But',
 'a',
 'sudden',
 'loss',
 'of',
 'the',
 'disputed',
 'subsidies',
 'could',
 'conceivably',
 'cause',
 'the',
 'health',
 'care',
 'program',
 'to',
 'implode',
 'leaving',
 'millions',
 'of',
 'people',
 'without',
 'access',
 'to',
 'health',
 'insurance',
 'before',
 'Republicans',
 'have',
 'prepared',
 

In [19]:
stop = stopwords.words('english')

In [20]:
filtered_title = [word for word in tokenized_title if word not in stop]


In [21]:
filtered_title

[['House',
  'Republicans',
  'Fret',
  'About',
  'Winning',
  'Their',
  'Health',
  'Care',
  'Suit',
  'The',
  'New',
  'York',
  'Times'],
 ['Rift',
  'Between',
  'Officers',
  'and',
  'Residents',
  'as',
  'Killings',
  'Persist',
  'in',
  'South',
  'Bronx',
  'The',
  'New',
  'York',
  'Times'],
 ['Tyrus',
  'Wong',
  'Bambi',
  'Artist',
  'Thwarted',
  'by',
  'Racial',
  'Bias',
  'Dies',
  'at',
  '106',
  'The',
  'New',
  'York',
  'Times'],
 ['Among',
  'Deaths',
  'in',
  '2016',
  'a',
  'Heavy',
  'Toll',
  'in',
  'Pop',
  'Music',
  'The',
  'New',
  'York',
  'Times'],
 ['Kim',
  'Jongun',
  'Says',
  'North',
  'Korea',
  'Is',
  'Preparing',
  'to',
  'Test',
  'LongRange',
  'Missile',
  'The',
  'New',
  'York',
  'Times'],
 ['Sick',
  'With',
  'a',
  'Cold',
  'Queen',
  'Elizabeth',
  'Misses',
  'New',
  'Years',
  'Service',
  'The',
  'New',
  'York',
  'Times'],
 ['Taiwans',
  'President',
  'Accuses',
  'China',
  'of',
  'Renewed',
  'Intimidatio

In [22]:
filtered_content = [word for word in tokenized_content if word not in stop]


In [23]:
filtered_content

[['WASHINGTON',
  '—',
  'Congressional',
  'Republicans',
  'have',
  'a',
  'new',
  'fear',
  'when',
  'it',
  'comes',
  'to',
  'their',
  'health',
  'care',
  'lawsuit',
  'against',
  'the',
  'Obama',
  'administration',
  'They',
  'might',
  'win',
  'The',
  'incoming',
  'Trump',
  'administration',
  'could',
  'choose',
  'to',
  'no',
  'longer',
  'defend',
  'the',
  'executive',
  'branch',
  'against',
  'the',
  'suit',
  'which',
  'challenges',
  'the',
  'administrations',
  'authority',
  'to',
  'spend',
  'billions',
  'of',
  'dollars',
  'on',
  'health',
  'insurance',
  'subsidies',
  'for',
  'and',
  'Americans',
  'handing',
  'House',
  'Republicans',
  'a',
  'big',
  'victory',
  'on',
  'issues',
  'But',
  'a',
  'sudden',
  'loss',
  'of',
  'the',
  'disputed',
  'subsidies',
  'could',
  'conceivably',
  'cause',
  'the',
  'health',
  'care',
  'program',
  'to',
  'implode',
  'leaving',
  'millions',
  'of',
  'people',
  'without',
  'acce

In [24]:
title_new = [' '.join(c for c in s if c not in string.punctuation) for s in filtered_title]

In [25]:
content_new = [' '.join(c for c in s if c not in string.punctuation) for s in filtered_content]

In [26]:
title_new

['House Republicans Fret About Winning Their Health Care Suit The New York Times',
 'Rift Between Officers and Residents as Killings Persist in South Bronx The New York Times',
 'Tyrus Wong Bambi Artist Thwarted by Racial Bias Dies at 106 The New York Times',
 'Among Deaths in 2016 a Heavy Toll in Pop Music The New York Times',
 'Kim Jongun Says North Korea Is Preparing to Test LongRange Missile The New York Times',
 'Sick With a Cold Queen Elizabeth Misses New Years Service The New York Times',
 'Taiwans President Accuses China of Renewed Intimidation The New York Times',
 'After The Biggest Loser Their Bodies Fought to Regain Weight The New York Times',
 'First a Mixtape Then a Romance The New York Times',
 'Calling on Angels While Enduring the Trials of Job The New York Times',
 'Weak Federal Powers Could Limit Trumps ClimatePolicy Rollback The New York Times',
 'Can Carbon Capture Technology Prosper Under Trump The New York Times',
 'MaraLago the Future Winter White House and Home 

In [27]:
content_new

['WASHINGTON — Congressional Republicans have a new fear when it comes to their health care lawsuit against the Obama administration They might win The incoming Trump administration could choose to no longer defend the executive branch against the suit which challenges the administrations authority to spend billions of dollars on health insurance subsidies for and Americans handing House Republicans a big victory on issues But a sudden loss of the disputed subsidies could conceivably cause the health care program to implode leaving millions of people without access to health insurance before Republicans have prepared a replacement That could lead to chaos in the insurance market and spur a political backlash just as Republicans gain full control of the government To stave off that outcome Republicans could find themselves in the awkward position of appropriating huge sums to temporarily prop up the Obama health care law angering conservative voters who have been demanding an end to the

In [28]:
final_list = pd.DataFrame(
    {'heads': title_new,
     'descs': content_new,
    })

In [29]:
final_list.to_pickle('tokenized_data.pickle')

In [30]:
df2 = pd.read_pickle('tokenized_data.pickle')
train_data = df2.iloc[:100000]
train_data.to_pickle('train_data.pkl')
validation_data = df2.iloc[100001:130000]
validation_data.to_pickle('validation_data.pkl')
test_data = df2.iloc[130001:142568]
test_data.to_pickle('test_data.pkl')