In [1]:
import os
import glob
import json
import time
import requests
import datetime
import dateutil
import pandas as pd
import re
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
import matplotlib.pyplot as plt
from dateutil.relativedelta import relativedelta
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
from google.colab import drive
import tensorflow as tf
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from keras.models import load_model
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
drive.mount('/drive')

Mounted at /drive


In [3]:
end = datetime.date.today()
start = end - relativedelta(years=2)
print(end, start)

2021-05-13 2019-05-13


In [4]:
months_in_range = [x.split(' ') for x in pd.date_range(start, end, freq='MS').strftime("%Y %-m").tolist()]
print(months_in_range)

[['2019', '6'], ['2019', '7'], ['2019', '8'], ['2019', '9'], ['2019', '10'], ['2019', '11'], ['2019', '12'], ['2020', '1'], ['2020', '2'], ['2020', '3'], ['2020', '4'], ['2020', '5'], ['2020', '6'], ['2020', '7'], ['2020', '8'], ['2020', '9'], ['2020', '10'], ['2020', '11'], ['2020', '12'], ['2021', '1'], ['2021', '2'], ['2021', '3'], ['2021', '4'], ['2021', '5']]


In [5]:
def send_request(date):
    '''Sends a request to the NYT Archive API for given date.'''
    base_url = 'https://api.nytimes.com/svc/archive/v1/'
    url = base_url  + date[0] + '/' + date[1] + '.json?api-key=' + '5bBmcpCW4fOGtnBURGCXLoVFo887iwWX'
    response = requests.get(url).json()
    time.sleep(6)
    return response


def is_valid(article, date):
    '''An article is only worth checking if it is in range, and has a headline.'''
    is_in_range = date > start and date < end
    has_headline = type(article['headline']) == dict and 'main' in article['headline'].keys()
    return is_in_range and has_headline


def parse_response(response):
    '''Parses and returns response as pandas data frame.'''
    data = {'headline': [],  
        'date': [], 
        'doc_type': [],
        'material_type': [],
        'section': [],
        'news_desk':[],
        'abstract':[],
        'keywords': [],
        'lead_paragraph':[],
        'snippet':[]}
    
    articles = response['response']['docs'] 

    for article in articles: 
        date = dateutil.parser.parse(article['pub_date']).date()
        if is_valid(article, date):
            data['date'].append(date)
            data['headline'].append(article['headline']['main']) 
            if 'section_name' in article:
                data['section'].append(article['section_name'])
            else:
                data['section'].append(None)
            data['doc_type'].append(article['document_type'])
            if 'type_of_material' in article: 
                data['material_type'].append(article['type_of_material'])
            else:
                data['material_type'].append(None)
            if 'abstract' in article: 
                data['abstract'].append(article['abstract'])
            else:
                data['abstract'].append(None)
            if 'news_desk' in article: 
                data['news_desk'].append(article['news_desk'])
            else:
                data['news_desk'].append(None)
            if 'lead_paragraph' in article: 
                data['lead_paragraph'].append(article['lead_paragraph'])
            else:
                data['lead_paragraph'].append(None)
            if 'snippet' in article: 
                data['snippet'].append(article['snippet'])
            else:
                data['snippet'].append(None)
            keywords = [keyword['value'] for keyword in article['keywords'] if keyword['name'] == 'subject']
            new_keywords = listToString(keywords)
            data['keywords'].append(new_keywords)
    return pd.DataFrame(data) 

def listToString(s):  
    str1 = ""  
    for ele in s:  
        str1 += (ele+",")  
    return str1

def word_cloud(text,color):
    wordcloud = WordCloud(max_font_size=50, max_words=100, background_color=color).generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

def get_data(dates):
    '''Sends and parses request/response to/from NYT Archive API for given dates.'''
    total = 0
    print('Date range: ' + str(dates[0]) + ' to ' + str(dates[-1]))
    if not os.path.exists('headlines'):
        os.mkdir('headlines')
    for date in dates:
        response = send_request(date)
        df = parse_response(response)
        total += len(df)
        df.to_csv('headlines/' + date[0] + '-' + date[1] + '.csv', index=False)
        print('Saving headlines/' + date[0] + '-' + date[1] + '.csv...')
    print('Number of articles collected: ' + str(total))

def preprocess_text(text):
    text = text.lower().replace('\n',' ').replace('\r','').strip()
    text = re.sub(' +',' ',text)
    text = re.sub(r'[^\w\s]','',text)
    return text

def tokenize_stopwords(text):
    stop_words = set(stopwords.words('english'))
    word_token = word_tokenize(text)
    filter_sentence = [w for w in word_token if not w in stop_words]
    text = ' '.join(filter_sentence)
    return text


def stemming(messages):
    ps = PorterStemmer()
    corpus = []
    for i in range(0, len(messages)):
      print(i)
      review = re.sub('[^a-zA-Z]', ' ', messages[i])
      review = review.lower()
      review = review.split()

      review = [ps.stem(word) for word in review if not word in stopwords.words('english')]
      review = ' '.join(review)
      corpus.append(review)
    return corpus  


In [6]:
get_data(months_in_range)

Date range: ['2019', '6'] to ['2021', '5']
Saving headlines/2019-6.csv...
Saving headlines/2019-7.csv...
Saving headlines/2019-8.csv...
Saving headlines/2019-9.csv...
Saving headlines/2019-10.csv...
Saving headlines/2019-11.csv...
Saving headlines/2019-12.csv...
Saving headlines/2020-1.csv...
Saving headlines/2020-2.csv...
Saving headlines/2020-3.csv...
Saving headlines/2020-4.csv...
Saving headlines/2020-5.csv...
Saving headlines/2020-6.csv...
Saving headlines/2020-7.csv...
Saving headlines/2020-8.csv...
Saving headlines/2020-9.csv...
Saving headlines/2020-10.csv...
Saving headlines/2020-11.csv...
Saving headlines/2020-12.csv...
Saving headlines/2021-1.csv...
Saving headlines/2021-2.csv...
Saving headlines/2021-3.csv...
Saving headlines/2021-4.csv...
Saving headlines/2021-5.csv...
Number of articles collected: 108445


In [7]:
os.chdir("/content/headlines")
extension = 'csv'
all_filenames = [i for i in glob.glob('*.{}'.format(extension))]
combined_csv = pd.concat([pd.read_csv(f) for f in all_filenames ])
combined_csv.to_csv( "combined_csv.csv", index=False, encoding='utf-8-sig')

In [9]:
data = pd.read_csv('/content/headlines/combined_csv.csv')
data.head()
len(data)

108445

In [None]:
sections_abstract_count = data.groupby('section')['abstract'].nunique()
print(sections_abstract_count)
print(len(sections_abstract_count))

In [None]:
data['section'].replace('T Magazine', 'Magazine', inplace = True) 
data['section'].replace(['Movies', 'Arts', 'Theater'], 'Entertainment', inplace = True) 

In [None]:
data = data[data.groupby('section').section.transform('count')>=2000].copy()
data.shape

In [2]:
sections_abstract_count1 = data.groupby('section')['abstract'].nunique()
sections_abstract_count1
len(data['section'].value_counts())

NameError: ignored

In [None]:
data.isnull().any()

In [None]:
data.isnull().sum()

In [None]:
data.dropna(subset=['headline','material_type', 'keywords', 'snippet','abstract','news_desk','lead_paragraph'],axis=0,inplace=True)
data.isnull().sum()

In [None]:
sns.set(rc={'figure.figsize':(20,15)})
sns.countplot(data.section)

In [None]:
data['news_length'] = data['headline'].str.len() + data['abstract'].str.len() + data['lead_paragraph'].str.len() + data['keywords'].str.len()
data['news_length']
data['news_length'].max()

In [None]:
data['text'] = data['headline'] + " "+ data['abstract'] + " "+ data['lead_paragraph'] +" "+ data['keywords']
data['text'][0]

In [None]:
sns.set()
_ = plt.hist(data['news_length'],bins=70)
_ = plt.xlabel("length")
_ = plt.ylabel("count")
plt.show()

In [None]:
categories = data['section'].unique()
i = 1
for category in categories:
  subset = data[data.section == category]
  sns.set(rc={'figure.figsize':(12,10)})
  text = subset.abstract.values + subset.headline.values + subset.lead_paragraph.values + subset.keywords.values + subset.news_desk.values
  word = ' '.join(text)
  print('\n' + str(i) + '. ' + category.upper() + '\n')
  if (i % 2 == 0):
    word_cloud(word,'white')
  else:
    word_cloud(word,'black')
  i = i + 1

In [None]:
section_codes = {'U.S.': 0,
'Entertainment': 1,   
'World': 2,
'Opinion': 3, 
'Business Day': 4, 
'Sports': 5,
'New York': 6,
'Books': 7,
'Style': 8,
'Magazine': 9,
'Food': 10,
'Real Estate': 11,
'Briefing': 12}

In [None]:
data['section_code'] = data['section']
data = data.replace({'section_code':section_codes})

In [None]:
print(len(data))
y = [0,1,2,3,4,5,6,7,8,9,10,11]
for x in y:
  print(data[data['section_code'] == x])
  x +=1
print(len(data))

In [None]:
lstm_data = data.copy()

In [None]:
voc_size = 1500

In [None]:
X_train, X_test, y_train,y_test = train_test_split(lstm_data['text'],lstm_data['section_code'],test_size = 0.2,random_state=8)

In [None]:
messages = lstm_data['text'].copy()
messages = messages.reset_index(drop= True)

In [None]:
corpus = stemming(messages)

In [None]:
corpus

In [None]:
totalLne = len(corpus)
i = 0
maxLen = 0
while i < totalLne:
  currentLen = len(corpus[i])
  if currentLen > maxLen:
    maxLen = currentLen
  i += 1
print(maxLen)

In [None]:
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
onehot_repr

In [None]:
sent_length=1600
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
print(embedded_docs)

In [None]:
train_label = data['section_code']
len(train_label.value_counts())

In [None]:
train_label = train_label.reset_index(drop= True)

In [None]:
print(len(train_label), len(corpus))

In [None]:
embedding_dim = 64
model = tf.keras.Sequential([
                              tf.keras.layers.Embedding(voc_size, embedding_dim),
                              tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True)),
                              tf.keras.layers.Conv1D(128, 5, activation='relu', input_shape = (None, 128, 1)),
                              tf.keras.layers.GlobalAveragePooling1D(),
                              tf.keras.layers.Dense(64,activation= 'relu'),
                              tf.keras.layers.Dropout(0.3),
                              tf.keras.layers.Dense(embedding_dim,activation= 'relu'),
                              tf.keras.layers.Dropout(0.3),
                              tf.keras.layers.Dense(13, activation= 'softmax')

])


model.summary()
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
import numpy as np
X_final=(embedded_docs)
y_final=(train_label)

In [None]:
from sklearn.model_selection import train_test_split
X_train_lstm, X_test_lstm, y_train_lstm, y_test_lstm = train_test_split(X_final, y_final, test_size=0.33, random_state=14)

In [None]:
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=8,batch_size=64)

In [None]:
model.save("BidirectionalLSTM.h5")

Implementing ML models

In [None]:
#splitting data into test & train
X_train, X_test, y_train,y_test = train_test_split(data['parsed_text'],data['section_code'],test_size = 0.2,random_state=8)

In [None]:
print(X_train.shape)
print(y_train.shape)
print(y_test.shape)
print(X_test.shape)

In [None]:
ngram_range = (1,2)
min_df = 10
max_df = 1.
max_features = 30000

In [None]:
tfidf = TfidfVectorizer(encoding='utf-8',
                        ngram_range = ngram_range,
                        stop_words = None,
                        lowercase = False,
                        max_df = max_df,
                        min_df = min_df,
                        max_features = max_features,
                        norm = 'l2',
                        sublinear_tf = True
                        )
train_features = tfidf.fit_transform(X_train).toarray()
train_label = y_train

test_features = tfidf.transform(X_test).toarray()
test_label = y_test

In [None]:
print(test_label.shape)
print(train_label.shape)
print(train_features.shape)
print(test_features.shape)

##Random Forest Model

In [None]:
import pickle
model = RandomForestClassifier()
model.fit(train_features,train_label)
filename = 'finalized_model'
pickle.dump(model, open(filename, 'wb'))
predictions = model.predict(test_features)
print(accuracy_score(test_label,predictions))
print(classification_report(test_label,predictions))

##Logistic Regression


In [None]:
import pickle
from sklearn.linear_model import LogisticRegression
model = LogisticRegression(multi_class='multinomial', solver='lbfgs')
model.fit(train_features,train_label)
filename = 'logit_model.pkl'
pickle.dump(model, open(filename, 'wb'))
predictions = model.predict(test_features)
print(accuracy_score(test_label,predictions))
print(classification_report(test_label,predictions))
print(predictions)

In [None]:
from sklearn.tree import DecisionTreeClassifier
model = DecisionTreeClassifier()
model.fit(train_features,train_label)
predictions = model.predict(test_features)
print(accuracy_score(test_label,predictions))
print(classification_report(test_label,predictions))

In [None]:
from sklearn.naive_bayes import GaussianNB
model = GaussianNB()
model.fit(train_features,train_label)
predictions = model.predict(test_features)
print(accuracy_score(test_label,predictions))
print(classification_report(test_label,predictions))
print(predictions)