In [3]:
import pandas as pd
import numpy as np
import json


section_stat = {}
corpus = []

with open('NewYorkTimesClean.jsonl') as f:
    for i, line in enumerate(f):
        meta = json.loads(line)
        section = meta['section']
        paragraph = meta['lead_paragraph']
        if section not in section_stat:
            section_stat[section] = []
        section_stat[section].append(i)
        if paragraph is not None:
            corpus.append(paragraph)

for label, count in sorted([(k, len(v)) for k, v in section_stat.items()], key=lambda x: x[1], reverse=True):
    print("Section: %s -- Count: %s" % (label, count))

Section: sports -- Count: 13340
Section: business day -- Count: 12999
Section: u.s. -- Count: 12323
Section: arts -- Count: 11661
Section: paid death notices -- Count: 10608
Section: opinion -- Count: 9875
Section: world -- Count: 9621
Section: n.y. / region -- Count: 8166
Section: new york and region -- Count: 4579
Section: technology -- Count: 3852
Section: business -- Count: 2913
Section: fashion & style -- Count: 2400
Section: movies -- Count: 2398
Section: health -- Count: 2102
Section: science -- Count: 1854
Section: books -- Count: 1806
Section: travel -- Count: 1681
Section: style -- Count: 1655
Section: real estate -- Count: 1575
Section: magazine -- Count: 1241
Section: t:style -- Count: 1095


In [4]:
print(len(corpus))

106319


In [5]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
bag_of_words = vectorizer.fit_transform(corpus)

In [6]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

In [7]:
df = pd.read_json('NewYorkTimesClean.jsonl', lines=True)
df = df.replace(to_replace='None', value=np.nan).dropna(subset=['lead_paragraph'])
df.head()

Unnamed: 0,headline,keywords,lead_paragraph,section
0,,[],MAJOR LEAGUE BASEBALL American League YANKEES-...,sports
1,,"[{'name': 'persons', 'value': 'MARTINEZ, TINO'...",Tino Martinez does not know where he fits in w...,sports
2,,"[{'name': 'subject', 'value': 'BOWL GAMES'}, {...",ROSE BOWL No. 6 Texas (10-1) vs. No.13 Michiga...,sports
3,,"[{'name': 'persons', 'value': 'JOHNSON, RANDY'...","Now batting: the center fielder, No. 15, Carlo...",sports
4,,"[{'name': 'persons', 'value': 'SHOCKEY, JEREMY...",The Giants have listed receiver Amani Toomer a...,sports


In [8]:
len(df)

106319

In [9]:
len(df[df.section=='sports'])

12443

In [10]:
df.loc[df['section']=='sports','section']=0
df.loc[df['section']=='business day','section']=1
df.loc[df['section']=='u.s.','section']=2
df.loc[df['section']=='arts','section']=3
df.loc[df['section']=='paid death notices','section']=4
df.loc[df['section']=='opinion','section']=5
df.loc[df['section']=='world','section']=6
df.loc[df['section']=='n.y. / region','section']=7
df.loc[df['section']=='new york and region','section']=8
df.loc[df['section']=='technology','section']=9
df.loc[df['section']=='business','section']=10
df.loc[df['section']=='fashion & style','section']=11
df.loc[df['section']=='movies','section']=12
df.loc[df['section']=='health','section']=13
df.loc[df['section']=='science','section']=14
df.loc[df['section']=='books','section']=15
df.loc[df['section']=='travel','section']=16
df.loc[df['section']=='style','section']=17
df.loc[df['section']=='real estate','section']=18
df.loc[df['section']=='magazine','section']=19
df.loc[df['section']=='t:style','section']=20

In [11]:
df.head()

Unnamed: 0,headline,keywords,lead_paragraph,section
0,,[],MAJOR LEAGUE BASEBALL American League YANKEES-...,0
1,,"[{'name': 'persons', 'value': 'MARTINEZ, TINO'...",Tino Martinez does not know where he fits in w...,0
2,,"[{'name': 'subject', 'value': 'BOWL GAMES'}, {...",ROSE BOWL No. 6 Texas (10-1) vs. No.13 Michiga...,0
3,,"[{'name': 'persons', 'value': 'JOHNSON, RANDY'...","Now batting: the center fielder, No. 15, Carlo...",0
4,,"[{'name': 'persons', 'value': 'SHOCKEY, JEREMY...",The Giants have listed receiver Amani Toomer a...,0


In [12]:
df_x = df['lead_paragraph']
df_y = df['section']

In [13]:
df_x

0         MAJOR LEAGUE BASEBALL American League YANKEES-...
1         Tino Martinez does not know where he fits in w...
2         ROSE BOWL No. 6 Texas (10-1) vs. No.13 Michiga...
3         Now batting: the center fielder, No. 15, Carlo...
4         The Giants have listed receiver Amani Toomer a...
5         Quarterback Jake Delhomme said he thought it w...
6         Maj. Billy Smith retired from the Florida Stat...
7         SOMETIME between Wednesday night and yesterday...
8         With the Bowl Championship Series formula dest...
9         Jets Coach Herman Edwards says he has been sle...
10        Stephon Marbury strolled off the Knicks' pract...
11        The Nets and the Knicks have rarely enjoyed su...
12        In his 24-year odyssey through higher educatio...
13        To the Sports Editor: Re Murray Chass and the ...
14        To the Sports Editor: As much as I like the ki...
15        As the close of the year brings home the lesso...
16        To the Sports Editor: Your tri

In [14]:
cv = CountVectorizer()

In [15]:
x_train, x_test, y_train, y_test = train_test_split(df_x, df_y, test_size=0.2, random_state=4)

In [16]:
x_train.head()

3900     Philadelphia fans have mixed feelings about An...
25437    Last year generated $4.7 trillion in announced...
33989    Mitt Romney enters Florida on Monday wounded b...
28993    A woman has been indicted on charges saying sh...
42079    T. L. Stanley is locked back in the press room...
Name: lead_paragraph, dtype: object

In [17]:
x_traincv = cv.fit_transform(x_train)

In [18]:
x_testcv = cv.transform(x_test)

In [19]:
mnb = MultinomialNB()

y_train = y_train.astype('int')
mnb.fit(x_traincv , y_train)
testmessage=x_test.iloc[0]
predictions=mnb.predict(x_testcv)

predictions

array([3, 2, 6, ..., 0, 3, 3])

In [32]:
cv_y = CountVectorizer()
#y_testcv = cv_y.fit_transform(y_test)
y_test = y_test.astype('int')
mnb_test_score = mnb.score(x_testcv, y_test)
mnb_train_score = mnb.score(x_traincv, y_train)
print('Multinomial NB accuracy score with train data is {}'.format(mnb_train_score))
print('Multinomial NB accuracy score with test data is {}'.format(mnb_test_score))

Multinomial NB accuracy score with train data is 0.7123625889130563
Multinomial NB accuracy score with test data is 0.6292325056433409


In [27]:
from sklearn.linear_model import SGDClassifier
svm_clf = SGDClassifier(max_iter=5, tol=None, loss='hinge', random_state=5234)
svm_clf.fit(x_traincv, y_train)

SGDClassifier(alpha=0.0001, average=False, class_weight=None, epsilon=0.1,
       eta0=0.0, fit_intercept=True, l1_ratio=0.15,
       learning_rate='optimal', loss='hinge', max_iter=5, n_iter=None,
       n_jobs=1, penalty='l2', power_t=0.5, random_state=5234,
       shuffle=True, tol=None, verbose=0, warm_start=False)

In [31]:
svm_train_score = svm_clf.score(x_traincv, y_train)
svm_test_score = svm_clf.score(x_testcv, y_test)
print('SVM accuracy score with train data is {}'.format(svm_train_score))
print('SVM accuracy score with test data is {}'.format(svm_test_score))

SVM accuracy score with train data is 0.7719945917347598
SVM accuracy score with test data is 0.6324303987960873


In [50]:
from sklearn.neural_network import MLPClassifier

mlp_clf = MLPClassifier(hidden_layer_sizes=(20, ), alpha=0.3)
mlp_clf.fit(x_traincv, y_train)

MLPClassifier(activation='relu', alpha=0.3, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(20,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       nesterovs_momentum=True, power_t=0.5, random_state=None,
       shuffle=True, solver='adam', tol=0.0001, validation_fraction=0.1,
       verbose=False, warm_start=False)

In [51]:
mlp_train_score = mlp_clf.score(x_traincv, y_train)
mlp_test_score = mlp_clf.score(x_testcv, y_test)
print('MLP accuracy score with train data is {}'.format(mlp_train_score))
print('MLP accuracy score with test data is {}'.format(mlp_test_score))

MLP accuracy score with train data is 0.8268414555287755
MLP accuracy score with test data is 0.6901335590669676
