In [1]:
import numpy as np
from glob import glob
import os
import matplotlib.pyplot as plt
from sklearn import svm
import zipfile
from tqdm import tqdm
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
import re
from scipy import sparse
import nltk
import pandas as pd

# Download any necessary nltk files for nlp
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/raymondyuan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:
data = pd.read_csv("./Articles.csv", encoding = "ISO-8859-1")

In [3]:
data.head()

Unnamed: 0,Article,Date,Heading,NewsType
0,KARACHI: The Sindh government has decided to b...,1/1/2015,sindh govt decides to cut public transport far...,business
1,HONG KONG: Asian markets started 2015 on an up...,1/2/2015,asia stocks up in new year trad,business
2,HONG KONG: Hong Kong shares opened 0.66 perce...,1/5/2015,hong kong stocks open 0.66 percent lower,business
3,HONG KONG: Asian markets tumbled Tuesday follo...,1/6/2015,asian stocks sink euro near nine year,business
4,NEW YORK: US oil prices Monday slipped below $...,1/6/2015,us oil prices slip below 50 a barr,business


In [4]:
data['Article'].shape

(2692,)

# Get data

In [5]:
# Regex to remove all Non-Alpha Numeric 
SPECIAL_CHARS = re.compile(r'([^a-z\d!?.\s])', re.IGNORECASE)

def read_texts(path_file):
    label = int("invalid" not in path_file)
    texts = np.load(path_file)['text']
    labels = [label] * len(texts)
    filter_texts = [SPECIAL_CHARS.sub('',  t) for t in texts]
    return filter_texts, labels

# Get all training data
train_pos_data = read_texts("/Users/raymondyuan/Google Drive/Rice/Year 4/COMP 413/code/DatelineRice2.0-NLP/valid_texts_1189.npz")
train_neg_data = read_texts("/Users/raymondyuan/Google Drive/Rice/Year 4/COMP 413/code/DatelineRice2.0-NLP/invalid_texts.npz")

train_texts = train_pos_data[0] + train_neg_data[0]
train_labels = train_pos_data[1] + train_neg_data[1]

In [6]:
external_invalid = data['Article'].values
train_texts.extend(external_invalid)
train_labels.extend([0] * len(external_invalid))

In [7]:
train_texts, val_texts, train_labels, val_labels = train_test_split(train_texts, train_labels, test_size=0.1,
                                                                    random_state=42)

In [8]:
print(f"Number of training examples {len(train_texts)}")
print(f"Number of validation examples {len(val_texts)}")

Number of training examples 3537
Number of validation examples 394


In [9]:
vec = TfidfVectorizer(ngram_range=(1, 2), tokenizer=word_tokenize,
                      min_df=3, max_df=0.9, strip_accents='unicode', use_idf=1,
                      smooth_idf=1, sublinear_tf=1)

In [10]:
print("Created Vectorizer %s" % vec)
print("Fitting to all docs...")
vec.fit(train_texts + val_texts)
print("Transforming train docs...")
trn_term_doc = vec.transform(train_texts)
print("Transforming val docs...")
val_term_doc = vec.transform(val_texts)

Created Vectorizer TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.9, max_features=None, min_df=3,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=1,
        stop_words=None, strip_accents='unicode', sublinear_tf=1,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function word_tokenize at 0x1a13a0d268>, use_idf=1,
        vocabulary=None)
Fitting to all docs...
Transforming train docs...
Transforming val docs...


In [11]:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.utils.validation import check_X_y, check_is_fitted
from sklearn.svm import LinearSVC

class NbSvmClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, C=1.0, dual='auto', verbose=0):
        self.C = C
        self.dual = dual
        self.verbose = verbose
        self._clf = None
        print("Creating model with C=%s" % C)

    def predict(self, x):
        # Verify that model has been fit
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.predict(x.multiply(self._r))
    
    def score(self, x, y):
        check_is_fitted(self, ['_r', '_clf'])
        return self._clf.score(x.multiply(self._r), y)
        
    def fit(self, x, y):
        # Check that X and y have correct shape
        x, y = check_X_y(x, y, accept_sparse=True)

        def pr(x, y_i, y):
            p = x[y == y_i].sum(0)
            return (p + 1) / ((y == y_i).sum() + 1)

        self._r = sparse.csr_matrix(np.log(pr(x, 1, y) / pr(x, 0, y)))
        x_nb = x.multiply(self._r)
        if self.dual == 'auto':
            self.dual = x_nb.shape[0] <= x_nb.shape[1]
        self._clf = LinearSVC(C=self.C, dual=self.dual, verbose=self.verbose)
        self._clf.fit(x_nb, y)
        return self

In [12]:
# Search for the appropriate C
Cs = [1e-2, 1e-1, 1e0, 1e1, 1e2]

best_model = None
best_val = -float("inf")
best_C = None
for C in Cs:
    print("Fitting with C={}".format(C))
    model = NbSvmClassifier(C=C, verbose=0).fit(trn_term_doc, train_labels)
    # Evaluate the model
    val_preds = model.predict(val_term_doc)
    score = np.mean(val_labels == val_preds)

    print("Model had val score of %s" % score)
    if score > best_val:
        print("New maximum score improved from {} to {}".format(best_val, score))
        best_model = model
        best_val = score
        best_C = C
score = best_val
print("Best score with C={} is {}".format(best_C, score))

Fitting with C=0.01
Creating model with C=0.01
Model had val score of 0.9720812182741116
New maximum score improved from -inf to 0.9720812182741116
Fitting with C=0.1
Creating model with C=0.1
Model had val score of 0.9771573604060914
New maximum score improved from 0.9720812182741116 to 0.9771573604060914
Fitting with C=1.0
Creating model with C=1.0
Model had val score of 0.9847715736040609
New maximum score improved from 0.9771573604060914 to 0.9847715736040609
Fitting with C=10.0
Creating model with C=10.0
Model had val score of 0.9898477157360406
New maximum score improved from 0.9847715736040609 to 0.9898477157360406
Fitting with C=100.0
Creating model with C=100.0
Model had val score of 0.9898477157360406
Best score with C=10.0 is 0.9898477157360406


## Validation

In [13]:
best_model.score(val_term_doc, val_labels)

0.9898477157360406

In [14]:
test= read_texts("/Users/raymondyuan/Google Drive/Rice/Year 4/COMP 413/code/DatelineRice2.0-NLP/valid_texts1198.npz")

In [15]:
test = list(set(test[0]) - set(train_texts + val_texts))
test_labels = [1] * len(test)

In [16]:
best_model.score(vec.transform(test), test_labels)

0.96

## Sample Texts