# Get random articles

In [1]:
import pandas as pd
import numpy as np
import random

In [2]:
df = pd.read_json('/home/nightwing/rsc/Axelbib/scraping/Analysis of Results/data.json')

In [3]:
np.random.seed(0)

sample = df.sample(n=100)[['abstract', 'title']]

In [4]:
with open('sample.txt', 'w') as textfile:
    for i in sample.title:
        textfile.write(str(i) + "\n")

# Using Spacy and sklearn

In [5]:
from spacy.en import English
parser = English()

In [6]:
# both models
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.preprocessing import MultiLabelBinarizer
import numpy as np
# first model
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from sklearn.metrics import accuracy_score
from nltk.corpus import stopwords
from sklearn import preprocessing
import string
import numpy as np
import re
# second
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_extraction.text import TfidfTransformer

# Dummy Data Set

In [7]:
# data
train = np.array(['This is an evolution article',
                  'This is a spatial article',
                  'This is a square lattice article',
                  'This has both evolution and spatial',
                  'This has both spatial and square lattice', 
                  'Finally an article with both square lattice and evolution'])
labelsTrain = [['evolution'],
               ['spatial'],
               ['square lattice'],
               ['evolution', 'spatial'],
               ['spatial', 'square lattice'],
               ['square lattice', 'evolution']]

test = np.array(['This should be an article about evolution and spatial tournaments'])
labelsTest = [['evolution', 'spatial']]

In [27]:
# lets have some actual data to test
from sklearn.cross_validation import train_test_split
train, test, labelsTrain, labelsTest = train_test_split(sample.text, labels,random_state=1)



ValueError: Found input variables with inconsistent numbers of samples: [100, 3]

# More complicated model

In [13]:
# A custom stoplist
STOPLIST = set(stopwords.words('english') + ["n't", "'s", "'m", "ca"] + list(ENGLISH_STOP_WORDS))
# List of symbols we don't care about
SYMBOLS = " ".join(string.punctuation).split(" ") + ["-----", "---", "...", "“", "”", "'ve"]

# Every step in a pipeline needs to be a "transformer". Define a custom transformer to clean text using spaCy
class CleanTextTransformer(TransformerMixin):
    """
    Convert text to cleaned text
    """

    def transform(self, X, **transform_params):
        return [cleanText(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def get_params(self, deep=True):
        return {}
    
# A custom function to clean the text before sending it into the vectorizer
def cleanText(text):
    # get rid of newlines
    text = text.strip().replace("\n", " ").replace("\r", " ")
    
    # replace twitter @mentions
    mentionFinder = re.compile(r"@[a-z0-9_]{1,15}", re.IGNORECASE)
    text = mentionFinder.sub("@MENTION", text)
    
    # repla ce HTML symbols
    text = text.replace("&amp;", "and").replace("&gt;", ">").replace("&lt;", "<")
    
    # lowercase
    text = text.lower()

    return text

# A custom function to tokenize the text using spaCy
# and convert to lemmas
def tokenizeText(sample):

    # get the tokens using spaCy
    tokens = parser(sample)

    # lemmatize
    lemmas = []
    for tok in tokens:
        lemmas.append(tok.lemma_.lower().strip() if tok.lemma_ != "-PRON-" else tok.lower_)
    tokens = lemmas

    # stoplist the tokens
    tokens = [tok for tok in tokens if tok not in STOPLIST]

    # stoplist symbols
    tokens = [tok for tok in tokens if tok not in SYMBOLS]

    # remove large strings of whitespace
    while "" in tokens:
        tokens.remove("")
    while " " in tokens:
        tokens.remove(" ")
    while "\n" in tokens:
        tokens.remove("\n")
    while "\n\n" in tokens:
        tokens.remove("\n\n")

    return tokens

def printNMostInformative(vectorizer, clf, N):
    """Prints features with the highest coefficient values, per class"""
    feature_names = vectorizer.get_feature_names()
    coefs_with_fns = sorted(zip(clf.coef_[0], feature_names))
    topClass1 = coefs_with_fns[:N]
    topClass2 = coefs_with_fns[:-(N + 1):-1]
    print("Class 1 best: ")
    for feat in topClass1:
        print(feat)
    print("Class 2 best: ")
    for feat in topClass2:
        print(feat)

In [20]:
from sklearn.mixture import GMM


In [21]:
# the vectorizer and classifer to use
# note that I changed the tokenizer in CountVectorizer to use a custom function using spaCy's tokenizer
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))
clf = GMM(n_components=3, covariance_type='full') 

#lb = MultiLabelBinarizer(classes=('evolution', 'spatial', 'square lattice'))
#Y = lb.fit_transform(labelsTrain)
#Y_test = lb.fit_transform(labelsTest)

# the pipeline to clean, tokenize, vectorize, and classify
pipe = Pipeline([('cleanText', CleanTextTransformer()), ('vectorizer', vectorizer), ('clf', clf)])

pipe.fit(train)
#predicted = pipe.predict(test)
#all_labels = lb.inverse_transform(predicted)

#accuracy_score(Y_test, predicted)



TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [33]:
abstract = "The prisoners dilemma "
lb.inverse_transform(pipe.predict(np.array([abstract])))

[('spatial',)]

# Simple Model

In [43]:
vectorizer = CountVectorizer(tokenizer=tokenizeText, ngram_range=(1,1))

In [46]:
lb = MultiLabelBinarizer(classes=('evolution', 'spatial', 'square lattice'))
Y = lb.fit_transform(labelsTrain)
Y_test = lb.fit_transform(labelsTest)

classifier = Pipeline([
    ('vectorizer', vectorizer),
    ('tfidf', TfidfTransformer()),
    ('clf', OneVsRestClassifier(LinearSVC()))])

classifier.fit(train, Y)
predicted = classifier.predict(test)

all_labels = lb.inverse_transform(predicted)
all_labels = lb.inverse_transform(predicted)

accuracy_score(Y_test, predicted)

1.0