In [53]:
import pandas as pd
import numpy as np
import re
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics.pairwise import cosine_similarity
from collections import Counter
from spellchecker import SpellChecker
from lexicalrichness import LexicalRichness
from nltk.stem.porter import PorterStemmer
from itertools import groupby
from sklearn.externals import joblib



In [2]:
# Length of the feature vector allocated by create_fea.
# NOTE(review): create_fea only writes indices 0-16, so with 18 the last slot
# always stays 0 - presumably the saved model expects 18 columns; confirm
# before changing this value.
feature_num = 18

In [14]:
def avg_sentence_length(text):
    """Return the mean length, in characters, of the sentences in ``text``.

    The text is split on the sentence terminators ``?``, ``!`` and ``.``.
    Every piece produced by the split is counted, including empty pieces
    (for example the one that follows a final full stop) and any leading
    whitespace, so the value is a rough stylometric signal rather than an
    exact sentence statistic.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    numpy.float64
        Mean number of characters per piece; ``0.0`` for an empty string.
    """
    # Inside a character class `|` is a literal pipe, not alternation, so the
    # previous pattern `[?|!|\.]` also treated `|` as a sentence terminator.
    # A raw string also avoids the invalid `\.` escape warning.
    pieces = re.split(r'[?!.]', text)
    return np.mean(np.array([len(piece) for piece in pieces]))

In [15]:
def avg_word_length(text):
    """Return the mean token length of ``text`` in characters.

    Each run of non-word characters is collapsed to a single space, the
    result is tokenised with ``nltk.word_tokenize`` and the lengths of the
    tokens are averaged.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    numpy.float64
        Mean token length as computed by ``numpy.mean``.
    """
    stripped = re.sub(r"[^\w]+", ' ', text)
    token_lengths = [len(token) for token in nltk.word_tokenize(stripped)]
    return np.mean(np.array(token_lengths))

In [36]:
# NOTE(review): pos_list is not referenced by pos_num, which matches the
# fine-grained tag names ('NN', 'JJ', ...) returned by nltk.pos_tag instead.
pos_list = ['ADJ','ADP','ADV','CONJ','DET','NOUN','NUM','PRT','PRON','VERB','.']
def pos_num(text):
    """Count selected part-of-speech tags in ``text``.

    The text is tokenised with ``nltk.word_tokenize`` and tagged with
    ``nltk.pos_tag``; the tags are then tallied into eight groups.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    tuple of int
        ``(noun, pronoun, adjective, numeral, verb, adverb, end, comma)``,
        where ``end`` counts tokens tagged ``'.'`` and ``comma`` counts tokens
        tagged ``','``.
    """
    # One entry per returned value, in return order.
    tag_groups = (
        ('NN', 'NNS', 'NNP', 'NNPS'),                 # noun
        ('PRP',),                                     # pronoun
        # The original compared against 'JJR ' (trailing space), which never
        # matched, so comparative adjectives were silently left out.
        ('JJ', 'JJR', 'JJS'),                         # adjective
        ('CD',),                                      # numeral
        ('VB', 'VBD', 'VBG', 'VBN', 'VBP', 'VBZ'),    # verb
        ('RB', 'RBR', 'RBS'),                         # adverb
        ('.',),                                       # end
        (',',),                                       # comma
    )
    tagged = nltk.pos_tag(nltk.word_tokenize(text))
    counts = Counter(tag for _, tag in tagged)
    # A Counter returns 0 for tags that never occurred.
    return tuple(sum(counts[tag] for tag in group) for group in tag_groups)
        

In [17]:
def mispelling(text):
    """Return how many words of ``text`` the spell checker does not recognise.

    Runs of non-word characters are replaced by a space, the text is split on
    whitespace and the words are passed to ``SpellChecker.unknown``; the size
    of the collection it returns is the result.

    Parameters
    ----------
    text : str
        Raw text to analyse.

    Returns
    -------
    int
        ``len`` of the value returned by ``SpellChecker.unknown``.
    """
    checker = SpellChecker()
    candidates = re.sub(r"[^\w]+", ' ', text).split()
    unknown_words = checker.unknown(candidates)
    return len(unknown_words)

In [18]:
def ngram_word(text, num):
    """Return the word n-grams of ``text``.

    Parameters
    ----------
    text : str
        Text to split on whitespace.
    num : int
        Number of consecutive words per n-gram.

    Returns
    -------
    list of list of str
        One list of ``num`` words per starting position, in order of
        appearance; empty when the text has fewer than ``num`` words.
    """
    tokens = text.split()
    window_count = len(tokens) - num + 1
    return [tokens[start:start + num] for start in range(window_count)]
def ngram_letter(text, num):
    """Return the character n-grams of ``text``.

    The previous body was a copy of ``ngram_word`` and therefore produced
    word n-grams despite the function's name; it now slides a window of
    ``num`` characters over the raw text (whitespace included).

    Parameters
    ----------
    text : str
        Text to slice.
    num : int
        Number of consecutive characters per n-gram.

    Returns
    -------
    list of str
        One substring of length ``num`` per starting position, in order of
        appearance; empty when the text is shorter than ``num`` characters.
    """
    return [text[start:start + num] for start in range(len(text) - num + 1)]
def diff_ngram(text1, text2, num):
    """Score how many word n-grams of ``text1`` also occur in ``text2``.

    For every n-gram of ``text1`` the number of equal n-grams in ``text2`` is
    added up, and the total is divided by the number of n-grams in ``text1``.
    Repeated n-grams are counted once per matching pair, so the score is not
    symmetric in its arguments and can exceed 1.

    Parameters
    ----------
    text1, text2 : str
        Texts to compare.
    num : int
        Number of words per n-gram.

    Returns
    -------
    float or int
        The match ratio, or the sentinel ``100000`` when ``text1`` yields no
        n-grams of the requested size.
    """
    grams1 = ngram_word(text1, num)
    grams2 = ngram_word(text2, num)

    if len(grams1) == 0:
        return 100000

    # Tally text2's n-grams once instead of comparing every pair; lists are
    # unhashable, so each n-gram is keyed by its tuple form. Looking up a
    # missing key in a Counter yields 0.
    occurrences = Counter(tuple(gram) for gram in grams2)
    cnt = sum(occurrences[tuple(gram)] for gram in grams1)
    return cnt/len(grams1)


In [19]:
def tfidf_cos(text1,text2):
    """Return the cosine similarity between the TF-IDF vectors of two texts.

    A fresh ``TfidfVectorizer`` is fitted on just the two texts, so the
    weights are relative to this pair only.

    Parameters
    ----------
    text1, text2 : str
        Texts to compare.

    Returns
    -------
    float
        Entry ``[0][1]`` of the pairwise cosine-similarity matrix.
    """
    documents = np.array([text1,text2])
    weights = TfidfVectorizer().fit_transform(documents).toarray()
    similarity = cosine_similarity(weights)
    return similarity[0][1]

In [10]:


def words(entry):
    """Iterate over the cleaned words of ``entry``.

    The text is split on whitespace, digits and the punctuation characters
    ``!:,.?(){}[]`` are stripped from both ends of every token, and tokens
    that end up empty are skipped.

    Parameters
    ----------
    entry : str
        Raw text.

    Returns
    -------
    filter
        Lazy iterator over the non-empty stripped tokens.
    """
    trimmed = [token.strip("0123456789!:,.?(){}[]") for token in entry.split()]
    # filter(None, ...) keeps truthy items, i.e. the non-empty strings.
    return filter(None, trimmed)

def yule(entry):
    """Return a vocabulary-richness score for ``entry``.

    The original author describes this as Yule's I measure (the inverse of
    Yule's K): a higher number means higher diversity, i.e. a richer
    vocabulary.

    Words come from ``words(entry)`` and are Porter-stemmed and lower-cased
    before counting. With ``M1`` the number of distinct stems and ``M2`` the
    sum of squared stem frequencies, the score is ``M1*M1 / (M2 - M1)``.

    Parameters
    ----------
    entry : str
        Raw text.

    Returns
    -------
    float or int
        The score, or ``0`` when ``M2 - M1`` is zero (for example when every
        stem occurs exactly once, or the text has no words).
    """
    stemmer = PorterStemmer()
    stem_freq = Counter(stemmer.stem(token).lower() for token in words(entry))

    M1 = float(len(stem_freq))
    # Summing freq**2 per stem equals grouping equal frequencies and
    # multiplying by the group size.
    M2 = sum(freq ** 2 for freq in stem_freq.values())

    denominator = M2 - M1
    if denominator == 0:
        return 0
    return (M1*M1)/denominator
    
# yule(data.pair1[0])

In [13]:
# Module-level zero vector of length feature_num.
# NOTE(review): create_fea allocates its own local `input_fea`, so nothing
# visible in this notebook reads this global - likely a leftover.
input_fea = np.zeros(feature_num)

In [56]:
def create_fea(text1,text2,feature_num):
    """Build the pairwise feature vector for two texts.

    Slot layout of the returned array:

    * 0      - ratio (larger/smaller) of the average sentence lengths
    * 1      - ratio of the average word lengths
    * 2-9    - absolute differences of the eight counts from ``pos_num``
               (noun, pronoun, adjective, numeral, verb, adverb, end, comma)
    * 10     - absolute difference of the ``mispelling`` counts
    * 11-13  - ``diff_ngram`` scores for 2-, 3- and 4-word n-grams
    * 14     - ``tfidf_cos`` similarity
    * 15     - ratio of the ``yule`` scores
    * 16     - ratio of the MTLD lexical-richness scores (threshold 0.72)

    Any slot from index 17 onwards is left at zero.

    Parameters
    ----------
    text1, text2 : str
        Texts to compare.
    feature_num : int
        Length of the returned vector; must be at least 17.

    Returns
    -------
    numpy.ndarray
        1-D float array of length ``feature_num``.

    Notes
    -----
    The ratio features divide by the smaller of the two values, so a zero
    score (``yule`` returns 0 in its degenerate case) makes the division fail
    or produce a non-finite value.
    """
    def ratio(first, second):
        # Larger value over smaller value, same argument order as the inputs.
        return max(first,second)/min(first,second)

    input_fea = np.zeros(feature_num)

    sentence_len1 = avg_sentence_length(text1)
    sentence_len2 = avg_sentence_length(text2)
    input_fea[0] = ratio(sentence_len1, sentence_len2)

    word_len1 = avg_word_length(text1)
    word_len2 = avg_word_length(text2)
    input_fea[1] = ratio(word_len1, word_len2)

    # pos_num returns its counts in the order the slots 2..9 expect.
    pos_counts1 = pos_num(text1)
    pos_counts2 = pos_num(text2)
    for slot, (count1, count2) in enumerate(zip(pos_counts1, pos_counts2), start=2):
        input_fea[slot] = abs(count1 - count2)

    misspelled1 = mispelling(text1)
    misspelled2 = mispelling(text2)
    input_fea[10] = abs(misspelled1 - misspelled2)

    for offset, size in enumerate((2, 3, 4)):
        input_fea[11 + offset] = diff_ngram(text1, text2, size)

    input_fea[14] = tfidf_cos(text1,text2)

    yule_score1 = yule(text1)
    yule_score2 = yule(text2)
    input_fea[15] = ratio(yule_score1, yule_score2)

    mtld1 = LexicalRichness(text1).mtld(threshold=0.72)
    mtld2 = LexicalRichness(text2).mtld(threshold=0.72)
    input_fea[16] = ratio(mtld1, mtld2)

    return input_fea

In [68]:
def sameORnot(feature):
    """Print whether the classifier judges two texts to share an author.

    The feature vector is reshaped to a single-row matrix and passed to the
    module-level model ``RF``. A prediction of 1 prints the "same author"
    message, 0 prints the "different authors" message, and any other value
    prints nothing.

    Parameters
    ----------
    feature : numpy.ndarray
        1-D feature vector, e.g. the output of ``create_fea``.

    Returns
    -------
    None
    """
    prediction = RF.predict(feature.reshape(1,-1))[0]
    if prediction == 1:
        verdict = 'The two texts are written by the same author'
    elif prediction == 0:
        verdict = 'The two texts are written by different authors'
    else:
        return
    print(verdict)

In [20]:
import os

In [25]:
# Read sample text 1; the context manager closes the file even if read() raises.
with open('text_unseen_authors/text1.txt') as f:
    text1 = f.read()

In [31]:
# Read sample text 2; the context manager closes the file even if read() raises.
with open('text_unseen_authors/text2.txt') as f:
    text2 = f.read()

In [42]:
# Read sample text 3; the context manager closes the file even if read() raises.
with open('text_unseen_authors/text3.txt') as f:
    text3 = f.read()

In [46]:
# Read sample text 4; the context manager closes the file even if read() raises.
with open('text_unseen_authors/text4.txt') as f:
    text4 = f.read()

In [50]:
# Read sample text 5; the context manager closes the file even if read() raises.
with open('text_unseen_authors/text5.txt') as f:
    text5 = f.read()

In [51]:
# Sanity check: number of characters read into text5.
len(text5)

5789

In [54]:
# Load the pre-trained classifier that sameORnot reads as the global `RF`.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code - only load model files that come from a trusted source.
# NOTE(review): `sklearn.externals.joblib` (imported at the top) is not
# available in recent scikit-learn releases - confirm the pinned version or
# import the standalone `joblib` package instead.
RF = joblib.load('model/RF.pkl')

In [None]:
# text1 text2 shakespeare
# text3 Edgar Allan Poe
# text4 text5 jk rowling

In [69]:
# Expected "same author": text1 and text2 are both Shakespeare.
sameORnot(create_fea(text1,text2,feature_num)) 

The two texts are written by the same author


In [70]:
# Expected "different authors": text2 is Shakespeare, text3 is Edgar Allan Poe.
sameORnot(create_fea(text2,text3,feature_num)) 

The two texts are written by different authors


In [71]:
# Expected "different authors": text3 is Edgar Allan Poe, text4 is JK Rowling.
sameORnot(create_fea(text3,text4,feature_num)) 

The two texts are written by different authors


In [72]:
# Expected "same author": text4 and text5 are both JK Rowling.
sameORnot(create_fea(text4,text5,feature_num)) 

The two texts are written by the same author
