In [1]:
import nltk
nltk.download('gutenberg')

import pandas as pd

from sklearn.model_selection import train_test_split

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\groov\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


In [2]:
# Load the raw text of 'carroll-alice.txt' from NLTK's Gutenberg corpus as a single string.
alice = nltk.corpus.gutenberg.raw('carroll-alice.txt')

In [3]:
# Flatten the text onto one line and drop square brackets before sentence
# splitting. Order matters: blank lines ("\n\n") collapse to a single space
# first, then any remaining newline becomes a space.
alice = (
    alice
    .replace("\n\n", " ")
    .replace("\n", " ")
    .replace("[", "")
    .replace("]", "")
)

In [4]:
# Split the cleaned Carroll text into a list of sentences with NLTK's sentence tokenizer.
sent_text = nltk.sent_tokenize(alice)

In [5]:
# One row per sentence (single column named 0). Row 0 is dropped, so the
# frame's index starts at 1 — presumably row 0 is title/front matter; verify
# by inspecting sent_text[0].
alice_df = pd.DataFrame(sent_text).iloc[1:]
alice_df.head()

Unnamed: 0,0
1,Down the Rabbit-Hole Alice was beginning to ge...
2,So she was considering in her own mind (as wel...
3,There was nothing so VERY remarkable in that; ...
4,Oh dear!
5,I shall be late!'


In [6]:
# Load the raw text of 'austen-sense.txt' from NLTK's Gutenberg corpus as a single string.
# Note: the variable is spelled `austin` throughout the notebook, while the corpus
# file id spells the author "austen".
austin = nltk.corpus.gutenberg.raw('austen-sense.txt')

In [7]:
# Same cleanup as for the Carroll text: collapse blank lines to one space,
# turn remaining newlines into spaces, then strip square brackets.
austin = (
    austin
    .replace("\n\n", " ")
    .replace("\n", " ")
    .replace("[", "")
    .replace("]", "")
)

In [8]:
# Split the cleaned Austen text into sentences. This rebinds `sent_text`, so the
# Carroll sentence list is no longer reachable under that name after this cell.
sent_text = nltk.sent_tokenize(austin)

In [9]:
# One row per Austen sentence (single column named 0). Row 0 is dropped, so
# the index starts at 1 — presumably row 0 is title/front matter; verify by
# inspecting sent_text[0].
austin_df = pd.DataFrame(sent_text).iloc[1:]
austin_df.head()

Unnamed: 0,0
1,"Their estate was large, and their residence wa..."
2,The late owner of this estate was a single man...
3,"But her death, which happened ten years before..."
4,"In the society of his nephew and niece, and th..."
5,His attachment to them all increased.


In [10]:
# Mark every Carroll sentence as the positive class: alice == 1.
alice_df = alice_df.assign(alice=1)
alice_df.head()

Unnamed: 0,0,alice
1,Down the Rabbit-Hole Alice was beginning to ge...,1
2,So she was considering in her own mind (as wel...,1
3,There was nothing so VERY remarkable in that; ...,1
4,Oh dear!,1
5,I shall be late!',1


In [11]:
# Mark every Austen sentence as the negative class: alice == 0.
austin_df = austin_df.assign(alice=0)
austin_df.head()

Unnamed: 0,0,alice
1,"Their estate was large, and their residence wa...",0
2,The late owner of this estate was a single man...,0
3,"But her death, which happened ten years before...",0
4,"In the society of his nephew and niece, and th...",0
5,His attachment to them all increased.,0


In [12]:
# Stack the two labelled sentence tables into one classification frame.
# Both inputs carry their own row labels starting at 1, so a plain concat would
# leave every label duplicated (two rows labelled 1, two labelled 2, ...).
# ignore_index=True gives a fresh 0..n-1 index, so label-based lookups return
# one row and `clf_df` lines up with any default-indexed frame built from it.
clf_df = pd.concat([alice_df, austin_df], ignore_index=True)
clf_df.columns = ['text', 'alice']
clf_df.head()

Unnamed: 0,text,alice
1,Down the Rabbit-Hole Alice was beginning to ge...,1
2,So she was considering in her own mind (as wel...,1
3,There was nothing so VERY remarkable in that; ...,1
4,Oh dear!,1
5,I shall be late!',1


In [13]:
# (rows, columns) of the combined frame.
clf_df.shape

(6456, 2)

In [14]:
# Pull out the model inputs: sentence text as the documents to vectorize and
# the 0/1 `alice` column as the classification target.
documents, label = clf_df['text'], clf_df['alice']

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

from nltk.tokenize.casual import casual_tokenize

# TF-IDF over 1- to 3-grams of casual_tokenize tokens, lowercased, with
# sublinear term-frequency scaling.
# token_pattern=None: the default token regex is never consulted once a custom
# tokenizer is supplied, and leaving it set makes scikit-learn warn that
# 'token_pattern' will not be used.
tfidf = TfidfVectorizer(
    tokenizer=casual_tokenize,
    token_pattern=None,
    ngram_range=(1, 3),
    lowercase=True,
    sublinear_tf=True,
)

# NOTE(review): the vectorizer is fitted on every document, before the
# train/test split later in the notebook, so held-out sentences contribute to
# the vocabulary and IDF weights. Reported test scores may be optimistic.
tfidf_docs = tfidf.fit_transform(raw_documents=documents)

In [16]:
# Show the TF-IDF matrix summary (dimensions and number of stored values).
tfidf_docs

<6456x188715 sparse matrix of type '<class 'numpy.float64'>'
	with 467736 stored elements in Compressed Sparse Row format>

In [17]:
from sklearn.preprocessing import Normalizer

# Pass the TF-IDF matrix through scikit-learn's Normalizer with default
# settings; the result replaces `tfidf_docs`.
# NOTE(review): the TfidfVectorizer above is left at its default `norm`, which
# the scikit-learn docs give as 'l2' — this step is probably a no-op; confirm
# before removing it.
normalizer = Normalizer()
tfidf_docs = normalizer.fit_transform(tfidf_docs)

In [18]:
from sklearn.decomposition import TruncatedSVD

# Reduce the TF-IDF matrix to 100 components with truncated SVD.
# random_state is fixed so the decomposition is repeatable across runs.
svd = TruncatedSVD(n_components=100, n_iter=100, random_state=1337)
svd_topic_vectors = svd.fit_transform(tfidf_docs)

In [19]:
# Wrap the SVD output in a DataFrame with one named column per component
# (topic_0 ... topic_{n_components - 1}).
columns = list(map('topic_{}'.format, range(svd.n_components)))
svd_topic_vectors = pd.DataFrame(data=svd_topic_vectors, columns=columns)
svd_topic_vectors.head()

Unnamed: 0,topic_0,topic_1,topic_2,topic_3,topic_4,topic_5,topic_6,topic_7,topic_8,topic_9,...,topic_90,topic_91,topic_92,topic_93,topic_94,topic_95,topic_96,topic_97,topic_98,topic_99
0,0.133847,0.022746,-0.063978,-0.040737,-0.001384,0.013637,-0.055956,0.03325,-0.035153,0.002568,...,0.007435,0.014632,-0.011478,0.004691,0.006578,-0.003636,-0.005841,-0.010244,-0.016367,0.020464
1,0.103997,-0.045281,-0.018636,-0.05708,0.019522,0.006989,-0.000984,0.01398,-0.018641,-0.003802,...,-0.001687,-0.015797,0.00518,0.006392,0.003942,-0.001255,0.009304,-0.008776,-0.014734,0.014633
2,0.111537,0.052376,0.001119,-0.05241,0.029275,0.004118,-0.016243,-0.013219,-0.00887,0.007926,...,0.062445,-0.056836,-0.021399,0.025803,0.043168,-0.012643,0.04049,-0.022846,7e-06,-0.030598
3,0.046214,0.174497,0.19519,-0.107082,0.091202,0.00854,0.009441,-0.014985,0.009161,-0.0123,...,0.145752,-0.1405,0.042471,-0.057691,0.080899,-0.079639,0.103412,0.012045,0.023122,-0.010911
4,0.079499,0.139521,0.011673,-0.038999,-0.082066,0.081166,0.012897,-0.103077,-0.019602,-0.016029,...,0.026613,0.034764,-0.006223,0.015743,0.019277,0.006945,-0.002028,0.060662,0.026651,-0.08126


In [20]:
# Feature matrix (SVD topic vectors) and target (0/1 alice label).
X, y = svd_topic_vectors, label

In [21]:
# Check that features and labels have the same number of rows.
X.shape, y.shape

((6456, 100), (6456,))

In [22]:
# Hold out half of the sentences for evaluation; the seed is fixed so the
# split is repeatable. No `stratify` is passed, so class proportions in each
# half are left to the shuffle.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.5,
    random_state=1337,
)

In [23]:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA

# Fit a one-component linear discriminant classifier on the training half,
# then report mean accuracy as (train, test).
lda = LDA(n_components=1).fit(X_train, y_train)
train_accuracy = lda.score(X_train, y_train)
test_accuracy = lda.score(X_test, y_test)
(train_accuracy, test_accuracy)

(0.9519826517967782, 0.9464064436183395)

In [24]:
from sklearn.metrics import confusion_matrix

# Rows are true labels, columns are predicted labels, for the held-out half.
confusion_matrix(y_true=y_test, y_pred=lda.predict(X_test))

array([[2403,   27],
       [ 146,  652]], dtype=int64)

In [25]:
from sklearn.metrics import classification_report

# Per-class precision / recall / F1 on the held-out half. The report is a
# preformatted string, so it is printed rather than displayed.
test_report = classification_report(y_true=y_test, y_pred=lda.predict(X_test))
print(test_report)

              precision    recall  f1-score   support

           0       0.94      0.99      0.97      2430
           1       0.96      0.82      0.88       798

    accuracy                           0.95      3228
   macro avg       0.95      0.90      0.92      3228
weighted avg       0.95      0.95      0.94      3228



In [26]:
def predict_message(message):
    """Classify text as Carroll (1) or Austen (0) with the fitted pipeline.

    Applies the notebook's already-fitted ``tfidf`` -> ``normalizer`` ->
    ``svd`` -> ``lda`` objects, in the same order used to build the training
    features.

    Parameters
    ----------
    message : str, bytes, or iterable of str
        A single sentence, or several sentences to classify in one call.

    Returns
    -------
    The output of ``lda.predict``: one predicted label per input message,
    where 1 means "alice" and 0 means "not alice" (the training target).
    """
    # A lone string/bytes is one document; anything else is taken as a batch.
    if isinstance(message, (str, bytes)):
        messages = [message]
    else:
        messages = list(message)

    tfidf_doc = tfidf.transform(messages)
    # Training rows went through `normalizer` before the SVD; do the same here
    # so inference sees features built exactly like the training features.
    tfidf_doc = normalizer.transform(tfidf_doc)
    svd_topic_vector = svd.transform(tfidf_doc)

    # The classifier was fitted on a DataFrame with named topic columns. When
    # it recorded those names, hand them back so scikit-learn does not warn
    # about (or mis-match) feature names at predict time.
    feature_names = getattr(lda, "feature_names_in_", None)
    if feature_names is not None:
        svd_topic_vector = pd.DataFrame(svd_topic_vector, columns=feature_names)

    return lda.predict(svd_topic_vector)

In [27]:
# Uncomment this cell to watch the classifier predict each sentence of Carroll's text

# for text in alice_df[0]:
#     prediction = predict_message(text)
#     print(text)
#     print('prediction: {}'.format(prediction))
#     print('-------------')
    

In [28]:
# Uncomment this cell to watch the classifier predict each sentence of Austen's text

# for text in austin_df[0]:
#     prediction = predict_message(text)
#     print(text)
#     print('prediction: {}'.format(prediction))
#     print('-------------')
    