In [1]:
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

/kaggle/input/spooky-author-identification/train.zip
/kaggle/input/spooky-author-identification/test.zip
/kaggle/input/spooky-author-identification/sample_submission.zip


In [29]:
%%time
import numpy as np, pandas as pd, nltk, spacy, re, tqdm
from gensim.models import Word2Vec
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression

CPU times: user 33 µs, sys: 1 µs, total: 34 µs
Wall time: 39.3 µs


In [15]:
tqdm.tqdm.pandas()

In [14]:
nltk.download('stopwords')
nlp = spacy.load('en_core_web_sm')

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [18]:
data_train = pd.read_csv('/kaggle/input/spooky-author-identification/train.zip', compression='zip')
data_test = pd.read_csv('/kaggle/input/spooky-author-identification/test.zip', compression='zip')
data_train.head(2)

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL


In [6]:
data_test.head(2)

Unnamed: 0,id,text
0,id02310,"Still, as I urged our leaving Ireland with suc..."
1,id24541,"If a fire wanted fanning, it could readily be ..."


In [8]:
print(f"{data_train.shape}\n{data_test.shape}")

(19579, 3)
(8392, 2)


In [11]:
total_authors = list()
for author in data_train['author'].values:
    total_authors.append(author)
print(len(set(total_authors)))

3


In [19]:
def preprocessing_text(text):
    stop_words = set(nltk.corpus.stopwords.words('english'))
    text = [w.lemma_ for w in nlp(text)]
    text = [w for w in text if w not in stop_words]
    return text

data_train['tokenz'] = data_train['text'].progress_apply(preprocessing_text)
data_test['tokenz'] = data_test['text'].progress_apply(preprocessing_text)

100%|██████████| 19579/19579 [04:21<00:00, 74.88it/s]
100%|██████████| 8392/8392 [01:40<00:00, 83.78it/s] 


In [21]:
data_train.head(2)

Unnamed: 0,id,text,author,tokenz
0,id26305,"This process, however, afforded me no means of...",EAP,"[process, ,, however, ,, afford, I, mean, asce..."
1,id17569,It never once occurred to me that the fumbling...,HPL,"[never, occur, I, fumbling, might, mere, mista..."


In [23]:
word2vec = Word2Vec(
    data_train['tokenz'],
    vector_size=300,
    window=5,
    min_count=2,
    workers=4
)

def embedding_process(tokenz):
    embedding = [word2vec.wv[w] for w in tokenz if w in word2vec.wv]
    return np.mean(embedding, axis=0) if embedding else np.zeros(word2vec.vector_size)

data_train['vectors'] = data_train['tokenz'].progress_apply(embedding_process)
data_test['vectors'] = data_test['tokenz'].progress_apply(embedding_process)

100%|██████████| 19579/19579 [00:01<00:00, 17367.97it/s]
100%|██████████| 8392/8392 [00:00<00:00, 14632.36it/s]


In [24]:
data_train.head(2)

Unnamed: 0,id,text,author,tokenz,vectors
0,id26305,"This process, however, afforded me no means of...",EAP,"[process, ,, however, ,, afford, I, mean, asce...","[-0.00597406, 0.21650667, 0.012053456, 0.12304..."
1,id17569,It never once occurred to me that the fumbling...,HPL,"[never, occur, I, fumbling, might, mere, mista...","[0.019263597, 0.19273354, 0.025756955, 0.07946..."


In [28]:
X_train, y_train = np.stack(data_train['vectors'].values), data_train['author'].values
X_test = np.stack(data_test['vectors'].values)

In [30]:
reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)
reg.fit(X_train, y_train)

In [31]:
pred = reg.predict_proba(X_test)
labels = reg.classes_

In [34]:
submission = pd.DataFrame(pred, columns=labels)
submission['id'] = data_test['id'].values
submission = submission[['id', 'EAP', 'HPL', 'MWS']]
submission.head(2)

Unnamed: 0,id,EAP,HPL,MWS
0,id02310,0.373859,0.246502,0.379639
1,id24541,0.428893,0.331836,0.239271


In [35]:
submission.to_csv('submission.csv', index=False)