In [1]:
import pandas as pd

# Read Data

In [2]:
# Load the training set from the working directory; the preview below shows
# the columns id, text and author.
df = pd.read_csv('train.csv')
df

Unnamed: 0,id,text,author
0,id26305,"This process, however, afforded me no means of...",EAP
1,id17569,It never once occurred to me that the fumbling...,HPL
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP
3,id27763,How lovely is spring As we looked from Windsor...,MWS
4,id12958,"Finding nothing else, not even gold, the Super...",HPL
...,...,...,...
19574,id17718,"I could have fancied, while I looked at it, th...",EAP
19575,id08973,The lids clenched themselves together as if in...,EAP
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP
19577,id17513,"For an item of news like this, it strikes us i...",EAP


In [3]:
# Class balance: number of rows per author label.
df['author'].value_counts()

EAP    7900
MWS    6044
HPL    5635
Name: author, dtype: int64

# Data Analysis

In [56]:
from plotly import graph_objs as go
import spacy

In [57]:
# Shared spaCy pipeline used by the sentiment and POS-tag analysis cells below.
nlp = spacy.load("en_core_web_sm")

### Message Length

In [58]:
import plotly.express as px

In [59]:
# Length of every text in characters, compared across authors as a box plot.
df['text_len'] = df['text'].map(len)
df_len = df.loc[:, ['author', 'text_len']]
fig = px.box(df_len, x="author", y="text_len")
fig.show()

### Punctuation

In [73]:
import string
import re

In [74]:

def punctuation(item):
    """Return the share of characters in ``item`` that are ASCII punctuation.

    Args:
        item: The text to inspect.

    Returns:
        The number of characters matching ``string.punctuation`` divided by
        ``len(item)``; ``0.0`` when ``item`` is empty.
    """
    # An empty text has nothing to measure; return 0.0 instead of dividing by zero.
    if len(item) == 0:
        return 0.0
    return len(re.findall('[%s]' % re.escape(string.punctuation), item)) / len(item)


In [75]:
# Punctuation share of every text, compared across authors as a box plot.
df['punctuation'] = df['text'].map(punctuation)
df_len = df.loc[:, ['author', 'punctuation']]
fig = px.box(df_len, x="author", y="punctuation")
fig.show()

### Sentiment

In [19]:
from nltk.corpus import sentiwordnet as swn
from pywsd import disambiguate

Warming up PyWSD (takes ~10 secs)... took 3.0079009532928467 secs.


In [20]:

# One net sentiment score per text, in the same order as df['text'].
sentiments = []

docs = nlp.pipe(df['text'], disable=['ner'])

for doc in docs:
    # Net score = sum of positive scores minus sum of negative scores over
    # every word that word-sense disambiguation could map to a synset.
    sentiment = 0
    for sentence in doc.sents:
        for _word, synset in disambiguate(sentence.text):
            if synset is None:
                continue
            senti = swn.senti_synset(synset.name())
            sentiment = sentiment + senti.pos_score() - senti.neg_score()

    sentiments.append(sentiment)


In [80]:
# `sentiments` holds one score per row of df['text'], in row order. Building the
# Series on df.index keeps the scores aligned with their rows and raises on a
# length mismatch instead of silently filling the column with NaN.
df['sentiment'] = pd.Series(sentiments, index=df.index)
df

Unnamed: 0,id,text,author,text_len,punctuation,sentiment
0,id26305,"This process, however, afforded me no means of...",EAP,231,0.030303,1.375
1,id17569,It never once occurred to me that the fumbling...,HPL,71,0.014085,-0.875
2,id11008,"In his left hand was a gold snuff box, from wh...",EAP,200,0.025000,0.625
3,id27763,How lovely is spring As we looked from Windsor...,MWS,206,0.019417,0.375
4,id12958,"Finding nothing else, not even gold, the Super...",HPL,174,0.022989,-0.500
...,...,...,...,...,...,...
19574,id17718,"I could have fancied, while I looked at it, th...",EAP,108,0.027778,-0.875
19575,id08973,The lids clenched themselves together as if in...,EAP,55,0.018182,0.000
19576,id05267,"Mais il faut agir that is to say, a Frenchman ...",EAP,68,0.029412,-1.125
19577,id17513,"For an item of news like this, it strikes us i...",EAP,74,0.040541,0.500


In [82]:
# Distribution of the net sentiment score per author.
df_sentiment = df.loc[:, ['author', 'sentiment']]
fig = px.box(df_sentiment, x="author", y="sentiment")
fig.show()

In [34]:
# Frequency table of sentiment scores per author: index = score, value = number
# of texts with that score.
eap_df = df[df['author'] == 'EAP']['sentiment'].value_counts().sort_index()
mws_df = df[df['author'] == 'MWS']['sentiment'].value_counts().sort_index()
hpl_df = df[df['author'] == 'HPL']['sentiment'].value_counts().sort_index()

fig = go.Figure()
for author, counts, color in (
    ('EAP', eap_df, "#496595"),
    ('MWS', mws_df, "#c6ccd8"),
    ('HPL', hpl_df, "#1a611b"),
):
    fig.add_trace(go.Histogram(
        x=counts.index,
        y=counts.values,
        # The data is already aggregated, so each bin must SUM the counts in y.
        # With the default histfunc ('count') y is ignored and every distinct
        # score is counted once, regardless of how many texts have it.
        histfunc='sum',
        name=author,
        marker_color=color,
    ))

fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">Sentiment by Author</span>'
)
fig.update_xaxes(range=[df['sentiment'].min(), df['sentiment'].max()])
fig.show()

for author in ('EAP', 'MWS', 'HPL'):
    print(f"{author} Mean Sentiment: {df.loc[df['author'] == author, 'sentiment'].mean()}")

EAP Mean Sentiment: 0.08290949367088606
MWS Mean Sentiment: -0.033757941760423536
HPL Mean Sentiment: -0.13396521739130446


### POS tags

In [35]:

# Maps each author label to the flat list of fine-grained POS tags (token.tag_)
# of all tokens in that author's texts.
pos_tags = {}
docs = nlp.pipe(df['text'], disable=['ner'])

for auth, doc in zip(df['author'], docs):
    for token in doc:
        # Append in place; rebuilding the list with `+` for every token copies
        # the whole list each time and makes the loop quadratic.
        pos_tags.setdefault(auth, []).append(token.tag_)


In [61]:
# Tag frequency table per author (value_counts sorts the most frequent tag first).
eap_df, mws_df, hpl_df = (
    pd.Series(pos_tags[author]).value_counts()
    for author in ('EAP', 'MWS', 'HPL')
)

fig = go.Figure()

# One bar trace per author, added in the order EAP, MWS, HPL.
for label, tag_counts, colour in (
    ('EAP', eap_df, "#496595"),
    ('MWS', mws_df, "#c6ccd8"),
    ('HPL', hpl_df, "#1a611b"),
):
    fig.add_trace(go.Bar(
        x=tag_counts.index,
        y=tag_counts.values,
        name=label,
        marker_color=colour,
    ))

fig.update_layout(
    title='<span style="font-size:32px; font-family:Times New Roman">POS-tags by Author</span>'
)

fig.show()

In [70]:
# Five most and five least frequent tags per author, read off the sorted tables.
for label, tag_counts in (('EAP', eap_df), ('MWS', mws_df), ('HPL', hpl_df)):
    most_common = tag_counts.index[:5].tolist()
    least_common = tag_counts.index[-5:].tolist()
    print(f"{label} - Most Common: {most_common}, Least Common: {least_common}")

EAP - Most Common: ['NN', 'IN', 'DT', ',', 'JJ'], Least Common: ['XX', 'NFP', 'LS', 'ADD', 'AFX']
MWS - Most Common: ['NN', 'IN', 'DT', 'PRP', 'VBD'], Least Common: ['EX', 'WP$', 'RBS', 'NNPS', 'FW']
HPL - Most Common: ['NN', 'IN', 'DT', 'JJ', 'VBD'], Least Common: ['_SP', 'XX', 'NFP', 'ADD', 'AFX']


# Feature Extractor

In [26]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.base import BaseEstimator, TransformerMixin
from nltk.corpus import sentiwordnet as swn
from sklearn.pipeline import FeatureUnion
from nltk.tokenize import word_tokenize
from sklearn.pipeline import Pipeline
from pywsd import disambiguate
import numpy as np
import requests
import string
import spacy
import re

In [18]:
class PolarityTransformer(BaseEstimator, TransformerMixin):
    """Average SentiWordNet positive and negative scores of each text.

    Every text is split into sentences with spaCy, each sentence is run through
    pywsd's ``disambiguate``, and the positive / negative scores of the resolved
    synsets are summed and divided by the number of words ``disambiguate``
    returned (including words without a synset).
    """

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn compatibility."""
        return self

    @staticmethod
    def transform(X, y=None):
        """Compute the polarity features.

        Args:
            X: Iterable of text strings (e.g. a pandas Series or a list).
            y: Ignored.

        Returns:
            A DataFrame with one row per text and the columns ``pos`` and
            ``neg``. A text for which no words were produced scores 0.0 in both.
        """
        scores = []
        nlp = spacy.load("en_core_web_sm")
        # list(X) accepts any iterable of strings, not only objects with .tolist().
        docs = nlp.pipe(list(X))

        for doc in docs:
            pos = 0
            neg = 0
            count = 0
            for sentence in doc.sents:
                for word, sysnet in disambiguate(sentence.text):
                    if sysnet is not None:
                        sysnet_senti = swn.senti_synset(sysnet.name())
                        pos += sysnet_senti.pos_score()
                        neg += sysnet_senti.neg_score()
                    count += 1
            if count:
                scores.append({'pos': pos/count, 'neg': neg/count})
            else:
                # No words at all (e.g. an empty text): avoid dividing by zero.
                scores.append({'pos': 0.0, 'neg': 0.0})

        # Explicit columns keep the output schema stable even when X is empty.
        return pd.DataFrame(scores, columns=['pos', 'neg'])

class SentenceLengthTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer producing the word count of each text."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn compatibility."""
        return self

    @staticmethod
    def transform(X, y=None):
        """Count the tokens of every text after removing ASCII punctuation.

        Args:
            X: Iterable of text strings.
            y: Ignored.

        Returns:
            A single-column DataFrame with one token count per text.
        """
        punct_pattern = '[%s]' % re.escape(string.punctuation)
        token_counts = [
            len(word_tokenize(re.sub(punct_pattern, '', text)))
            for text in X
        ]
        return pd.DataFrame(token_counts)

class WordLengthTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer producing the mean word length of each text."""

    def fit(self, X, y=None):
        """Nothing is learned; returns ``self`` for scikit-learn compatibility."""
        return self

    @staticmethod
    def transform(X, y=None):
        """Average the token lengths of every text after removing ASCII punctuation.

        Args:
            X: Iterable of text strings.
            y: Ignored.

        Returns:
            A single-column DataFrame with the mean token length per text;
            0.0 for a text that contains no tokens.
        """
        word_length = []

        for sentence in X:
            tokens = word_tokenize(re.sub('[%s]' % re.escape(string.punctuation), '', sentence))
            if tokens:
                word_length.append(np.mean([len(token) for token in tokens]))
            else:
                # np.mean([]) would emit a RuntimeWarning and put NaN into the
                # feature matrix; a text without words has mean length 0.
                word_length.append(0.0)

        return pd.DataFrame(word_length)

class POSTransformer(BaseEstimator, TransformerMixin):
    """Count how often each fine-grained POS tag occurs in each text."""

    def fit(self, X, y=None):
        """Set ``tag_map``, the fixed list of tags that become output columns.

        Returns:
            ``self``.
        """
        self.tag_map = [
            "AFX", "CC", "CD", "DT", "EX", "FW", "HYPH", "IN", "JJ", "JJR",
            "JJS", "LS", "MD", "NIL", "NN", "NNP", "NNPS", "NNS", "PDT", "POS",
            "PRP", "PRP$", "RB", "RBR", "RBS", "RP", "SP", "SYM", "TO", "UH",
            "VB", "VBD", "VBG", "VBN", "VBP", "VBZ", "WDT", "WP", "WP$", "WRB",
            "ADD", "NFP", "GW", "XX", "BES", "HVS", "_SP",
        ]
        return self

    def transform(self, X, y=None):
        """Tag every text with spaCy and count the tags listed in ``tag_map``.

        Tags that are not in ``tag_map`` are ignored.

        Args:
            X: Iterable of text strings.
            y: Ignored.

        Returns:
            A DataFrame with one row per text and one column per entry of
            ``tag_map``, in ``tag_map`` order.
        """
        tag_count = []
        nlp = spacy.load("en_core_web_sm")
        docs = nlp.pipe(X)

        for doc in docs:
            tags = dict.fromkeys(self.tag_map, 0)
            for token in doc:
                # `tags` has exactly the tag_map entries as keys, so a dict
                # lookup replaces the linear scan of the list for every token.
                if token.tag_ in tags:
                    tags[token.tag_] += 1
            tag_count.append(tags)

        # Explicit columns keep the output schema stable even when X is empty.
        return pd.DataFrame(tag_count, columns=self.tag_map)

class BritishAmericanTransformer(BaseEstimator, TransformerMixin):
    """Share of words per text that appear in a British / American spelling list."""

    def fit(self, X, y=None):
        """Download both spelling lists (requires network access).

        Returns:
            ``self``.

        Raises:
            requests.RequestException: If a download fails, times out or
                returns an HTTP error status.
        """
        self.british = self._fetch_spellings("https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/british_spellings.json")
        self.american = self._fetch_spellings("https://raw.githubusercontent.com/hyperreality/American-British-English-Translator/master/data/american_spellings.json")
        return self

    @staticmethod
    def _fetch_spellings(url, timeout=30):
        """Return the top-level keys of the JSON object at ``url`` as a frozenset."""
        # Without a timeout requests can block forever on a stalled connection.
        response = requests.get(url, timeout=timeout)
        # Fail loudly on 4xx/5xx instead of trying to parse an error page as JSON.
        response.raise_for_status()
        # A frozenset keeps O(1) membership tests and, unlike a dict_keys view,
        # can be pickled together with the fitted transformer.
        return frozenset(response.json().keys())

    def transform(self, X, y=None):
        """Compute the spelling shares of every text.

        Each text is lower-cased, stripped of ASCII punctuation and tokenized.
        A word found in the British list counts towards ``british``; otherwise
        a word found in the American list counts towards ``american``.

        Args:
            X: Iterable of text strings.
            y: Ignored.

        Returns:
            A DataFrame with the columns ``british`` and ``american`` holding
            the matching share of words per text.
        """
        count = []

        for sentence in X:
            british_american = {'british': 0, 'american': 0}
            words = word_tokenize(re.sub('[%s]' % re.escape(string.punctuation), '', sentence.lower()))

            for word in words:
                if word in self.british:
                    british_american['british'] += 1/len(words)
                elif word in self.american:
                    british_american['american'] += 1/len(words)

            count.append(british_american)

        # Explicit columns keep the output schema stable even when X is empty.
        return pd.DataFrame(count, columns=['british', 'american'])


class CurrentWordsTransformer(BaseEstimator, TransformerMixin):
    """Share of words per text that occur in the spaCy model's vocabulary strings."""

    def fit(self, X, y=None):
        """Build ``words``, a lookup mapping every vocabulary string to 1.

        Returns:
            ``self``.
        """
        vocab_strings = spacy.load("en_core_web_sm").vocab.strings
        self.words = dict.fromkeys(vocab_strings, 1)
        return self

    def transform(self, X, y=None):
        """Compute the share of known words for every text.

        Each text is lower-cased, stripped of ASCII punctuation and tokenized;
        every token present in ``words`` contributes ``1/len(tokens)``.

        Args:
            X: Iterable of text strings.
            y: Ignored.

        Returns:
            A single-column DataFrame with one share per text.
        """
        punct_pattern = '[%s]' % re.escape(string.punctuation)
        shares = []

        for text in X:
            tokens = word_tokenize(re.sub(punct_pattern, '', text.lower()))
            share = 0
            for token in tokens:
                if self.words.get(token, 0) != 1:
                    continue
                share += 1/len(tokens)
            shares.append(share)

        return pd.DataFrame(shares)


### Polarity

In [7]:
# Per-text average positive ('pos') and negative ('neg') SentiWordNet scores.
polarity = PolarityTransformer()
polarity_features = polarity.fit_transform(df['text'])
polarity_features

Unnamed: 0,pos,neg
0,0.049479,0.020833
1,0.066667,0.125000
2,0.036585,0.021341
3,0.023026,0.013158
4,0.020161,0.036290
...,...,...
19574,0.000000,0.038043
19575,0.000000,0.000000
19576,0.025000,0.100000
19577,0.034722,0.006944


### Sentence Length

In [19]:
# Number of tokens per text once punctuation has been stripped.
sentence_len = SentenceLengthTransformer()
sentence_len_features = sentence_len.fit_transform(df['text'])
sentence_len_features

Unnamed: 0,0
0,41
1,14
2,36
3,34
4,27
...,...
19574,20
19575,10
19576,13
19577,15


### Word Length

In [20]:
# Mean token length (in characters) per text once punctuation has been stripped.
word_len = WordLengthTransformer()
word_len_features = word_len.fit_transform(df['text'])
word_len_features

Unnamed: 0,0
0,4.487805
1,4.071429
2,4.444444
3,4.970588
4,5.333333
...,...
19574,4.300000
19575,4.500000
19576,4.153846
19577,3.800000


### POS

In [10]:
# Per-text counts of each POS tag listed in POSTransformer.tag_map (one column per tag).
pos = POSTransformer()
pos_features = pos.fit_transform(df['text'])
pos_features

Unnamed: 0,AFX,CC,CD,DT,EX,FW,HYPH,IN,JJ,JJR,...,WP,WP$,WRB,ADD,NFP,GW,XX,BES,HVS,_SP
0,0,1,0,6,0,0,0,7,2,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,2,0,0,0,2,1,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,5,0,0,0,7,3,0,...,0,0,0,0,0,0,0,0,0,0
3,0,2,1,2,0,0,0,6,5,1,...,0,0,1,0,0,0,0,0,0,0
4,0,1,0,2,0,0,0,3,1,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19574,0,0,0,1,0,0,0,4,1,0,...,0,0,0,0,0,0,0,0,0,0
19575,0,0,0,2,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0
19576,0,0,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19577,0,0,0,2,0,0,0,3,0,0,...,0,0,0,0,0,0,0,0,0,0


### British American

In [11]:
# Share of words per text found in the British / American spelling lists.
# fit() downloads both lists over HTTP, so this cell needs network access.
british_american = BritishAmericanTransformer()
british_american_features = british_american.fit_transform(df['text'])
british_american_features

Unnamed: 0,british,american
0,0.0,0.0
1,0.0,0.0
2,0.0,0.0
3,0.0,0.0
4,0.0,0.0
...,...,...
19574,0.0,0.0
19575,0.0,0.0
19576,0.0,0.0
19577,0.0,0.0


### Current words

In [12]:
# Share of words per text that occur in the spaCy model's vocabulary strings.
current_words = CurrentWordsTransformer()
current_words_features = current_words.fit_transform(df['text'])
current_words_features

Unnamed: 0,0
0,0.975610
1,0.928571
2,0.916667
3,1.000000
4,1.000000
...,...
19574,0.950000
19575,0.800000
19576,0.769231
19577,1.000000


### 1-gram

In [13]:
# Unigram counts with English stop words removed, wrapped in a sparse-backed
# DataFrame. Columns are integer positions, not the vocabulary terms.
count_vec = CountVectorizer(stop_words='english')
one_gram = pd.DataFrame.sparse.from_spmatrix(count_vec.fit_transform(df['text']))
one_gram

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,24754,24755,24756,24757,24758,24759,24760,24761,24762,24763
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19574,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19575,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19576,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19577,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 2-gram

In [14]:
# NOTE: ngram_range=(1,2) keeps unigrams as well as bigrams, so this matrix
# contains the 1-gram features too, not bigrams only.
# The name count_vec is reused for this new vectorizer.
count_vec = CountVectorizer(stop_words='english', ngram_range=(1,2))
two_gram = pd.DataFrame.sparse.from_spmatrix(count_vec.fit_transform(df['text']))
two_gram

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,212565,212566,212567,212568,212569,212570,212571,212572,212573,212574
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19574,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19575,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19576,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19577,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### 3-gram

In [15]:
# NOTE: ngram_range=(1,3) keeps unigrams and bigrams as well as trigrams, so
# this matrix is not trigrams only.
# The name count_vec is reused for this new vectorizer.
count_vec = CountVectorizer(stop_words='english', ngram_range=(1,3))
three_gram = pd.DataFrame.sparse.from_spmatrix(count_vec.fit_transform(df['text']))
three_gram

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,397056,397057,397058,397059,397060,397061,397062,397063,397064,397065
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
19574,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19575,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19576,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
19577,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


### Combine

In [28]:
# Concatenate the polarity scores, the mean word length and the unigram counts
# column-wise into one feature matrix (sparse, as the output below shows).
union_features_pipe = FeatureUnion([('polarity', PolarityTransformer()),
                                    ('word_len', WordLengthTransformer()),
                                    ('one_gram', CountVectorizer(stop_words='english'))
                                    ])

union_features = union_features_pipe.fit_transform(df['text'])
union_features

<19579x24767 sparse matrix of type '<class 'numpy.float64'>'
	with 273106 stored elements in Compressed Sparse Row format>

# Model