In [1]:
from IPython.html.services.config import ConfigManager
from IPython.paths import locate_profile
cm = ConfigManager(profile_dir=locate_profile(get_ipython().profile))

cm.update('notebook', {"load_extensions": {"livereveal/main": True}})
cm.update('livereveal', {
    'theme': 'simple',
    'transition': 'linear',
    'slideNumber': True,
    'start_slideshow_at': 'selected',
    'scroll': True,
})



{'scroll': True,
 'slideNumber': True,
 'start_slideshow_at': 'selected',
 'theme': 'simple',
 'transition': 'linear'}

# Text classification

![](./img/categorization.png)

How can a computer learn to classify things?

![](./img/classification.png)

But how documents are represented as vectors?

![Document term matrix](./img/docterm.png)

![Tfidf](./img/tfidf.png)

## Building a classification pipeline

### ...with `scikit-learn`

Notations:

* `X` is a 2D matrix, rows represents data points, columns contains feature values
* `y` is a 1D array containing the labels

In [2]:
class SklearnPredictor:
    def fit(self, X, y):
        """
        Learning to classify from the data
        """
        return self
    
    def predict(X):
        """
        Predict a label for all examples in X
        """
        return predictions
        
class SklearnTransformer:
    def fit(self, X, y=None):
        """
        Learning to transforming the data
        """
        return self
    
    def transform(X):
        """
        Transform all the data in X
        """
        return transformed_data

In [3]:
from sklearn.pipeline import Pipeline

Pipeline([
    ("transformer1", SklearnTransformer()),
    ("transformer2", SklearnTransformer()), # ...
    ("mypredictor", SklearnPredictor()),
])

Pipeline(steps=[('transformer1', <__main__.SklearnTransformer object at 0x10a67fc18>), ('transformer2', <__main__.SklearnTransformer object at 0x10a67fba8>), ('mypredictor', <__main__.SklearnPredictor object at 0x10a67fc50>)])

# Categorizing Hungarian news articles

Have a look at the data

In [4]:
import pandas as pd

df = pd.read_csv("./data/index_articles.csv")
df.head()

Unnamed: 0,Title,Body,category,url,text
0,Lángoló kamion miatt zárták le az M5-öst Szege...,Teljes terjedelmében ég egy kamion szombat kor...,belfold,http://index.hu/belfold/2017/06/03/langolo_kam...,Lángoló kamion miatt zárták le az M5-öst Szege...
1,Kigyulladt egy lakatlan épület a XII. kerületben,Kigyulladt egy kb. négyszáz négyzetméter alapt...,belfold,http://index.hu/belfold/2017/06/03/kigyulladt_...,Kigyulladt egy lakatlan épület a XII. kerületb...
2,Dubrovniknak elege lett a meztelen turistákból,A városi tanács döntése értelmében hamarosan p...,gazdasag,http://index.hu/gazdasag/2016/07/26/nincs_tobb...,Dubrovniknak elege lett a meztelen turistákból...
3,Húsz éve nem házasodtak ennyien Magyarországon,2016 januárja és novembere között a legmagasab...,gazdasag,http://index.hu/gazdasag/2017/02/14/husz_eve_n...,Húsz éve nem házasodtak ennyien Magyarországon...
4,Kútba esett egy asszony Nógrádban,Kútba esett egy asszony a Nógrád megyei Szurdo...,belfold,http://index.hu/belfold/hirek/2013/10/24/kutba...,Kútba esett egy asszony Nógrádban\n\nKútba ese...


In [5]:
len(df)

4972

In [6]:
df.category.value_counts()

belfold     2440
gazdasag    1573
tech         622
kultur       337
Name: category, dtype: int64

We usually split the data into training and test sets

In [7]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(df.text, df.category, test_size=0.33, random_state=42)

#### Now we can build the preprocessing pipeline

In [8]:
from spacy.hu import Hungarian

nlp = Hungarian()

def tokenize(text):
    return [tok.text for tok in nlp(text) if tok.is_alpha and not tok.is_stop]

tokenize("Hello világ. Itt vagyok.")

['Hello', 'világ']

As `spaCy` does not support lemmatization currently, we rely on simple stemming

In [9]:
import snowballstemmer

stemmer = snowballstemmer.stemmer('hungarian')
stemmer.stemWord("baglyom")

'bagly'

In [10]:
def tokenize_stem(text):
    return [stemmer.stemWord(tok.text) for tok in nlp(text) if tok.is_alpha and not tok.is_stop]

tokenize_stem("Hello világ. Itt vagyok.")

['Hell', 'világ']

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(tokenizer=tokenize_stem, ngram_range=(1,2), lowercase=True)
vectorizer.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=None, strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=<function tokenize_stem at 0x116de92f0>, use_idf=True,
        vocabulary=None)

Note that we used `ngram_range=(1,2)` that means we rely on words **and neighbouring word pairs** as features.

In [12]:
example_vector = vectorizer.transform(["Az oktatási miniszter is levélben szögezte le: covfafa"])
example_vector.nonzero()

(array([0, 0, 0, 0, 0, 0], dtype=int32),
 array([374579, 308404, 308327, 276716, 276542, 244104], dtype=int32))

In [13]:
doc_vector = example_vector[example_vector.nonzero()].tolist()[0]
doc_features = vectorizer.inverse_transform(example_vector)[0]
for feature, score in zip(doc_features, doc_vector):
    print(feature, score)

szögezt 0.45421955882361387
oktatás miniszter 0.4848001115368416
oktatás 0.2783541455813643
miniszter levél 0.5695599453959549
miniszter 0.23767731020511645
levél 0.3166755100853776


#### Build a predictor

In [14]:
X_transformed = vectorizer.transform(X_train)

In [15]:
from sklearn.svm import LinearSVC

classifier = LinearSVC()
classifier.fit(X_transformed, y_train)

LinearSVC(C=1.0, class_weight=None, dual=True, fit_intercept=True,
     intercept_scaling=1, loss='squared_hinge', max_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0)

In [16]:
classifier.predict(vectorizer.transform(["Az oktatási minisztere is levélben szögezte le: covfafa"]))

array(['belfold'], dtype=object)

#### By building a pipeline we simplify the whole process

In [17]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer


pipeline = make_pipeline(
    TfidfVectorizer(tokenizer=tokenize_stem, ngram_range=(1,2)),
    LinearSVC()
)

pipeline.fit(X_train, y_train)

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 2), norm='l2', preprocessor=None, smooth_i...ax_iter=1000,
     multi_class='ovr', penalty='l2', random_state=None, tol=0.0001,
     verbose=0))])

We used a simple linear SVM model, but good candidates could be:
* Multinomial Naive Bayes
* Logistic regression
* Decision trees

### Evaluate the model

**Accuracy** is a common metric used in classification evaluation.

$\mathrm{Accuracy} = \frac{\mathrm{\# correct\ labels}}{N}$

In [18]:
from sklearn.metrics import accuracy_score

y_tpred = pipeline.predict(X_train)
y_pred = pipeline.predict(X_test)

print("Train accuracy: {}".format(accuracy_score(y_train, y_tpred)))
print("Test accuracy : {}".format(accuracy_score(y_test, y_pred)))

Train accuracy: 1.0
Test accuracy : 0.890920170627666


#### Other metrics

![Precision, Recall](https://upload.wikimedia.org/wikipedia/commons/thumb/2/26/Precisionrecall.svg/440px-Precisionrecall.svg.png)

$\mathrm{F\ score} = 2 \cdot \frac{\mathrm{Precision} \cdot \mathrm{Recall}}{ \mathrm{Precision} + \mathrm{Recall}}$

In [19]:
from sklearn.metrics import classification_report

y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

    belfold       0.88      0.94      0.91       778
   gazdasag       0.89      0.88      0.88       523
     kultur       0.93      0.55      0.69       118
       tech       0.94      0.91      0.93       222

avg / total       0.89      0.89      0.89      1641



#### So far, so good, but what are the common errors of the classifier?

In [20]:
from sklearn.metrics import confusion_matrix

confusion_matrix(y_test, y_pred)

array([[733,  38,   3,   4],
       [ 57, 462,   1,   3],
       [ 36,  12,  65,   5],
       [  9,  10,   1, 202]])

#### Improve your classifier

* Collect more data.
* Collect even more data.
* [Diagnose whether your classifier suffers from  bias or variance](https://www.coursera.org/learn/machine-learning/lecture/yCAup/diagnosing-bias-vs-variance)
* Analyze the confusion matrix, and add new features if necessary.
* Fine tune your model through optimizing your hyperparameters (`GridSearch`)
* Try alternative methods such as [FastText](https://github.com/facebookresearch/fastText) (a neural network based text classification library from Facebook)

# Sentiment analysis

![Sentiment scale](./img/sentiment.png)

Classification!

## Data

In [21]:
import pandas as pd

pd.set_option("display.max_colwidth", 500)

sent_df = pd.read_csv("./data/twitter_emotion.csv", index_col=None)
sent_df.head(5)

Unnamed: 0,tweeter_tweetid,tweet,sentiment
0,_monana_626507788945125376,"@kadarmatyas én élőben tettem, a munkatàrsaidat nem olyan könnyű leblokkolni",3
1,aagitorok_620631032937775104,"ez a Conanes GRRM videó iszonyat béna, de nagyon hangosan röhögtem végig az egészet :DD",5
2,tocolade_559613056792399872,@KevinaZolvaso hihii elfelejtette behozni a dogát aztán meg jó akkor elhiszem hogy elolvastátok,4
3,niallphabetic_626856718690091009,"persze amúgy meg nem kell vissza írni...........................................nem tényleg nem, elég hogy csak elolvastad jólvan akkor.....",2
4,batacinti_537844599465988097,Elindultam dolgozni. Otthonhagytam a cigim és a kajàm. Egyszerencsétlen vagyok. Büdösek az emberek. Jóreggelt.,2


In [22]:
sent_df.sentiment = sent_df.sentiment.apply(lambda x: 1 if x >3 else -1 if x <3 else 0)
sent_df.head(3)

Unnamed: 0,tweeter_tweetid,tweet,sentiment
0,_monana_626507788945125376,"@kadarmatyas én élőben tettem, a munkatàrsaidat nem olyan könnyű leblokkolni",0
1,aagitorok_620631032937775104,"ez a Conanes GRRM videó iszonyat béna, de nagyon hangosan röhögtem végig az egészet :DD",1
2,tocolade_559613056792399872,@KevinaZolvaso hihii elfelejtette behozni a dogát aztán meg jó akkor elhiszem hogy elolvastátok,1


## Tfidf pipeline

In [23]:
from spacy.hu import Hungarian
nlp = Hungarian()

In [24]:
def is_useful_token(tok):
    return not (tok.like_url or tok.like_num) and "@" not in tok.text and "rt" != tok.text.lower()
    
def sent_tokenize(text):
    return [tok.text for tok in nlp(text) if is_useful_token(tok)]

def sent_tokenize_stem(text):
    return [stemmer.stemWord(tok) for tok in sent_tokenize(text)]


sent_tokenize_stem("rt @kadarmatyas Én élőben tettem http://index.hu 89.2 21,3 :) !")

['Én', 'élő', 'tett', ':)', '!']

In [25]:
from sklearn.pipeline import make_pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC

pipeline = make_pipeline(
    TfidfVectorizer(
        tokenizer=sent_tokenize_stem,
        ngram_range=(1, 2),
        lowercase=True,
        token_pattern=".*"),
    LinearSVC())

In [26]:
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix
import numpy as np
import random

X, y = sent_df.tweet, sent_df.sentiment

random.seed(42)
np.random.seed(42)

y_pred = cross_val_predict(pipeline, X, y, cv=5)
print(classification_report(y, y_pred))

cm = confusion_matrix(y, y_pred)
print(cm)
accuracy_score(y, y_pred)

             precision    recall  f1-score   support

         -1       0.49      0.62      0.55      1129
          0       0.53      0.36      0.43      1261
          1       0.63      0.66      0.65      1610

avg / total       0.56      0.56      0.55      4000

[[ 704  199  226]
 [ 408  455  398]
 [ 335  205 1070]]


0.55725000000000002

#### Evaluation in context

* A random classifier would result in 0.33 accuracy
* Agreement rate between humans is below 0.8

### Sentiment lexicons to the rescue?

In [27]:
pos_words = set(open("./data/PrecoSenti/PrecoPos.txt").read().strip().split())
neg_words = set(open("./data/PrecoSenti/PrecoNeg.txt").read().strip().split())

In [28]:
print(list(pos_words)[:5])
print(list(neg_words)[:5])

['rajongó', 'mesteri', 'hivatástudat', 'professzionálisan', 'megfelelően']
['vesztegető', 'vihar', 'agyonpuffant', 'nyűg', 'hazudós']


In [29]:
# Taken from the Pattern library
pos_emoticons = {
    "<3", u"♥", u"❤", ">:D", ":-D", ":D", "=-D", "=D", "X-D", "x-D", "XD",
    "xD", "8-D", ">:P", ":-P", ":P", ":-p", ":p", ":-b", ":b", ":c)", ":o)",
    ":^)", ">:)", ":-)", ":)", "=)", "=]", ":]", ":}", ":>", ":3", "8)", "8-)",
    ">;]", ";-)", ";)", ";-]", ";]", ";D", ";^)", "*-)", "*)"
}
neg_emoticons = {
    ">:/", ":-/", ":/", ":\\", ">:\\", ":-.", ":-s", ":s", ":S", ":-S", ">.>",
    ">:[", ":-(", ":(", "=(", ":-[", ":[", ":{", ":-<", ":c", ":-c", "=/",
    ":'(", ":'''(", ";'("
}

In [30]:
pos_emojis = {
    u"❤️", u"💜", u"💚", u"💙", u"💛", u"💕", u"😀", u"😄", u"😃", u"😆", u"😅", u"😂",
    u"😁", u"😻", u"😍", u"😈", u"👌", u"😛", u"😝", u"😜", u"😋", u"😇", u"😊", u"😌",
    u"😏", u"😎", u"☺", u"👍", u"😉"
}
neg_emojis = {
    u"😕", u"😬", u"😟", u"😒", u"😔", u"😞", u"😠", u"😩", u"😫", u"😡", u"👿", u"😢",
    u"😥", u"😓", u"😪", u"😭", u"😿"
}

In [31]:
positives = pos_words | pos_emoticons | pos_emojis
negatives = neg_words | neg_emoticons | neg_emojis

### Can't we just count the ratio of sentiment tokens?

In [32]:
def sentiment_polarizer(text):
    words = sent_tokenize(text)
    return [(1 if tok.lower() in positives else 
             (-1 if tok.lower()in negatives else 0)) 
            for tok in words]
            
sentiment_polarizer("Gagyi, :) nagyon jó!")

[-1, 0, 1, 0, 1, 0]

In [33]:
from sklearn.base import ClassifierMixin, BaseEstimator
from collections import Counter

X, y = sent_df.tweet, sent_df.sentiment

class SimpleSentiment(ClassifierMixin, BaseEstimator):
    def __init__(self, ratio_threshold=1.5):
        self.threshold = ratio_threshold
        
    def fit(self, X, y):
        return self
    
    def _decide(self, x):
        values = sentiment_polarizer(x) + [1, -1]
        val_counts = Counter(values)
        val_ratio = val_counts[1]/val_counts[-1]
        if val_ratio > self.threshold:
            return 1
        elif val_ratio < 1 / (self.threshold):
            return -1
        else:
            return 0
                                   
    def predict(self, X, *args):
        return [self._decide(x) for x in X]

In [34]:
y_pred = SimpleSentiment(1.5).predict(X)
print(classification_report(y, y_pred))

cm = confusion_matrix(y, y_pred)
print(cm)
accuracy_score(y, y_pred)

             precision    recall  f1-score   support

         -1       0.64      0.26      0.37      1129
          0       0.38      0.76      0.51      1261
          1       0.64      0.40      0.50      1610

avg / total       0.56      0.48      0.46      4000

[[295 682 152]
 [ 89 956 216]
 [ 79 880 651]]


0.47549999999999998

### Combining multiple information sources

In [39]:
from sklearn.pipeline import make_union
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import MaxAbsScaler

random.seed(42)
np.random.seed(42)

pipeline = make_pipeline(
    make_union(
        TfidfVectorizer(
            tokenizer=sent_tokenize,
            ngram_range=(1, 2),
            lowercase=True,
            token_pattern=".*"),
        make_pipeline(
            CountVectorizer(tokenizer=sentiment_polarizer),
            MaxAbsScaler()
        )
    ),
    LinearSVC()

)

In [40]:
y_pred = cross_val_predict(pipeline, X, y, cv=10)
print(classification_report(y, y_pred))

cm = confusion_matrix(y, y_pred)
print(cm)
accuracy_score(y, y_pred)

             precision    recall  f1-score   support

         -1       0.55      0.57      0.56      1129
          0       0.51      0.48      0.50      1261
          1       0.65      0.65      0.65      1610

avg / total       0.58      0.58      0.58      4000

[[ 648  265  216]
 [ 292  610  359]
 [ 242  318 1050]]


0.57699999999999996

### What's next?

* Collect **more** data
* Analyze the bias-variance of your model
* PoS tagged, lemmatized sentiment lexicon
* Use additional sentiment lexicons (such as [this emoji lexicon](http://kt.ijs.si/data/Emoji_sentiment_ranking/))
* Be creative with the features (handing negation, irony, sarcasm)
* Fine tune your model

Do you really want to classify sentiments on tweets? How about analyzing 
* clause level sentiment or
* entity level sentiment or
* aspect level sentiment?