<a href="https://colab.research.google.com/github/SDS-AAU/M3-2018/blob/master/assignments/individual/Easy_text_preprocessing.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Easy text-preprocessing approaches

- ##  Spacy and out of the box sentence embeddings
- ##  SKlearn and ML-style count-tfidf-vectorizer

In [0]:
# Imporr libraries

import pandas as pd
import numpy as np

import spacy

from sklearn.preprocessing import LabelEncoder
labelencoder = LabelEncoder()

# quick evaluation
from sklearn.metrics import classification_report

In [2]:
# Download the small standard English language model
!python -m spacy download en


[93m    Linking successful[0m
    /usr/local/lib/python3.6/dist-packages/en_core_web_sm -->
    /usr/local/lib/python3.6/dist-packages/spacy/data/en

    You can now load the model via spacy.load('en')



In [0]:
# Load up the model, so we can use it

nlp = spacy.load('en')

In [0]:
# Load up the data

data = pd.read_csv('https://github.com/SDS-AAU/M3-2018/raw/master/assignments/individual/data/train.csv')

In [0]:
# take a 1000 sample (because it's an example)

data = data.sample(n=1000)

In [0]:
# If you run this just like that, it will take around 10min for the whole dataset

data['spacy_sentence_vec'] = data['text'].map(lambda t: nlp(t).vector)

In [7]:
# the spacy_sentence_vec column now contains a vector representation of each sentence with 384 dimensions

data.head()

Unnamed: 0,id,text,author,spacy_sentence_vec
14147,id15567,"Mr. Blackwood has a pair of tailor's shears, a...",EAP,"[0.20423822, 0.7053059, 0.7660605, 1.2693121, ..."
15276,id18142,That gland is the great sense organ of organs ...,HPL,"[0.3282318, -0.2011276, 0.39009184, 1.2634265,..."
11247,id16883,His interest gradually veered away from the un...,HPL,"[0.44190902, 0.07274222, 0.103103265, 1.167332..."
14353,id16917,From the moment of my mother's death untill hi...,MWS,"[0.51801, 0.39364424, 0.21461968, 1.3217521, 0..."
2739,id09897,When I mention his weakness I have allusion to...,EAP,"[0.6794714, 0.32919943, 1.0377783, 0.80767155,..."


In [0]:
# This is how we transform the sequence of lists in rows into a numpy array
# We just use vstack - that stands for vertical stacking

X = np.vstack(data['spacy_sentence_vec'])

In [9]:
# As expected: a 1000 by 384 matrix

X.shape

(1000, 384)

In [0]:
# our y can be defined from the author column using a standard label_encoder

y = labelencoder.fit_transform(data['author'])

#### DONE - all you need to do if you run a simple deep neural network.

Let's see if that works using a simle Logistic Regression as a baseline model

In [0]:
# Split into train-test

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [0]:
# Load up a classifier

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

In [0]:
# You can experiment with some fancier stuff if you want
#from xgboost import XGBClassifier

#classifier = XGBClassifier()

In [14]:
# Fit the model 

classifier.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [15]:
print(classification_report(y_test, classifier.predict(X_test), target_names=labelencoder.classes_))

             precision    recall  f1-score   support

        EAP       0.69      0.61      0.64        79
        HPL       0.59      0.73      0.65        51
        MWS       0.70      0.67      0.69        70

avg / total       0.67      0.66      0.66       200



## Approach 2, using sklearn 

In [0]:
from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(stop_words='english', 
                             use_idf=True, 
                             smooth_idf=True)
X = vectorizer.fit_transform(data['text'])

#### Let's evaluate

In [17]:
# Split into train-test

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)


# Load up a classifier

from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression()

# Fit the model 

classifier.fit(X_train, y_train)

print(classification_report(y_test, classifier.predict(X_test), target_names=labelencoder.classes_))

             precision    recall  f1-score   support

        EAP       0.49      0.96      0.65        79
        HPL       0.78      0.14      0.23        51
        MWS       0.86      0.44      0.58        70

avg / total       0.69      0.57      0.52       200



Or do first the count-step and also try dimensionality reduction... SVD = LSI

In [0]:
# This will tokenize and count up each and any text-chunk
# Add a TFIDF transformer if you want to work from that end

from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(data['text'])

In [0]:
# This can be added to reduce dimensionality (using SVC rather than PCA because of sparse matrix input)
# Dimensionality reduction (not really helpful)

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components=500, algorithm='randomized', n_iter=10, random_state=42)
X = svd.fit_transform(X)