In [None]:
# load all necessary libraries
import pandas as pd
import numpy as np
import xgboost as xgb
from tqdm import tqdm
from sklearn.svm import SVC
from keras.layers.embeddings import Embedding
from keras.utils import np_utils
from sklearn import preprocessing, decomposition, model_selection, metrics, pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from nltk import word_tokenize
from nltk.corpus import stopwords
# English stop-word list from NLTK; sent2vec() filters tokens against it before looking up embeddings.
stop_words = stopwords.words('english')

In [None]:
# Read the training and test splits from the working directory (train first, then test).
train, test = (pd.read_csv(csv_path) for csv_path in ('train.csv', 'test.csv'))
# Peek at the first rows of the test split to confirm it loaded as expected.
test.head()

In [None]:
# Report (rows, columns) for each split, one per line: train first, then test.
print(train.shape, test.shape, sep='\n')

The training set has more than 100 thousand rows, which is plenty for training.

Next, let's look at the number of labels (classes) we want to predict.

In [None]:
# Show the distinct label values, then the number of rows carrying each label.
print(train['label'].unique(), train['label'].value_counts(), sep='\n')

There are 23 classes in total. Their distribution is fairly balanced, so we don't need to worry about handling imbalanced labels. Let's move on to the next analysis steps.

In [None]:
# Shuffle the training rows and, optionally, subsample them.
# SAMPLE_FRAC = 1.0 keeps every row (shuffle only); lower it (e.g. 0.1) to work
# on a smaller subset so the models train faster while experimenting.
SAMPLE_FRAC = 1.0
# A fixed seed makes the shuffle (and everything computed from it) reproducible.
train = train.sample(frac=SAMPLE_FRAC, random_state=42).reset_index(drop=True)
train.shape

Now we use scikit-learn's LabelEncoder to convert all labels from strings to integers: 0, 1, 2, 3, ..., 22.

In [None]:
# Map every string label to an integer class id.
lbl_enc = preprocessing.LabelEncoder()
lbl_enc.fit(train['label'].values)
train_label = lbl_enc.transform(train['label'].values)
# Sanity check: the encoded ids together with the number of rows per id.
np.unique(train_label, return_counts=True)

Since labels for the test set are not provided, we use the train_test_split function to hold out part of the training data as a validation set for measuring model accuracy. This hold-out set also gives quick feedback: my submission code used 5-fold cross-validation, and the entire validation process took approximately 8 hours to finish.

In [None]:
# Hold out 20% of the labelled rows for validation. Stratifying on the encoded
# label keeps the class proportions the same in both parts; the seed fixes the split.
xtrain, xvalid, ytrain, yvalid = train_test_split(
    train['Text'].values,
    train_label,
    test_size=0.2,
    shuffle=True,
    stratify=train_label,
    random_state=42,
)
print(xtrain.shape, xvalid.shape, sep='\n')

## Comparing a logistic model and SVM using tfidf as feature

In this step, I use TfidfVectorizer to conveniently convert the raw text into TF-IDF feature vectors.

In [None]:
# TF-IDF over word 1- to 3-grams: drop terms seen in fewer than 5 documents,
# strip accents, remove English stop words and use sublinear (log-scaled) tf.
# The boolean flags are real booleans because recent scikit-learn rejects ints here.
tfv = TfidfVectorizer(min_df=5, max_features=None, strip_accents='unicode', analyzer='word',
                      token_pattern=r'\w{1,}', ngram_range=(1, 3), use_idf=True, smooth_idf=True,
                      sublinear_tf=True, stop_words='english')

# Learn the vocabulary and idf weights ONCE, from the training and validation text
# together (deliberate semi-supervised choice: no labels are involved).
tfv.fit(list(xtrain) + list(xvalid))
# Only transform() from here on. Calling fit_transform() again would re-learn the
# vocabulary per split and give the two matrices different, incompatible columns.
xtrain_tfv = tfv.transform(xtrain)
xvalid_tfv = tfv.transform(xvalid)

# Fitting a simple Logistic Regression on TFIDF
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

# Multi-class log loss on the hold-out set; `labels` pins the column order of `predictions`.
print("logloss: %0.3f " % metrics.log_loss(yvalid, predictions, labels=clf.classes_))

In [None]:
# Inspect the TF-IDF weights of a few training documents as a (term x document) table.
N_PREVIEW_DOCS = 5  # densifying the whole matrix would not fit in memory

# The feature names live on the fitted vectorizer, not on the sparse matrix.
# get_feature_names_out() superseded get_feature_names() in newer scikit-learn; support both.
_get_names = getattr(tfv, 'get_feature_names_out', None) or tfv.get_feature_names
feature_names = np.asarray(_get_names())
assert xtrain_tfv.shape[1] == len(feature_names), \
    'vectorizer vocabulary does not match the columns of xtrain_tfv'

preview = xtrain_tfv[:N_PREVIEW_DOCS]
# Keep only the terms that actually occur in the previewed documents.
used_terms = np.unique(preview.nonzero()[1])
corpus_index = ['doc_%d' % i for i in range(preview.shape[0])]
df = pd.DataFrame(preview[:, used_terms].T.toarray(),
                  index=feature_names[used_terms], columns=corpus_index)
df

## Fitting an SVM

In [None]:
# Reduce the sparse TF-IDF matrix to 120 dense components with truncated SVD
# (120-200 components are usually enough for an SVM). The SVD is fitted on the
# training matrix only and then applied to both splits.
svd = decomposition.TruncatedSVD(n_components=120, random_state=42)
svd.fit(xtrain_tfv)
xtrain_svd = svd.transform(xtrain_tfv)
xvalid_svd = svd.transform(xvalid_tfv)

# Standardise the SVD components (SVMs are sensitive to feature scale). The scaled
# copies get new names so the unscaled xtrain_svd / xvalid_svd stay available.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svd)
xtrain_svd_scl = scl.transform(xtrain_svd)
xvalid_svd_scl = scl.transform(xvalid_svd)

# Fitting a simple SVM; probability=True because log loss needs class probabilities.
clf = SVC(C=1.0, probability=True, random_state=42)
clf.fit(xtrain_svd_scl, ytrain)
predictions = clf.predict_proba(xvalid_svd_scl)

# Multi-class log loss on the hold-out set; `labels` pins the column order of `predictions`.
print("logloss: %0.3f " % metrics.log_loss(yvalid, predictions, labels=clf.classes_))

## Fitting a logistic model on word counts

In [None]:
# Raw term counts over word 1- to 3-grams, with English stop words removed.
ctv = CountVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 3),
    stop_words='english',
)

# The vocabulary is learned from the training and validation text together
# (semi-supervised: no labels involved); each split is then encoded with it.
ctv.fit([*xtrain, *xvalid])
xtrain_ctv, xvalid_ctv = ctv.transform(xtrain), ctv.transform(xvalid)

In [None]:
# Fitting a simple Logistic Regression on the n-gram count features
clf = LogisticRegression(C=1.0)
clf.fit(xtrain_ctv, ytrain)
predictions = clf.predict_proba(xvalid_ctv)

# Multi-class log loss on the hold-out set; `labels` pins the column order of `predictions`.
print("logloss: %0.3f " % metrics.log_loss(yvalid, predictions, labels=clf.classes_))

## Fitting a simple Naive Bayes model on tfidf

In [None]:
# Fitting a simple Naive Bayes on the TF-IDF features
clf = MultinomialNB()
clf.fit(xtrain_tfv, ytrain)
predictions = clf.predict_proba(xvalid_tfv)

# Multi-class log loss on the hold-out set; `labels` pins the column order of `predictions`.
print("logloss: %0.3f " % metrics.log_loss(yvalid, predictions, labels=clf.classes_))

## Implement the GloVe
Now, this is the most fun and interesting part. GloVe, introduced by researchers at Stanford University, is an effective algorithm for obtaining vector representations of words. In my experience, this word-embedding method worked considerably better than word2vec, which I also tried in this competition.

In [None]:
# Load the GloVe vectors into a dictionary: word -> float32 coefficient array.
# Each line of the file is "<word> <coef_1> ... <coef_d>", separated by whitespace.

embeddings_index = {}
# `with` guarantees the file is closed even if parsing a line raises.
with open('glove.6B.50d.txt', encoding="utf8") as f:
    for line in tqdm(f):
        values = line.split()
        if not values:
            # Skip blank lines instead of failing on values[0].
            continue
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

In [None]:
# Preview a handful of vocabulary entries; displaying every key would flood the output.
list(embeddings_index)[:10]

In [None]:
# Example lookup: display the embedding vector stored for the word 'killed'.
embeddings_index['killed']

In [None]:
# this function creates a normalized vector for the whole sentence (here we transform entire document into a vector)
def sent2vec(s):
    """Turn a piece of text into one unit-length vector built from its word embeddings.

    The text is lower-cased and tokenized with ``word_tokenize``; stop words,
    non-alphabetic tokens and words missing from ``embeddings_index`` are dropped.
    The remaining word vectors are summed and the sum is scaled to unit L2 norm.

    Parameters
    ----------
    s : object
        The document; it is converted with ``str(s)`` before processing.

    Returns
    -------
    numpy.ndarray
        1-D vector with the same length as the loaded embeddings. An all-zero
        vector is returned when no token has an embedding (or the sum has zero
        norm), so every document yields a vector of the same shape.
    """
    tokens = word_tokenize(str(s).lower())  # tokenize the sentence

    # Keep alphabetic, non-stop-word tokens that actually have an embedding.
    M = [
        embeddings_index[w]
        for w in tokens
        if w.isalpha() and w not in stop_words and w in embeddings_index
    ]

    if not M:
        if embeddings_index:
            # Size the fallback from the loaded vectors (e.g. 50 for glove.6B.50d);
            # a hard-coded length would produce ragged rows downstream.
            return np.zeros_like(np.asarray(next(iter(embeddings_index.values()))))
        # No embeddings loaded at all: keep the historical 300-d fallback.
        return np.zeros(300)

    v = np.array(M).sum(axis=0)
    norm = np.sqrt((v ** 2).sum())
    if norm == 0:
        # Word vectors cancelled out exactly; avoid dividing by zero (NaNs).
        return np.zeros_like(v)
    return v / norm

In [None]:
# Embed every training and validation document with sent2vec (tqdm shows progress).
xtrain_glove = list(map(sent2vec, tqdm(xtrain)))
xvalid_glove = list(map(sent2vec, tqdm(xvalid)))

In [None]:
# Preview the first few sentence vectors; displaying all of them would flood the output.
xtrain_glove[:3]

In [None]:
# Convert the sentence-vector lists and the label vectors to NumPy arrays.
xtrain_glove, xvalid_glove = np.array(xtrain_glove), np.array(xvalid_glove)
ytrain, yvalid = np.array(ytrain), np.array(yvalid)

In [None]:
# Apply SVD to the GloVe sentence vectors. Up to 200 components are requested, but
# the count is capped below the embedding width: asking for more components than
# the data has features (e.g. 200 from 50-d GloVe vectors) is invalid.
n_components = max(1, min(200, xtrain_glove.shape[1] - 1))
svd = decomposition.TruncatedSVD(n_components=n_components, random_state=42)
svd.fit(xtrain_glove)
xtrain_svm = svd.transform(xtrain_glove)
xvalid_svm = svd.transform(xvalid_glove)

# Standardise the SVD components. The scaled copies get new names so the
# unscaled xtrain_svm / xvalid_svm stay available.
scl = preprocessing.StandardScaler()
scl.fit(xtrain_svm)
xtrain_svm_scl = scl.transform(xtrain_svm)
xvalid_svm_scl = scl.transform(xvalid_svm)

In [None]:
# Fitting a polynomial-kernel SVM on the GloVe sentence vectors.
# The model is trained and evaluated on the same 2-D (documents x dimensions)
# matrix layout; adding an extra axis to the training data makes it 3-D, which
# scikit-learn estimators reject.
clf = SVC(C=1.0, kernel="poly")  # hard class labels only (probability is not enabled)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict(xvalid_glove)

# predict() returns labels rather than probabilities, so report accuracy instead of log loss.
print("accuracy: %0.3f " % metrics.accuracy_score(yvalid, predictions))

In [None]:
# Fitting a simple xgboost (default hyper-parameters) on the GloVe sentence vectors
clf = xgb.XGBClassifier(nthread=10, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

# Multi-class log loss on the hold-out set; `labels` pins the column order of `predictions`.
print("logloss: %0.3f " % metrics.log_loss(yvalid, predictions, labels=clf.classes_))

In [None]:
# Fitting a tuned xgboost on the GloVe sentence vectors: deeper trees, more boosting
# rounds, and row/column subsampling (0.8 each) compared with the default model.
clf = xgb.XGBClassifier(max_depth=7, n_estimators=200, colsample_bytree=0.8,
                        subsample=0.8, nthread=10, learning_rate=0.1, silent=False)
clf.fit(xtrain_glove, ytrain)
predictions = clf.predict_proba(xvalid_glove)

# Multi-class log loss on the hold-out set; `labels` pins the column order of `predictions`.
print("logloss: %0.3f " % metrics.log_loss(yvalid, predictions, labels=clf.classes_))