In [None]:
import pandas as pd
import numpy as np
import re
import nltk
# nltk.download()
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split


In [68]:
X_train = pd.read_csv('hm_train.csv')
X_test = pd.read_csv('hm_test.csv')
X_test_copy = X_test.copy()
print(X_test.shape)
print(X_train.shape)

(40213, 4)
(60321, 5)


In [69]:
X_train.predicted_category.value_counts()

affection           20880
achievement         20274
bonding              6561
enjoy_the_moment     6508
leisure              4242
nature               1127
exercise              729
Name: predicted_category, dtype: int64

## Data above is very unbalanced.


In [70]:
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'exercise']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'nature']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'exercise']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'bonding']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'enjoy_the_moment']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'leisure']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'leisure']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'nature']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'nature']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'exercise']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'exercise']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'exercise']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'nature']])
X_train = pd.concat([X_train, X_train[X_train['predicted_category'] == 'bonding']])

In [71]:
X_train.predicted_category.value_counts()

bonding             26244
exercise            23328
affection           20880
achievement         20274
nature              18032
leisure             16968
enjoy_the_moment    13016
Name: predicted_category, dtype: int64

In [72]:
X_test = X_test.drop(columns=['hmid', 'reflection_period', 'num_sentence'])

In [73]:
y_train = X_train.predicted_category
X_train = X_train.drop(columns=['hmid', 'reflection_period', 'num_sentence', 'predicted_category'])
X_train.head()

Unnamed: 0,cleaned_hm
0,I went on a successful date with someone I fel...
1,I was happy when my son got 90% marks in his e...
2,I went to the gym this morning and did yoga.
3,We had a serious talk with some friends of our...
4,I went with grandchildren to butterfly display...


In [74]:
X_train.shape

(138742, 1)

In [75]:
y_train.shape

(138742,)

In [76]:
y_train.value_counts()

bonding             26244
exercise            23328
affection           20880
achievement         20274
nature              18032
leisure             16968
enjoy_the_moment    13016
Name: predicted_category, dtype: int64

In [77]:
print(X_test.shape)
print(X_train.shape)

(40213, 1)
(138742, 1)


In [78]:
STOPWORDS = list(set(stopwords.words('english')))

## Remove the Stopwords.

In [79]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

def text_prepare(text):
  
    text = text.lower() 
    text = REPLACE_BY_SPACE_RE.sub(' ', text)
    text = BAD_SYMBOLS_RE.sub('', text) 
    text = ' '.join([word for word in text.split() if word not in STOPWORDS])
    text = text.strip()
    return text

In [80]:
X_train.cleaned_hm = X_train.cleaned_hm.apply(text_prepare)
X_test.cleaned_hm = X_test.cleaned_hm.apply(text_prepare)

In [81]:
tags_counts = {}
words_counts = {}

from collections import Counter
tags_counts = Counter([word for line in X_train.cleaned_hm for word in line.split(' ')])
words_counts = Counter([word for line in X_test.cleaned_hm for word in line.split(' ')])
most_common_tags = sorted(tags_counts.items(), key=lambda x: x[1], reverse=True)[:3]
most_common_words = sorted(words_counts.items(), key=lambda x: x[1], reverse=True)[:3]
words_counts.most_common(4)

[('happy', 6815), ('got', 5270), ('made', 4213), ('time', 3659)]

In [82]:
print(most_common_words)
print(most_common_tags)

[('happy', 6815), ('got', 5270), ('made', 4213)]
[('happy', 22768), ('went', 22688), ('got', 14399)]


## Creating a bag of words representation

In [83]:
DICT_SIZE = 5000
VOCAB = words_counts.most_common(DICT_SIZE)
WORDS_TO_INDEX = {item[0]:ii for ii, item in enumerate(sorted(VOCAB, key=lambda x: x[1], reverse=True))}
INDEX_TO_WORDS = {ii:word for word, ii in WORDS_TO_INDEX.items()} 

def my_bag_of_words(text, words_to_index, dict_size):
    result_vector = np.zeros(dict_size)
 
    for word in text.split(' '):
        if word in words_to_index:
            result_vector[words_to_index[word]] +=1
    return result_vector

In [84]:
X_train.cleaned_hm[0]

'went successful date someone felt sympathy connection'

In [85]:
from scipy import sparse as sp_sparse

mtx = [sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train.cleaned_hm[0]]
mtx_1 = [sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train.cleaned_hm[1]]


In [86]:
from scipy import sparse as sp_sparse
X_train_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_train.cleaned_hm])
X_test_mybag = sp_sparse.vstack([sp_sparse.csr_matrix(my_bag_of_words(text, WORDS_TO_INDEX, DICT_SIZE)) for text in X_test.cleaned_hm])

In [87]:
len(INDEX_TO_WORDS)
# len(WORDS_TO_INDEX)

5000

In [88]:
print('X_train shape ', X_train_mybag.shape)
print('X_test shape ', X_test_mybag.shape)

X_train shape  (138742, 5000)
X_test shape  (40213, 5000)


In [89]:
X_train_mybag.toarray()

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [90]:
row = X_train_mybag[1].toarray()[0]
non_zero_elements_count = np.sum([1 for item in row if item != 0])
non_zero_elements_count

6

In [91]:
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import LogisticRegression, RidgeClassifier

X_train_split, X_test_split, y_train_split, y_test_split = train_test_split(X_train_mybag, y_train, 
                                                                            test_size=0.2)

model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
model.fit(X_train_split, y_train_split)
print(model.score(X_test_split, y_test_split))
print(model.score(X_train_split, y_train_split))



0.9446826912681539
0.961366933049832


In [92]:
model = OneVsRestClassifier(LogisticRegression(penalty='l2', C=1.0))
model.fit(X_train_mybag, y_train)

X_test_copy['predicted_category'] = model.predict(X_test_mybag)



In [93]:
X_test_copy = X_test_copy.drop(columns=['cleaned_hm', 'num_sentence', 'reflection_period'])
X_test_copy.shape


(40213, 2)

In [94]:
X_test_copy.to_csv('submission_1.csv', index=False)