In [1]:
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
import string
import re
import textmining
import matplotlib.pyplot as plt
from wordcloud import WordCloud, STOPWORDS
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn import svm
from sklearn.metrics import accuracy_score

# Load data

In [2]:
rawData = pd.read_csv('dataset/hm_train.csv')
rawData

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection
...,...,...,...,...,...
60316,88299,3m,I got together with my best friend and baked c...,1,bonding
60317,88300,3m,I went to a restaurant with friends,1,bonding
60318,88301,3m,The other day on Mechanical Turk I made over f...,1,achievement
60319,88302,3m,Finished the semester today and aced majority ...,2,achievement


In [3]:
# For time being, Select first 1000 rows of text for fast processing
rawData = rawData.iloc[:1000, ]
rawData.shape

(1000, 5)

In [4]:
rawData['predicted_category'].value_counts()

affection           339
achievement         323
bonding             114
enjoy_the_moment     99
leisure              63
nature               32
exercise             30
Name: predicted_category, dtype: int64

In [5]:
categories = sorted(rawData['predicted_category'].unique())
categories

['achievement',
 'affection',
 'bonding',
 'enjoy_the_moment',
 'exercise',
 'leisure',
 'nature']

In [6]:
# Can be used for exploring
affection = rawData[rawData['predicted_category'] == 'affection']
exercise = rawData[rawData['predicted_category'] == 'exercise']
bonding = rawData[rawData['predicted_category'] == 'bonding']
leisure = rawData[rawData['predicted_category'] == 'leisure']
achievement = rawData[rawData['predicted_category'] == 'achievement']
enjoy_the_moment = rawData[rawData['predicted_category'] == 'enjoy_the_moment']
nature = rawData[rawData['predicted_category'] == 'nature']

In [7]:
# Download Repository
# nltk.download('popular')
# OR nltk.download()

In [8]:
# Extract stopwords
stops = set(stopwords.words('English'))
# stops_ = set(STOPWORDS)
# stops.update(stops_)


# Remove punctuation marks
exclude = set(string.punctuation)

# Data Pre-Processing

### Removing stop words, punctuations, numbers, hypens, white spaces from text

In [9]:
# Text Pre-Processing

def clean(doc):
    doc_tokens = word_tokenize(doc.lower())
    stop_free = " ".join([i for i in doc_tokens if i not in stops])
    punc_free = "".join([i for i in stop_free if i not in exclude])
    num_free = "".join(i for i in punc_free if i is not i.isdigit())
    num_free_ = re.sub("\d+", "", num_free)
    hypen_free = "".join(num_free_).replace('-', ' ')
    result = num_free_.strip()
    result = re.sub(' +', ' ', result)
    return result

data_cleaned = [clean(rawData.iloc[i,2]) for i in range(0, rawData.shape[0])]
data_cleaned

['went successful date someone felt sympathy connection',
 'happy son got marks examination',
 'went gym morning yoga',
 'serious talk friends flaky lately understood good evening hanging',
 'went grandchildren butterfly display crohn conservatory',
 'meditated last night',
 'made new recipe peasant bread came spectacular',
 'got gift elder brother really surprising',
 'yesterday moms birthday enjoyed',
 'watching cupcake wars three teen children',
 'came rd place call duty video game',
 'completed miles run without break makes feel strong',
 'went movies friends fun',
 'shorting gold made trade',
 'hearing songs nearly impossible go angry happy re looking thought eases angry feeling moves direction happiness may take long re headed positive direction youall world good',
 'son performed well test preparation',
 'helped neighbour fix car damages',
 'managed get final trophy game playing',
 'hot kiss girl friend last night made day',
 'new bcaas came mail yay strawberry lemonade flavored

In [10]:
# Convert it to dataframe
data_cleaned = pd.DataFrame(data_cleaned)

# Rename the column
data_cleaned = data_cleaned.rename(columns = {0 : 'cleaned_hm'})

# Add Target variable to the dataframe
#data_cleaned['predicted_category'] = rawData['predicted_category']

data_cleaned

Unnamed: 0,cleaned_hm
0,went successful date someone felt sympathy con...
1,happy son got marks examination
2,went gym morning yoga
3,serious talk friends flaky lately understood g...
4,went grandchildren butterfly display crohn con...
...,...
995,ate chikfila
996,walked classroom students ran excitedly hugged
997,stray cat hanging back porch last night finall...
998,purchased new pants two sizes smaller usually buy


### Stemming

In [11]:
# Perform Stemming
def stemming(line):
    stem_sentence = []
    words = word_tokenize(line)
    ps = PorterStemmer()
    for w in words:
        root_word = ps.stem(w)
        stem_sentence.append(root_word)
    final_sentence = " ".join(stem_sentence)
    #print(final_sentence)
    return final_sentence

In [12]:
# Function call for stemming
stemmedData = [stemming(data_cleaned['cleaned_hm'][i]) for i in range(0, data_cleaned.shape[0])]

In [13]:
# Converting it to dataframe
stemmedData = pd.DataFrame(stemmedData)

# Renaming the column
stemmedData = stemmedData.rename(columns = {0 : 'cleaned_hm'})

stemmedData

Unnamed: 0,cleaned_hm
0,went success date someon felt sympathi connect
1,happi son got mark examin
2,went gym morn yoga
3,seriou talk friend flaki late understood good ...
4,went grandchildren butterfli display crohn con...
...,...
995,ate chikfila
996,walk classroom student ran excitedli hug
997,stray cat hang back porch last night final fou...
998,purchas new pant two size smaller usual buy


### Lemmatization

In [14]:
def lemmatize(line):
    lemmetized_sentence = []
    lemmatizer = WordNetLemmatizer()
    for w in line:
        l = lemmatizer.lemmatize(w)
        lemmetized_sentence.append(l)
    final_sentence = "".join(lemmetized_sentence)
    #print(final_sentence)
    return final_sentence

In [15]:
# Function call for lemmatizing
lemmatized_data = [lemmatize(stemmedData['cleaned_hm'][i]) for i in range(0, stemmedData.shape[0])]

In [16]:
# Convert it to dataframe
dataAsDF = pd.DataFrame(lemmatized_data)

# Rename the column
dataAsDF = dataAsDF.rename(columns = {0 : 'cleaned_hm'})

dataAsDF

Unnamed: 0,cleaned_hm
0,went success date someon felt sympathi connect
1,happi son got mark examin
2,went gym morn yoga
3,seriou talk friend flaki late understood good ...
4,went grandchildren butterfli display crohn con...
...,...
995,ate chikfila
996,walk classroom student ran excitedli hug
997,stray cat hang back porch last night final fou...
998,purchas new pant two size smaller usual buy


### Vectorizing Text Data

In [17]:
# COUNT VECTORIZER

# stops_ = set(STOPWORDS)
# stops.update(stops_)

count_vec = CountVectorizer(ngram_range = (1,3)) #stop_words=stops)
inputDF = pd.DataFrame(count_vec.fit_transform(dataAsDF['cleaned_hm']).toarray())

print(inputDF.shape)
# print(count_vec.get_feature_names())

(1000, 16377)


In [18]:
# TF-IDF VECTORIZER

# count_vec = TfidfVectorizer()
# inputDF = pd.DataFrame(count_vec.fit_transform(dataAsDF['cleaned_hm']).toarray())

# It is giving less ACCURACY

In [19]:
outputDF = pd.DataFrame(rawData['predicted_category'])
outputDF = outputDF.rename(columns = {'predicted_category' : 'category'})
print(outputDF.shape)
outputDF

(1000, 1)


Unnamed: 0,category
0,affection
1,affection
2,exercise
3,bonding
4,affection
...,...
995,enjoy_the_moment
996,bonding
997,enjoy_the_moment
998,achievement


In [20]:
# Convert target columns to categories
#outputDF = pd.DataFrame(outputDF.predicted_category.astype('category').cat.codes)
outputDF['category'] = outputDF['category'].map({'achievement':0, 'affection':1, 'bonding':2, 'enjoy_the_moment':3, 'exercise':4, 'leisure':5, 'nature':6})
outputDF

Unnamed: 0,category
0,1
1,1
2,4
3,2
4,1
...,...
995,3
996,2
997,3
998,0


### Splitting Data for Training and Testing

In [21]:
# Splitting Train and Test data
xtrain, xtest, ytrain, ytest = train_test_split(inputDF, outputDF, test_size = 0.2, random_state = 201)

In [22]:
print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

(800, 16377) (800, 1) (200, 16377) (200, 1)


### Creating Classification Models

#### Multinomial Naive Bayes Classifier

In [23]:
MNB_clf = MultinomialNB()
MNB_clf.fit(xtrain,ytrain)

predicted_train = MNB_clf.predict(xtrain)
train_accuracy = accuracy_score(ytrain, predicted_train, normalize = True, sample_weight=None)
print('Train accuracy: ', train_accuracy)

predicted_test = MNB_clf.predict(xtest)
test_accuracy = accuracy_score(ytest, predicted_test, normalize = True, sample_weight=None)
print('Test accuracy: ', test_accuracy)

#MNB_clf.score(xtest,ytest)

  y = column_or_1d(y, warn=True)


Train accuracy:  0.985
Test accuracy:  0.725


#### Decision Tree Classifier

In [24]:
DT_clf = DecisionTreeClassifier()
DT_clf.fit(xtrain,ytrain)
DT_clf.score(xtest,ytest)

0.68

#### Support Vector Machines

In [25]:
SVM_clf = svm.SVC()
SVM_clf.fit(xtrain, ytrain)
SVM_clf.score(xtest, ytest)

  y = column_or_1d(y, warn=True)


0.34

#### Random Forest Classifier

In [26]:
RF_clf = RandomForestClassifier()
RF_clf.fit(xtrain, ytrain)
RF_clf.score(xtest, ytest)

  


0.645

MultinomialNB classifier is yielding more Accuracy.

# Testing

In [27]:
test_data = pd.read_csv("dataset/hm_test.csv")
test_data

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
0,88305,3m,I spent the weekend in Chicago with my friends.,1
1,88306,3m,We moved back into our house after a remodel. ...,2
2,88307,3m,My fiance proposed to me in front of my family...,1
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1
4,88309,3m,I went out to a nice restaurant on a date with...,5
...,...,...,...,...
40208,128762,24h,My husband announced he is getting a decent bo...,1
40209,128763,24h,Had a can of Pepsi to drink.,1
40210,128764,24h,Cuddling with my girlfriend last night.,1
40211,128765,24h,I had a great meeting yesterday at work with m...,1


In [28]:
test_data.shape

(40213, 4)

In [29]:
test_data = test_data.iloc[:1000, ]
test_data.shape

(1000, 4)

### Removing stop words, punctuations, numbers, hypens, white spaces from text

In [30]:
test_data_cleaned = [clean(test_data.iloc[i,2]) for i in range(0, test_data.shape[0])]
test_data_cleaned = pd.DataFrame(test_data_cleaned)
test_data_cleaned = test_data_cleaned.rename(columns = {0 : 'cleaned_hm'})
test_data_cleaned

Unnamed: 0,cleaned_hm
0,spent weekend chicago friends
1,moved back house remodel lived hotel months du...
2,fiance proposed front family beginning march
3,ate lobster fancy restaurant friends
4,went nice restaurant date wife popular restaur...
...,...
995,went outside watered flowers yard
996,went favorite bakery got macarons
997,got promoted work go dream level worked years ...
998,heard back aunt received promotion


### Stemming

In [31]:
test_data_stemmed = [stemming(test_data_cleaned['cleaned_hm'][i]) for i in range(0, test_data_cleaned.shape[0])]

In [32]:
test_data_stemmed = pd.DataFrame(test_data_stemmed)
test_data_stemmed = test_data_stemmed.rename(columns = {0 : 'cleaned_hm'})
test_data_stemmed

Unnamed: 0,cleaned_hm
0,spent weekend chicago friend
1,move back hous remodel live hotel month due ho...
2,fianc propos front famili begin march
3,ate lobster fanci restaur friend
4,went nice restaur date wife popular restaur co...
...,...
995,went outsid water flower yard
996,went favorit bakeri got macaron
997,got promot work go dream level work year reach...
998,heard back aunt receiv promot


### Lemmetization

In [33]:
test_data_lemmetized = [lemmatize(test_data_stemmed['cleaned_hm'][i]) for i in range(0, test_data_stemmed.shape[0])]

In [34]:
test_data_lemmetized = pd.DataFrame(test_data_lemmetized)
test_data_lemmetized = test_data_lemmetized.rename(columns = {0 : 'cleaned_hm'})
test_data_lemmetized

Unnamed: 0,cleaned_hm
0,spent weekend chicago friend
1,move back hous remodel live hotel month due ho...
2,fianc propos front famili begin march
3,ate lobster fanci restaur friend
4,went nice restaur date wife popular restaur co...
...,...
995,went outsid water flower yard
996,went favorit bakeri got macaron
997,got promot work go dream level work year reach...
998,heard back aunt receiv promot


### Vectorizing Text data

In [35]:
test_inputDF = pd.DataFrame(count_vec.transform(test_data_lemmetized['cleaned_hm']).toarray())
test_inputDF.shape

(1000, 16377)

In [36]:
predicted_output = MNB_clf.predict(test_inputDF)
predicted_output = pd.DataFrame(predicted_output)
predicted_output = predicted_output.rename(columns = {0 : 'predicted_output'})
predicted_output['predicted_output'] = predicted_output['predicted_output'].map({0:'achievement', 1:'affection', 2:'bonding', 3:'enjoy_the_moment', 4:'exercise', 5:'leisure', 6:'nature'})
predicted_output

Unnamed: 0,predicted_output
0,affection
1,affection
2,affection
3,bonding
4,affection
...,...
995,affection
996,affection
997,achievement
998,achievement


In [37]:
final_predicted = pd.DataFrame(test_data[['hmid','cleaned_hm']])
final_predicted['predicted_output'] = predicted_output
final_predicted

Unnamed: 0,hmid,cleaned_hm,predicted_output
0,88305,I spent the weekend in Chicago with my friends.,affection
1,88306,We moved back into our house after a remodel. ...,affection
2,88307,My fiance proposed to me in front of my family...,affection
3,88308,I ate lobster at a fancy restaurant with some ...,bonding
4,88309,I went out to a nice restaurant on a date with...,affection
...,...,...,...
995,89304,I went outside and watered some flowers in my ...,affection
996,89305,We went to my favorite bakery and got macarons.,affection
997,89306,I got promoted at work. I go the y dream level...,achievement
998,89307,I heard back from my aunt that she received a ...,achievement


In [38]:
# Counts of Unique values
final_predicted['predicted_output'].value_counts()

affection           521
achievement         404
bonding              59
enjoy_the_moment      8
leisure               5
exercise              3
Name: predicted_output, dtype: int64

In [39]:
# Writting to csv file
final_predicted.to_csv('final_predicted.csv', index=False)

# DONE