In [1]:
from google.colab import drive 
drive.mount('/content/gdrive')

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


Using TensorFlow backend.


## Making pandas dataframes for training and test data

In [0]:
dftrain=pandas.read_csv('/content/gdrive/My Drive/Hackerearth (1)/7c9cd64e078411e9 (1)/dataset (1)/hm_train.csv')

In [0]:
dftest=pandas.read_csv('/content/gdrive/My Drive/Hackerearth (1)/7c9cd64e078411e9 (1)/dataset (1)/hm_test.csv')


## Exploratory data analysis

In [4]:
dftrain.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


In [5]:
dftrain.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60321 entries, 0 to 60320
Data columns (total 5 columns):
hmid                  60321 non-null int64
reflection_period     60321 non-null object
cleaned_hm            60321 non-null object
num_sentence          60321 non-null int64
predicted_category    60321 non-null object
dtypes: int64(2), object(3)
memory usage: 2.3+ MB


In [6]:
dftrain

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection
5,27678,24h,I meditated last night.,1,leisure
6,27679,24h,"I made a new recipe for peasant bread, and it ...",1,achievement
7,27680,24h,I got gift from my elder brother which was rea...,1,affection
8,27681,24h,YESTERDAY MY MOMS BIRTHDAY SO I ENJOYED,1,enjoy_the_moment
9,27682,24h,Watching cupcake wars with my three teen children,1,affection


In [7]:
dftest.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
0,88305,3m,I spent the weekend in Chicago with my friends.,1
1,88306,3m,We moved back into our house after a remodel. ...,2
2,88307,3m,My fiance proposed to me in front of my family...,1
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1
4,88309,3m,I went out to a nice restaurant on a date with...,5


## Generating a new training dataframe for feature(text) and label (category), and a new test dataframe having sequence of texts.

In [0]:
trainDF = pandas.DataFrame()
trainDF['text'] = dftrain['cleaned_hm']
trainDF['label'] = dftrain['predicted_category']

In [0]:
testDF=pandas.DataFrame()
testDF['text']=dftest['cleaned_hm']

## splitting the dataset into training and validation datasets

In [0]:
 train_x, valid_x, train_y, valid_y = model_selection.train_test_split(trainDF['text'], trainDF['label'],random_state=0)

## label encoding the target variable (representing categories mathematically)

In [0]:
encoder = preprocessing.LabelEncoder()
train_y = encoder.fit_transform(train_y)
valid_y = encoder.fit_transform(valid_y)

## Converting text data into feature vectors using different approaches

In [0]:
# creating a count vectorizer object 
count_vect = CountVectorizer(analyzer='word', token_pattern=r'\w{1,}')
count_vect.fit(trainDF['text'])

# transform the training and validation data using count vectorizer object
xtrain_count =  count_vect.transform(train_x)
xvalid_count =  count_vect.transform(valid_x)

In [0]:
xtest_count =  count_vect.transform(testDF['text'])

In [0]:
# word level tf-idf
tfidf_vect = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', max_features=5000)
tfidf_vect.fit(trainDF['text'])
xtrain_tfidf =  tfidf_vect.transform(train_x)
xvalid_tfidf =  tfidf_vect.transform(valid_x)

xtest_tfidf =  tfidf_vect.transform(testDF['text'])

# ngram level tf-idf 
tfidf_vect_ngram = TfidfVectorizer(analyzer='word', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram.fit(trainDF['text'])
xtrain_tfidf_ngram =  tfidf_vect_ngram.transform(train_x)
xvalid_tfidf_ngram =  tfidf_vect_ngram.transform(valid_x)

xtest_tfidf_ngram =  tfidf_vect.transform(testDF['text'])

# characters level tf-idf
tfidf_vect_ngram_chars = TfidfVectorizer(analyzer='char', token_pattern=r'\w{1,}', ngram_range=(2,3), max_features=5000)
tfidf_vect_ngram_chars.fit(trainDF['text'])
xtrain_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(train_x) 
xvalid_tfidf_ngram_chars =  tfidf_vect_ngram_chars.transform(valid_x)

xtest_tfidf_ngram_chars =  tfidf_vect.transform(testDF['text'])

# Training Function

In [0]:
def train_model(classifier, feature_vector_train, label, feature_vector_valid, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_valid)
    
    if is_neural_net:
        predictions = predictions.argmax(axis=-1)
    
    return metrics.accuracy_score(predictions, valid_y)

## Using Naive Bayes Classifier

In [16]:
# Naive Bayes on Count Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_count, train_y, xvalid_count)
print("NB, Count Vectors: ", accuracy)

# Naive Bayes on Word Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf, train_y, xvalid_tfidf)
print("NB, WordLevel TF-IDF: ", accuracy)

# Naive Bayes on Ngram Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("NB, N-Gram Vectors: ", accuracy)

# Naive Bayes on Character Level TF IDF Vectors
accuracy = train_model(naive_bayes.MultinomialNB(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("NB, CharLevel Vectors: ", accuracy)

NB, Count Vectors:  0.8163914859757311
NB, WordLevel TF-IDF:  0.7723625754260327
NB, N-Gram Vectors:  0.759697632782972
NB, CharLevel Vectors:  0.755719116769445


decent accuracy achieved

## Using Linear Classifier

In [17]:
# Linear Classifier on Count Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_count, train_y, xvalid_count)
print("LR, Count Vectors: ", accuracy)

# Linear Classifier on Word Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xvalid_tfidf)
print("LR, WordLevel TF-IDF: ", accuracy)

# Linear Classifier on Ngram Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("LR, N-Gram Vectors: ", accuracy)

# Linear Classifier on Character Level TF IDF Vectors
accuracy = train_model(linear_model.LogisticRegression(), xtrain_tfidf_ngram_chars, train_y, xvalid_tfidf_ngram_chars)
print("LR, CharLevel Vectors: ", accuracy)



LR, Count Vectors:  0.9293813407598965
LR, WordLevel TF-IDF:  0.9135998939062396
LR, N-Gram Vectors:  0.813871759167164
LR, CharLevel Vectors:  0.8720244015648829


Thats a pretty good accuracy using Linear classifier on count vectors

## Using SVM

In [18]:
accuracy = train_model(svm.SVC(), xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram)
print("SVM, N-Gram Vectors: ", accuracy)



SVM, N-Gram Vectors:  0.3489158543863139


took training time but result is bad

## Using Random Forest Classifier

In [19]:
# RF on Count Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_count, train_y, xvalid_count)
print("RF, Count Vectors: ", accuracy)

# RF on Word Level TF IDF Vectors
accuracy = train_model(ensemble.RandomForestClassifier(), xtrain_tfidf, train_y, xvalid_tfidf)
print("RF, WordLevel TF-IDF: ", accuracy)



RF, Count Vectors:  0.7959021285060672




RF, WordLevel TF-IDF:  0.8016046681254558


## Gradient boosting

In [20]:
# Extereme Gradient Boosting on Count Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_count.tocsc(), train_y, xvalid_count.tocsc())
print("Xgb, Count Vectors: ", accuracy)

# Extereme Gradient Boosting on Word Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf.tocsc(), train_y, xvalid_tfidf.tocsc())
print("Xgb, WordLevel TF-IDF: ", accuracy)

# Extereme Gradient Boosting on Character Level TF IDF Vectors
accuracy = train_model(xgboost.XGBClassifier(), xtrain_tfidf_ngram_chars.tocsc(), train_y, xvalid_tfidf_ngram_chars.tocsc())
print("Xgb, CharLevel Vectors: ", accuracy)

Xgb, Count Vectors:  0.7867515416749552
Xgb, WordLevel TF-IDF:  0.7852264438697699
Xgb, CharLevel Vectors:  0.8004111133213978


## Using a shallow neural network

In [21]:
def create_model_architecture(input_size):
    # create input layer 
    input_layer = layers.Input((input_size, ), sparse=True)
    
    # create hidden layer
    hidden_layer = layers.Dense(100, activation="relu")(input_layer)
    
    # create output layer
    output_layer = layers.Dense(1, activation="sigmoid")(hidden_layer)

    classifier = models.Model(inputs = input_layer, outputs = output_layer)
    classifier.compile(optimizer=optimizers.Adam(), loss='binary_crossentropy')
    return classifier 

classifier = create_model_architecture(xtrain_tfidf_ngram.shape[1])
accuracy = train_model(classifier, xtrain_tfidf_ngram, train_y, xvalid_tfidf_ngram, is_neural_net=True)
print("NN, Ngram Level TF IDF Vectors",  accuracy)

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Use tf.cast instead.
Epoch 1/1
NN, Ngram Level TF IDF Vectors 0.3306809893243154


## Making function for returning predicted values using a classifier on test set.

In [0]:
def get_pred(classifier, feature_vector_train, label, feature_vector_test, is_neural_net=False):
    # fit the training dataset on the classifier
    classifier.fit(feature_vector_train, label)
    
    # predict the labels on validation dataset
    predictions = classifier.predict(feature_vector_test)
    return predictions

## Since linear classifier on count vectors gave best accuracy so using it for test predictions

In [23]:
#Gen_pred=get_pred(linear_model.LogisticRegression(), xtrain_count, train_y, xtest_count)
Gen_pred=get_pred(linear_model.LogisticRegression(), xtrain_tfidf, train_y, xtest_tfidf)




## Making dataframe for holding predicted values

In [24]:
P=pandas.DataFrame()
Idh = pandas.Series([])
Cat=pandas.Series([])
for i in range(len(Gen_pred)):
  Idh[i]=dftest['hmid'].iloc[i]
  if Gen_pred[i]==0:
    Cat[i]='achievement'
  elif Gen_pred[i]==1:
    Cat[i]='affection'
  elif Gen_pred[i]==2:
    Cat[i]='bonding'
  elif Gen_pred[i]==3:
    Cat[i]='enjoy_the_moment'
  elif Gen_pred[i]==4:
    Cat[i]='exercise'
  elif Gen_pred[i]==5:
    Cat[i]='leisure'
  elif Gen_pred[i]==6:
    Cat[i]='nature'
P.insert(0, 'hmid', Idh)
P.insert(1, 'predicted_category', Cat)
P


Unnamed: 0,hmid,predicted_category
0,88305,bonding
1,88306,achievement
2,88307,affection
3,88308,bonding
4,88309,affection
5,88310,achievement
6,88311,achievement
7,88312,affection
8,88313,leisure
9,88314,affection


## Generating predicted file in csv format

In [0]:
P.to_csv("my_submission2.csv", index=False)

In [0]:
from google.colab import files
files.download('my_submission2.csv')