# <center>  ML INTERNSHIP - HACKEREARTH CHALLENGE

### <center>  1) importing necssary libraries

In [8]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
%matplotlib inline

### <center>2) importing data and analysis

In [9]:
train_df = pd.read_csv('hm_train.csv')
train_df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


In [10]:
test_df = pd.read_csv('hm_test.csv')
test_df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
0,88305,3m,I spent the weekend in Chicago with my friends.,1
1,88306,3m,We moved back into our house after a remodel. ...,2
2,88307,3m,My fiance proposed to me in front of my family...,1
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1
4,88309,3m,I went out to a nice restaurant on a date with...,5


In [11]:
train_df.describe(include=['object','bool','int','float']) # including object types also

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
count,60321.0,60321,60321,60321.0,60321
unique,,2,58454,,7
top,,24h,I WENT TO MOVIE,,affection
freq,,30455,76,,20880
mean,57996.92951,,,1.355946,
std,17501.024854,,,1.30816,
min,27673.0,,,1.0,
25%,42845.0,,,1.0,
50%,58001.0,,,1.0,
75%,73160.0,,,1.0,


In [12]:
test_df.describe(include=['object','bool','int','float']) #including object types also

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
count,40213.0,40213,40213,40213.0
unique,,2,38596,
top,,3m,I WENT TO TEMPLE,
freq,,20837,50,
mean,108539.500734,,,1.318007
std,11678.312178,,,1.280175
min,88305.0,,,1.0
25%,98433.0,,,1.0
50%,108538.0,,,1.0
75%,118655.0,,,1.0


In [13]:
train_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60321 entries, 0 to 60320
Data columns (total 5 columns):
hmid                  60321 non-null int64
reflection_period     60321 non-null object
cleaned_hm            60321 non-null object
num_sentence          60321 non-null int64
predicted_category    60321 non-null object
dtypes: int64(2), object(3)
memory usage: 2.3+ MB


**- clearly no missing values**

In [14]:
test_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40213 entries, 0 to 40212
Data columns (total 4 columns):
hmid                 40213 non-null int64
reflection_period    40213 non-null object
cleaned_hm           40213 non-null object
num_sentence         40213 non-null int64
dtypes: int64(2), object(2)
memory usage: 1.2+ MB


**- clearly no missing values**

In [15]:
train_df['cleaned_hm'].describe() 

count               60321
unique              58454
top       I WENT TO MOVIE
freq                   76
Name: cleaned_hm, dtype: object

**- unique != count there can be repeated messages**

In [16]:
train_df.groupby(['predicted_category'])['cleaned_hm'].describe() 

Unnamed: 0_level_0,count,unique,top,freq
predicted_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
achievement,20274,19832,I got a new job.,19
affection,20880,20384,My son gave me a big hug in the morning when I...,18
bonding,6561,6335,The offsite with colleagues was great fun. We ...,38
enjoy_the_moment,6508,6230,Happiness is a mental or emotional state of we...,24
exercise,729,684,I WENT TO YOGA,14
leisure,4242,3870,I WENT TO MOVIE,76
nature,1127,1121,Lyling in the bed listening to the rain outsid...,2


**- `achievement`, `excercise` and `leisure` have short sentences hence `length of words` can be a good feature** 

In [17]:
#train_df['length_of_words'] = train_df['cleaned_hm'].apply(len) 
#test_df['length_of_words'] = test_df['cleaned_hm'].apply(len) 

### <center> 3) Text-Processing

In [18]:
# creating text pre-process
def text_pre_process(text):
    """
        1.remove punctuation
        2.remove stop words
        3.lower case all words
        return list of clean text words
    """
    nonpunc = ''.join([c for c in text if c not in string.punctuation])
    
    return [word.lower() for word in nonpunc.split() if word not in stopwords.words('english')]

In [19]:
# using count vectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [20]:
# use tf-idf transformer for term frequency and inverse-documnent frequency product
from sklearn.feature_extraction.text import TfidfTransformer

In [21]:
from sklearn.naive_bayes import MultinomialNB

### <center>4) data pipeline

In [40]:
from sklearn.pipeline import Pipeline

In [41]:
text_pipeline = Pipeline([
                ('bow', CountVectorizer(analyzer=text_pre_process)),
                ('tf-idf', TfidfTransformer()),
                ('model', MultinomialNB())
])

In [57]:
text_pipeline

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_pre_process at 0x7f771421b7a0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tf-idf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [25]:
# train_test_spilt
from sklearn.model_selection import train_test_split

In [26]:
x_train, x_test, y_train, y_test = train_test_split(train_df.iloc[:,:], train_df.iloc[:,-1], 
                                                    test_size=0.3,
                                                    shuffle=True, 
                                                    random_state=0)

In [29]:
print(f'train_set : {len(x_train)} \ntest_set : {len(x_test)}')

train_set : 42224 
test_set : 18097


In [28]:
text_pipeline.fit(x_train['cleaned_hm'], y_train)

NameError: name 'text_pipeline' is not defined

In [62]:
predictions = text_pipeline.predict(x_test['cleaned_hm'])

In [60]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))


NameError: name 'predictions' is not defined

In [99]:
test_results = text_pipeline.predict(test_df['cleaned_hm'])

In [100]:
predict_dataframe = pd.DataFrame(data = test_df['hmid'])

In [101]:
predict_dataframe['predicted_category'] = test_results

In [103]:
predict_dataframe.to_csv('predictions.csv', index=False)

### <center>5) Using nltk.lemmatization then doing prediction

In [107]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

In [112]:
# creating text pre-process
def text_pre_process_lemmatize(text):
    """
        1.remove punctuation
        2.remove stop words
        3.lower case all words
        return list of clean text words
    """
    nonpunc = ''.join([c for c in text if c not in string.punctuation])
    
    return [lemmatizer.lemmatize(word.lower()) for word in nonpunc.split() if word not in stopwords.words('english')]

In [113]:
text_lemmatize_pipeline = Pipeline([
                ('bow', CountVectorizer(analyzer=text_pre_process_lemmatize)),
                ('tf-idf', TfidfTransformer()),
                ('model', MultinomialNB())
])

In [115]:
text_lemmatize_pipeline.fit(x_train['cleaned_hm'], y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_pre_process_lemmatize at 0x7f78a64050e0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tf-idf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [116]:
predictions_lemma = text_lemmatize_pipeline.predict(x_test['cleaned_hm'])

In [117]:
print(classification_report(y_test, predictions_lemma))

                  precision    recall  f1-score   support

     achievement       0.76      0.88      0.82      6034
       affection       0.61      0.94      0.74      6256
         bonding       0.98      0.29      0.45      1975
enjoy_the_moment       0.83      0.23      0.36      1961
        exercise       1.00      0.01      0.03       211
         leisure       0.96      0.24      0.39      1307
          nature       1.00      0.02      0.04       353

        accuracy                           0.69     18097
       macro avg       0.88      0.38      0.40     18097
    weighted avg       0.76      0.69      0.65     18097



In [118]:
test_results = text_lemmatize_pipeline.predict(test_df['cleaned_hm'])

In [119]:
predict_lemma_dataframe = pd.DataFrame(data = test_df['hmid'])

In [120]:
predict_lemma_dataframe['predicted_category'] = test_results

In [121]:
predict_lemma_dataframe.to_csv('predictions_lemma.csv', index=False)

### <center> use logistic regression

In [43]:
from sklearn.linear_model import LogisticRegression

In [44]:
logistic_model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinominal', ) 

In [67]:
from sklearn.preprocessing import LabelEncoder

In [68]:
label_encoder = LabelEncoder()

In [69]:
y_train_label = LabelEncoder().fit_transform(y_train)

In [103]:
text_logit_pipeline = Pipeline([
                                ("bow",CountVectorizer(analyzer=text_pre_process)),
                                ("tfidf", TfidfTransformer()),
                                ("model", LogisticRegression(random_state=0, solver='lbfgs', 
                                                             multi_class='multinomial', max_iter=5000))
                                ])

In [104]:
text_logit_pipeline.fit(x_train['cleaned_hm'], y_train_label)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_pre_process at 0x7f53b4c4eef0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w...
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                 

In [105]:
logit_predictions = text_logit_pipeline.predict(x_test['cleaned_hm'])

In [106]:
print(classification_report(label_encoder.fit_transform(y_test), logit_predictions))

              precision    recall  f1-score   support

           0       0.85      0.95      0.90      6034
           1       0.93      0.93      0.93      6256
           2       0.96      0.91      0.93      1975
           3       0.81      0.69      0.75      1961
           4       0.94      0.68      0.79       211
           5       0.87      0.75      0.80      1307
           6       0.92      0.71      0.80       353

    accuracy                           0.89     18097
   macro avg       0.90      0.80      0.84     18097
weighted avg       0.89      0.89      0.89     18097



In [107]:
test_results = text_logit_pipeline.predict(test_df['cleaned_hm'])

In [108]:
predictions_logit_text = label_encoder.inverse_transform(test_results)

In [112]:
predict_logit_dataframe = pd.DataFrame(data = test_df['hmid'])

In [113]:
predict_logit_dataframe['predicted_category'] = predictions_logit_text

In [114]:
predict_logit_dataframe.to_csv('predictions_logit1.csv', index=False)

Unnamed: 0,hmid,predicted_category
0,88305,bonding
1,88306,achievement
2,88307,affection
3,88308,bonding
4,88309,affection
...,...,...
40208,128762,affection
40209,128763,enjoy_the_moment
40210,128764,affection
40211,128765,achievement


### <center> Support vector machines

In [127]:
from sklearn.svm import LinearSVC

In [128]:
svm = LinearSVC()

In [129]:
text_svc_pipeline = Pipeline([
                                ("bow",CountVectorizer(analyzer=text_pre_process)),
                                ("tfidf", TfidfTransformer()),
                                ("model", LinearSVC(random_state=0, tol=1e-5))
                                ])

In [130]:
text_svc_pipeline.fit(x_train['cleaned_hm'], y_train_label)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_pre_process at 0x7f53b4c4eef0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('model',
                 LinearSVC(C=1.0, class_weight=None, dual=True,
                           fit_intercept=True, in

In [131]:
svm_predictions = text_svc_pipeline.predict(x_test['cleaned_hm'])

In [132]:
print(classification_report(label_encoder.fit_transform(y_test), svm_predictions))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91      6034
           1       0.94      0.94      0.94      6256
           2       0.94      0.94      0.94      1975
           3       0.81      0.74      0.78      1961
           4       0.85      0.85      0.85       211
           5       0.85      0.80      0.82      1307
           6       0.86      0.83      0.84       353

    accuracy                           0.90     18097
   macro avg       0.88      0.86      0.87     18097
weighted avg       0.90      0.90      0.90     18097



In [133]:
predict_svm_dataframe = pd.DataFrame(data = test_df['hmid'])

In [134]:
predictions_svm = text_svc_pipeline.predict(test_df['cleaned_hm'])

In [136]:
prediction_svm_text = label_encoder.inverse_transform(predictions_svm)

In [138]:
predict_svm_dataframe['predicted_category'] = prediction_svm_text

In [139]:
predict_svm_dataframe.to_csv('predicitions_svm.csv', index=False)

### <center> Random Forest

In [151]:
from sklearn.ensemble import RandomForestClassifier

In [152]:
rf_model = RandomForestClassifier(n_estimators=100,random_state=0)

In [153]:
text_random_pipeline = Pipeline([
                                ("bow",CountVectorizer(analyzer=text_pre_process)),
                                ("tfidf", TfidfTransformer()),
                                ("model", RandomForestClassifier(n_estimators=100,random_state=0))
                                ])

In [154]:
text_random_pipeline.fit(x_train['cleaned_hm'], y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_pre_process at 0x7f53b4c4eef0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                              

In [155]:
predictions_random = text_random_pipeline.predict(x_test['cleaned_hm'])

In [156]:
print(classification_report(y_test,predictions_random))

                  precision    recall  f1-score   support

     achievement       0.78      0.91      0.84      6034
       affection       0.87      0.93      0.90      6256
         bonding       0.95      0.88      0.91      1975
enjoy_the_moment       0.79      0.47      0.59      1961
        exercise       0.87      0.64      0.74       211
         leisure       0.79      0.64      0.71      1307
          nature       0.85      0.55      0.67       353

        accuracy                           0.83     18097
       macro avg       0.84      0.72      0.76     18097
    weighted avg       0.83      0.83      0.83     18097



### <center> XGBoost

In [158]:
import xgboost

In [159]:
xgb = xgboost.XGBClassifier()

In [160]:
text_xgb_pipeline = Pipeline([
                                ("bow",CountVectorizer(analyzer=text_pre_process)),
                                ("tfidf", TfidfTransformer()),
                                ("model", xgboost.XGBClassifier())
                                ])

In [162]:
text_xgb_pipeline.fit(x_train['cleaned_hm'], y_train)

Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer=<function text_pre_process at 0x7f53b4c4eef0>,
                                 binary=False, decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w...
                 XGBClassifier(base_score=0.5, booster='gbtree',
                               colsample_bylevel=1, colsample_bynode=1,
                               colsample_bytree=1, gamma=0, learning_rate=0.1,
                               max_delta_step=0, max_depth=3,
                               min_child_weight=1, missing=None,
                         

In [163]:
predictions_xgb = text_xgb_pipeline.predict(x_test['cleaned_hm'])

In [165]:
print(classification_report(y_test, predictions_xgb))

                  precision    recall  f1-score   support

     achievement       0.63      0.92      0.75      6034
       affection       0.89      0.79      0.84      6256
         bonding       0.92      0.90      0.91      1975
enjoy_the_moment       0.78      0.24      0.37      1961
        exercise       0.84      0.64      0.73       211
         leisure       0.83      0.46      0.59      1307
          nature       0.89      0.51      0.65       353

        accuracy                           0.76     18097
       macro avg       0.83      0.64      0.69     18097
    weighted avg       0.79      0.76      0.74     18097



**clearly svm out-porformed all models in text processing s we are doing hyperparamter tunning for svm**

### <center> Hyperparameter tunning svm

In [208]:
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score, make_scorer
f_score = make_scorer(f1_score)

In [209]:
text_svc_pipeline = Pipeline([
                                ("bow",CountVectorizer(analyzer=text_pre_process)),
                                ("tfidf", TfidfTransformer()),
                                ("model", LinearSVC(random_state=0, tol=1e-5, ))
                                ])

In [218]:
from sklearn.model_selection import GridSearchCV

In [220]:
params_grid = {"model__C": range(2, 10, 2)}

In [222]:
grid_search_cv = GridSearchCV(estimator=text_svc_pipeline, param_grid=params_grid, n_jobs=4, scoring=f_score)