# <center>  ML INTERNSHIP - HACKEREARTH CHALLENGE

### <center>  1) importing necessary libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import nltk
from nltk.corpus import stopwords
%matplotlib inline

### <center>2) importing data and analysis

In [2]:
train_df = pd.read_csv('hm_train.csv')
train_df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
0,27673,24h,I went on a successful date with someone I fel...,1,affection
1,27674,24h,I was happy when my son got 90% marks in his e...,1,affection
2,27675,24h,I went to the gym this morning and did yoga.,1,exercise
3,27676,24h,We had a serious talk with some friends of our...,2,bonding
4,27677,24h,I went with grandchildren to butterfly display...,1,affection


In [3]:
test_df = pd.read_csv('hm_test.csv')
test_df.head()

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
0,88305,3m,I spent the weekend in Chicago with my friends.,1
1,88306,3m,We moved back into our house after a remodel. ...,2
2,88307,3m,My fiance proposed to me in front of my family...,1
3,88308,3m,I ate lobster at a fancy restaurant with some ...,1
4,88309,3m,I went out to a nice restaurant on a date with...,5


In [4]:
train_df.describe(include=['object','bool','int','float']) # including object types also

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence,predicted_category
count,60321.0,60321,60321,60321.0,60321
unique,,2,58454,,7
top,,24h,I WENT TO MOVIE,,affection
freq,,30455,76,,20880
mean,57996.92951,,,1.355946,
std,17501.024854,,,1.30816,
min,27673.0,,,1.0,
25%,42845.0,,,1.0,
50%,58001.0,,,1.0,
75%,73160.0,,,1.0,


In [5]:
test_df.describe(include=['object','bool','int','float']) #including object types also

Unnamed: 0,hmid,reflection_period,cleaned_hm,num_sentence
count,40213.0,40213,40213,40213.0
unique,,2,38596,
top,,3m,I WENT TO TEMPLE,
freq,,20837,50,
mean,108539.500734,,,1.318007
std,11678.312178,,,1.280175
min,88305.0,,,1.0
25%,98433.0,,,1.0
50%,108538.0,,,1.0
75%,118655.0,,,1.0


In [6]:
train_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 60321 entries, 0 to 60320
Data columns (total 5 columns):
hmid                  60321 non-null int64
reflection_period     60321 non-null object
cleaned_hm            60321 non-null object
num_sentence          60321 non-null int64
predicted_category    60321 non-null object
dtypes: int64(2), object(3)
memory usage: 2.3+ MB


**- clearly no missing values**

In [7]:
test_df.info() 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40213 entries, 0 to 40212
Data columns (total 4 columns):
hmid                 40213 non-null int64
reflection_period    40213 non-null object
cleaned_hm           40213 non-null object
num_sentence         40213 non-null int64
dtypes: int64(2), object(2)
memory usage: 1.2+ MB


**- clearly no missing values**

In [8]:
train_df['cleaned_hm'].describe() 

count               60321
unique              58454
top       I WENT TO MOVIE
freq                   76
Name: cleaned_hm, dtype: object

**- `unique` (58,454) is smaller than `count` (60,321), so some messages are repeated**

In [9]:
train_df.groupby(['predicted_category'])['cleaned_hm'].describe() 

Unnamed: 0_level_0,count,unique,top,freq
predicted_category,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
achievement,20274,19832,I got a new job.,19
affection,20880,20384,My son gave me a big hug in the morning when I...,18
bonding,6561,6335,The offsite with colleagues was great fun. We ...,38
enjoy_the_moment,6508,6230,Happiness is a mental or emotional state of we...,24
exercise,729,684,I WENT TO YOGA,14
leisure,4242,3870,I WENT TO MOVIE,76
nature,1127,1121,The sun was shining when I woke up this morning.,2


**- the most frequent messages in `achievement`, `exercise` and `leisure` are short sentences, hence `length of words` (the length of each message) can be a good feature**

In [10]:
#train_df['length_of_words'] = train_df['cleaned_hm'].apply(len) 
#test_df['length_of_words'] = test_df['cleaned_hm'].apply(len) 

### <center> 3) Text-Processing

In [11]:
# creating text pre-process

# English stop words, loaded once and kept in a set: the analyzer below runs for
# every token of every document, so reloading the list per token is very slow.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def text_pre_process(text):
    """
    Turn one raw message into a list of cleaned, lower-cased tokens.

    Steps:
        1. remove punctuation (every character found in string.punctuation)
        2. split on whitespace and lower-case every token
        3. drop English stop words

    Tokens are lower-cased BEFORE the stop-word check, because the stop-word
    list is lower case; otherwise capitalised stop words such as "I", "The"
    or "TO" would never be removed.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        Lower-cased tokens with punctuation and stop words removed.
    """
    nonpunc = ''.join(c for c in text if c not in string.punctuation)

    lowered = (word.lower() for word in nonpunc.split())
    return [word for word in lowered if word not in _ENGLISH_STOP_WORDS]

In [12]:
# using count vectorizer
from sklearn.feature_extraction.text import CountVectorizer


In [13]:
# use the tf-idf transformer to weight counts by term frequency times inverse document frequency
from sklearn.feature_extraction.text import TfidfTransformer

In [22]:
from sklearn.naive_bayes import MultinomialNB

### <center>4) data pipeline

In [23]:
from sklearn.pipeline import Pipeline

In [24]:
# Baseline model: token counts -> tf-idf weighting -> multinomial Naive Bayes.
# text_pre_process is supplied as the vectorizer's analyzer, so it does the tokenising.
text_pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=text_pre_process)),
    ('tf-idf', TfidfTransformer()),
    ('model', MultinomialNB()),
])

In [None]:
text_pipeline

In [14]:
# train_test_split
from sklearn.model_selection import train_test_split

In [15]:
# Hold out 30% of the labelled data for validation (fixed seed for reproducibility).
# The target is selected by column name rather than by position, so that adding a
# feature column to train_df later cannot silently change what is used as the label.
# NOTE: x_train / x_test keep every column of train_df (including the label);
# the models below only read the 'cleaned_hm' column from them.
x_train, x_test, y_train, y_test = train_test_split(train_df, train_df['predicted_category'],
                                                    test_size=0.3,
                                                    shuffle=True,
                                                    random_state=0)

In [16]:
print(f'train_set : {len(x_train)} \ntest_set : {len(x_test)}')

train_set : 42224 
test_set : 18097


In [None]:
text_pipeline.fit(x_train['cleaned_hm'], y_train)

In [None]:
predictions = text_pipeline.predict(x_test['cleaned_hm'])

In [None]:
from sklearn.metrics import classification_report
print(classification_report(y_test, predictions))


In [None]:
test_results = text_pipeline.predict(test_df['cleaned_hm'])

In [None]:
predict_dataframe = pd.DataFrame(data = test_df['hmid'])

In [None]:
predict_dataframe['predicted_category'] = test_results

In [None]:
predict_dataframe.to_csv('predictions.csv', index=False)

### <center>5) Using nltk.lemmatization then doing prediction

In [None]:
from nltk.stem import WordNetLemmatizer 
lemmatizer = WordNetLemmatizer()

In [None]:
# creating text pre-process (with lemmatization)

# English stop words, loaded once and kept in a set: the analyzer below runs for
# every token of every document, so reloading the list per token is very slow.
_ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))


def text_pre_process_lemmatize(text):
    """
    Turn one raw message into a list of cleaned, lower-cased, lemmatized tokens.

    Steps:
        1. remove punctuation (every character found in string.punctuation)
        2. split on whitespace and lower-case every token
        3. drop English stop words
        4. lemmatize each remaining token with the notebook's `lemmatizer`

    Tokens are lower-cased BEFORE the stop-word check, because the stop-word
    list is lower case; otherwise capitalised stop words such as "I", "The"
    or "TO" would never be removed.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    list of str
        Lemmatized, lower-cased tokens with punctuation and stop words removed.
    """
    nonpunc = ''.join(c for c in text if c not in string.punctuation)

    lowered = (word.lower() for word in nonpunc.split())
    return [lemmatizer.lemmatize(word) for word in lowered if word not in _ENGLISH_STOP_WORDS]

In [None]:
# Same Naive Bayes pipeline as the baseline, but tokens come from the lemmatizing analyzer.
text_lemmatize_pipeline = Pipeline(steps=[
    ('bow', CountVectorizer(analyzer=text_pre_process_lemmatize)),
    ('tf-idf', TfidfTransformer()),
    ('model', MultinomialNB()),
])

In [None]:
text_lemmatize_pipeline.fit(x_train['cleaned_hm'], y_train)

In [None]:
predictions_lemma = text_lemmatize_pipeline.predict(x_test['cleaned_hm'])

In [None]:
print(classification_report(y_test, predictions_lemma))

In [None]:
test_results = text_lemmatize_pipeline.predict(test_df['cleaned_hm'])

In [None]:
predict_lemma_dataframe = pd.DataFrame(data = test_df['hmid'])

In [None]:
predict_lemma_dataframe['predicted_category'] = test_results

In [None]:
predict_lemma_dataframe.to_csv('predictions_lemma.csv', index=False)

### <center> use logistic regression

In [30]:
from sklearn.linear_model import LogisticRegression

In [31]:
# Stand-alone multinomial logistic-regression estimator.
# The multi_class value is spelled 'multinomial', matching the pipeline defined further down.
logistic_model = LogisticRegression(random_state=0, solver='lbfgs', multi_class='multinomial')

In [32]:
from sklearn.preprocessing import LabelEncoder

In [33]:
label_encoder = LabelEncoder()

In [34]:
# Encode the training labels with the shared `label_encoder` (not a throw-away instance),
# so the fitted mapping is available later to turn integer predictions back into category names.
y_train_label = label_encoder.fit_transform(y_train)

In [None]:
# Logistic-regression variant: token counts -> tf-idf weighting -> multinomial logistic regression.
text_logit_pipeline = Pipeline(steps=[
    ("bow", CountVectorizer(analyzer=text_pre_process)),
    ("tfidf", TfidfTransformer()),
    ("model", LogisticRegression(
        random_state=0,
        solver='lbfgs',
        multi_class='multinomial',
        max_iter=5000,
    )),
])

In [None]:
text_logit_pipeline.fit(x_train['cleaned_hm'], y_train_label)

In [None]:
logit_predictions = text_logit_pipeline.predict(x_test['cleaned_hm'])

In [None]:
# Encode y_test with the mapping learned from the TRAINING labels (the labels the model
# was fitted on); fitting the encoder on y_test could yield a different integer mapping
# if the hold-out split happened to miss a category.
print(classification_report(label_encoder.fit(y_train).transform(y_test), logit_predictions))

In [None]:
test_results = text_logit_pipeline.predict(test_df['cleaned_hm'])

In [None]:
predictions_logit_text = label_encoder.inverse_transform(test_results)

In [None]:
predict_logit_dataframe = pd.DataFrame(data = test_df['hmid'])

In [None]:
predict_logit_dataframe['predicted_category'] = predictions_logit_text

In [None]:
predict_logit_dataframe.to_csv('predictions_logit1.csv', index=False)

### <center> Support vector machines

In [19]:
from sklearn.svm import LinearSVC

In [None]:
svm = LinearSVC()

In [None]:
# Linear SVM variant: token counts -> tf-idf weighting -> LinearSVC.
text_svc_pipeline = Pipeline(steps=[
    ("bow", CountVectorizer(analyzer=text_pre_process)),
    ("tfidf", TfidfTransformer()),
    ("model", LinearSVC(random_state=0, tol=1e-5)),
])

In [None]:
text_svc_pipeline.fit(x_train['cleaned_hm'], y_train_label)

In [None]:
svm_predictions = text_svc_pipeline.predict(x_test['cleaned_hm'])

In [None]:
# Encode y_test with the mapping learned from the TRAINING labels (the labels the model
# was fitted on); fitting the encoder on y_test could yield a different integer mapping
# if the hold-out split happened to miss a category.
print(classification_report(label_encoder.fit(y_train).transform(y_test), svm_predictions))

In [None]:
predict_svm_dataframe = pd.DataFrame(data = test_df['hmid'])

In [None]:
predictions_svm = text_svc_pipeline.predict(test_df['cleaned_hm'])

In [None]:
prediction_svm_text = label_encoder.inverse_transform(predictions_svm)

In [None]:
predict_svm_dataframe['predicted_category'] = prediction_svm_text

In [None]:
# Write the SVM submission (hmid + predicted_category) without the index column.
# File name spelled consistently with the other 'predictions*.csv' outputs.
predict_svm_dataframe.to_csv('predictions_svm.csv', index=False)

### <center> Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
rf_model = RandomForestClassifier(n_estimators=100,random_state=0)

In [None]:
# Random-forest variant: token counts -> tf-idf weighting -> 100-tree random forest.
text_random_pipeline = Pipeline(steps=[
    ("bow", CountVectorizer(analyzer=text_pre_process)),
    ("tfidf", TfidfTransformer()),
    ("model", RandomForestClassifier(n_estimators=100, random_state=0)),
])

In [None]:
text_random_pipeline.fit(x_train['cleaned_hm'], y_train)

In [None]:
predictions_random = text_random_pipeline.predict(x_test['cleaned_hm'])

In [None]:
print(classification_report(y_test,predictions_random))

### <center> XGBoost

In [None]:
import xgboost

In [None]:
xgb = xgboost.XGBClassifier()

In [None]:
# XGBoost variant: token counts -> tf-idf weighting -> XGBClassifier with default settings.
text_xgb_pipeline = Pipeline(steps=[
    ("bow", CountVectorizer(analyzer=text_pre_process)),
    ("tfidf", TfidfTransformer()),
    ("model", xgboost.XGBClassifier()),
])

In [None]:
text_xgb_pipeline.fit(x_train['cleaned_hm'], y_train)

In [None]:
predictions_xgb = text_xgb_pipeline.predict(x_test['cleaned_hm'])

In [None]:
print(classification_report(y_test, predictions_xgb))

**Clearly SVM outperformed all the other models on this text-classification task, so we are doing hyperparameter tuning for SVM**

### <center> Hyperparameter tuning for SVM

In [56]:
from sklearn.svm import LinearSVC

In [57]:
# Fresh LinearSVC pipeline used as the base estimator for the grid search below.
text_svc_pipeline = Pipeline(steps=[
    ("bow", CountVectorizer(analyzer=text_pre_process)),
    ("tfidf", TfidfTransformer()),
    ("model", LinearSVC(random_state=0, tol=1e-5)),
])

In [58]:
from sklearn.model_selection import GridSearchCV

In [62]:
# Search space for the grid search; the "model__" prefix addresses the
# pipeline step named "model" (the LinearSVC).
params_grid = {
    "model__C": [1, 10, 100],
    "model__max_iter": [100, 200, 300],
}

In [63]:
# Exhaustive search over params_grid using the SVM pipeline; the cross-validation
# strategy is left at the library default.
grid_search_cv = GridSearchCV(
    estimator=text_svc_pipeline,
    param_grid=params_grid,
)

In [64]:
grid_search_cv.fit(x_train['cleaned_hm'], y_train_label)



GridSearchCV(cv='warn', error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('bow',
                                        CountVectorizer(analyzer=<function text_pre_process at 0x7f4143cf90e0>,
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
 