## Before moving on to complex deep learning methods for building our classifier, I am going to start with simple machine learning classifiers (such as Decision Tree, Random Forest and Logistic Regression) built on a bag-of-words model.

## The idea here is to establish a baseline model that we can then build on with more complex, time-consuming deep learning models (LSTMs, transformers like BERT).

In [119]:
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
import re
import sys
import warnings

import sys; sys.path.insert(0, '..') # helps importing our modules
from src import text_ops

from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, log_loss

from skmultilearn.adapt import MLkNN
from scipy.sparse import csr_matrix, lil_matrix

from textblob import TextBlob
from skmultilearn.problem_transform import LabelPowerset

from sklearn.linear_model import LogisticRegression


In [2]:
# Absolute, machine-specific location of the labelled training CSV.
train_csv_path = '/Users/virajdatt/Desktop/github/public/Machine-Hack-/uHack-Sentiments/data/train.csv'
# Load the training reviews and their label columns into a DataFrame.
raw_data = pd.read_csv(train_csv_path)

In [3]:
# Peek at the first few review texts to sanity-check the load.
raw_data['Review'].head()

0    For some reason everybody complains and I'm co...
1    I like everything about it, great choice of sp...
2    Excellent ceiling fan brace. Easy to install a...
3    Work great easy to use . No issues at all with...
4    I would recommend this product because it is p...
Name: Review, dtype: object

In [96]:
# Label columns of the dataset, used as the multi-label targets.
mlabels = [
    'Components',
    'Delivery and Customer Support',
    'Design and Aesthetics',
    'Dimensions',
    'Features',
    'Functionality',
    'Installation',
    'Material',
    'Price',
    'Quality',
    'Usability',
]

## Text Cleanup

In [None]:
# Normalise the review text in two passes using the project's text_ops helpers.
# Convert all text to lowercase
# (presumably returns the lower-cased 'Review' column -- confirm in src/text_ops)
raw_data['Review'] = text_ops.lowercase_column(raw_data, 'Review')
# Clean the data: clean_up is applied to each review value individually
raw_data['Review'] = raw_data['Review'].apply(text_ops.clean_up)

## Text Preprocessing (Vectorizing)
1. Split data into train and validation set.
2. Fit the TF-IDF on the train set only (avoid involving the validation set here, as that leads to data leakage)

In [22]:
# Hold out 20% of the labelled reviews for validation; the fixed seed keeps
# the shuffled split reproducible across runs.
train, test = train_test_split(
    raw_data,
    test_size=0.2,
    shuffle=True,
    random_state=69,
)

In [32]:
# Word-level TF-IDF over unigrams, bigrams and trigrams, with unicode accent
# stripping and L2 normalisation.
word_ngram_tfidf = TfidfVectorizer(
    strip_accents='unicode',
    analyzer='word',
    ngram_range=(1, 3),
    norm='l2',
)

# Wrapped in a one-step Pipeline so it can be nested inside the model pipelines.
tfidf = Pipeline(steps=[('tfidf', word_ngram_tfidf)])

## Modeling
### 1. Binary Relevance
### 2. Classifier Chains
### 3. Label Powerset

## Adapted Algorithm 

### 1. ML-KNN

In [33]:
# Binary Relevance wrapper around a Gaussian Naive Bayes base estimator.
classifier = BinaryRelevance(classifier=GaussianNB())

# Single-step pipeline holding the classifier under the step name 'GNB'.
clf = Pipeline(steps=[('GNB', classifier)])

In [34]:
# End-to-end baseline: TF-IDF features feeding the Binary Relevance classifier.
baseline_steps = [
    ('preprocess', tfidf),
    ('classifier', clf),
]
pipeline1 = Pipeline(steps=baseline_steps)

In [35]:
# Fit the baseline pipeline on the training split: review text as input,
# the mlabels columns as the multi-label target.
pipeline1.fit(train['Review'], train[mlabels])

Pipeline(steps=[('preprocess', Pipeline(steps=[('tfidf', TfidfVectorizer())])),
                ('classifier',
                 Pipeline(steps=[('GNB',
                                  BinaryRelevance(classifier=GaussianNB(),
                                                  require_dense=[True,
                                                                 True]))]))])

In [36]:
# Predict the labels for the held-out reviews with the fitted baseline.
prediction = pipeline1.predict(test['Review'])


In [37]:
# For multi-label targets accuracy_score is subset accuracy: a review counts
# as correct only when its whole label set is predicted exactly, which is why
# the figure is much harsher than a per-label accuracy would be.
subset_accuracy = accuracy_score(test[mlabels], prediction)
print("Accuracy = ", subset_accuracy)

Accuracy =  0.04478827361563518


In [108]:
# ML-kNN (adapted algorithm) with 10 neighbours, trained on every row of
# raw_data rather than on the train split only.
classifier_new = MLkNN(k=10)

vectorizer = TfidfVectorizer(strip_accents='unicode', analyzer='word', ngram_range=(1,3), norm='l2')

# fit_transform learns the vocabulary/idf weights and vectorises the reviews
# in a single pass; the fitted vectorizer is reused later on other data.
x_train = vectorizer.fit_transform(raw_data['Review'])
y_train = raw_data[mlabels]

# Densify both the feature matrix and the label matrix before fitting.
x_train = lil_matrix(x_train).toarray()
y_train = lil_matrix(y_train).toarray()

# train
classifier_new.fit(x_train, y_train)

# NOTE: no hold-out evaluation is done here. The model is fitted on all of
# raw_data, which contains the rows of the `test` split, so scoring on that
# split would be optimistic. To validate, fit on `train` only and score on
# `test` with accuracy_score / log_loss.




MLkNN()

In [109]:
# Load the reviews to be scored from test.csv (absolute, machine-specific path).
test_csv_path = '/Users/virajdatt/Desktop/github/public/Machine-Hack-/uHack-Sentiments/data/test.csv'
test_data = pd.read_csv(test_csv_path)

# Vectorise with the already-fitted TF-IDF vectorizer, then densify.
test_array = lil_matrix(vectorizer.transform(test_data['Review'])).toarray()

# Predicted label matrix for the reviews in test.csv.
final_pred = classifier_new.predict(test_array)

In [110]:
# Expand the sparse prediction matrix into a frame and store it under the
# label column names.
predicted_labels = pd.DataFrame.sparse.from_spmatrix(final_pred)
test_data[mlabels] = predicted_labels

# Sentiment polarity of each review text as scored by TextBlob.
test_data['Polarity'] = test_data['Review'].apply(
    lambda review: TextBlob(review).sentiment.polarity
)

def bin_polarity(data, threshold=0.5):
    """Binarise a polarity score.

    Args:
        data: Numeric polarity score.
        threshold: Cut-off for the positive class. Scores strictly below it
            map to 0.0; scores equal to or above it map to 1.0. Defaults to
            0.5, the value that was previously hard-coded.

    Returns:
        0.0 or 1.0, as a float.
    """
    return 0.0 if data < threshold else 1.0
# Replace the continuous polarity score with its 0.0/1.0 binarised form.
test_data['Polarity'] = test_data['Polarity'].apply(bin_polarity)
# NOTE: this mutates the shared mlabels list in place, so every later use of
# mlabels (column selection, fitting) also includes 'Polarity'.
mlabels.append('Polarity')
#test_data[mlabels] = pd.to_numeric(test_data[mlabels])

In [116]:
# Preview the label and polarity columns selected by mlabels.
test_data[mlabels]

Unnamed: 0,Components,Delivery and Customer Support,Design and Aesthetics,Dimensions,Features,Functionality,Installation,Material,Price,Quality,Usability,Polarity
0,0,0,0,0,0,0,0,0,0,0,0,0.0
1,0,0,0,0,0,1,0,0,0,0,0,1.0
2,0,0,0,0,0,0,0,0,0,0,1,0.0
3,0,0,0,0,0,1,0,0,0,0,0,0.0
4,0,0,0,0,0,1,0,0,0,0,0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
2626,0,0,0,0,0,0,1,0,0,0,0,0.0
2627,0,0,0,0,0,0,0,0,0,0,0,0.0
2628,0,0,0,0,0,0,0,0,0,1,0,1.0
2629,0,0,0,0,0,0,0,0,0,1,0,0.0


In [117]:
# Write the columns selected by mlabels to the submission CSV, without the
# DataFrame index (absolute, machine-specific output path).
submission_path = '/Users/virajdatt/Desktop/github/public/Machine-Hack-/uHack-Sentiments/submissions/baseline2.csv'
test_data[mlabels].to_csv(submission_path, index=False)

In [120]:
# Label Powerset multi-label classifier with Logistic Regression as its
# base estimator.
classifier2 = LabelPowerset(classifier=LogisticRegression())

# Same TF-IDF preprocessing step as the baseline, with the Label Powerset
# model as the final step.
label_powerset_steps = [
    ('preprocess', tfidf),
    ('classifier', classifier2),
]
pipeline2 = Pipeline(steps=label_powerset_steps)

In [121]:
# Train the Label Powerset pipeline (pipeline2). pipeline1 is the earlier
# Binary Relevance baseline; fitting it here would leave pipeline2 untrained.
pipeline2.fit(train['Review'], train[mlabels])

Pipeline(steps=[('preprocess', Pipeline(steps=[('tfidf', TfidfVectorizer())])),
                ('classifier',
                 LabelPowerset(classifier=LogisticRegression(),
                               require_dense=[True, True]))])

In [125]:
#pipeline1.score(test['Review'], test[mlabels])

In [126]:
#log_loss(pipeline1.predict(test['Review']), test[mlabels])

## Important Links
[TFIDF on train or train+test](https://stats.stackexchange.com/questions/154660/tfidfvectorizer-should-it-be-used-on-train-only-or-traintest)