## Load important libraries

In [1]:
import warnings
warnings.filterwarnings('ignore')

import os
import gzip
import shutil
from time import time

#pre-processing imports
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

#imports related to modeling
import numpy as np
from gensim.models import Word2Vec, KeyedVectors
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.svm import NuSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

import os
# Print the full path of every file available under the Kaggle input directory.
for root, _subdirs, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(root, name))

/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin.gz
/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin
/kaggle/input/sentiment-sentences/sentiment_sentences.txt


## Load the Pre-trained `GoogleNews-vectors-negative300` embedding model

In [2]:
# Location of the uncompressed binary word2vec file in the Kaggle input directory.
path_to_model = '/kaggle/input/googlenewsvectorsnegative300/GoogleNews-vectors-negative300.bin'

# Load the pre-trained vectors into a gensim KeyedVectors object (binary format).
# %time reports how long the load takes; it is the slowest step of the notebook.
%time w2v_model = KeyedVectors.load_word2vec_format(path_to_model, binary=True)
print('done loading Word2Vec')

CPU times: user 36.6 s, sys: 6.16 s, total: 42.8 s
Wall time: 1min 10s
done loading Word2Vec


In [3]:
# Inspect the model: size of the embedding matrix and of the vocabulary.
word2vec_vocab = w2v_model.index_to_key
# Lower-cased copy of the vocabulary, kept in the same order as the original keys.
word2vec_vocab_lower = list(map(str.lower, word2vec_vocab))

print('Shape :', w2v_model.vectors.shape)
print('Total vocabulary counts :', len(word2vec_vocab))

Shape : (3000000, 300)
Total vocabulary counts : 3000000


## Load the Sentiment sentence text data  

In [4]:
# Read the labelled sentences.
# Each line of the file holds a sentence and its integer category, separated by a tab.

data_path = '/kaggle/input/sentiment-sentences/sentiment_sentences.txt'
texts = []
category = []
# The context manager guarantees the file handle is closed, even if parsing fails.
with open(data_path) as fh:
    for line in fh:
        # Skip blank lines (e.g. a trailing newline at the end of the file).
        if not line.strip():
            continue
        # Split on the LAST tab so a tab inside the sentence does not break parsing.
        # The loop variable is called `label` so it cannot collide with the
        # `sentiment` lookup dict used elsewhere in the notebook.
        text, label = line.rsplit("\t", 1)
        texts.append(text)
        category.append(int(label))

In [5]:
# Sanity-check the loaded data: both lists must have the same length.
n_texts = len(texts)
n_labels = len(category)
print('Text size :', n_texts)
print('Sentiments size :', n_labels)
print('sentiments', set(category))

# Human-readable name for each integer category.
sentiment = {0: 'Negative', 1: 'Positive'}

sentiment

Text size : 3000
Sentiments size : 3000
sentiments {0, 1}


{0: 'Negative', 1: 'Positive'}

## Text Preprocessing

In [6]:
# Preprocess the text.
def preprocess_corpus(texts):
    """Tokenize and clean a collection of raw sentences.

    For every sentence the text is tokenized with ``word_tokenize``; tokens that
    are pure digits or made only of punctuation characters are dropped, the rest
    are lower-cased, and English stopwords are removed.

    Parameters
    ----------
    texts : iterable of str
        Raw sentences.

    Returns
    -------
    list of list of str
        One list of cleaned tokens per input sentence, in input order.
    """
    # Loop-invariant lookups are built once instead of once per sentence.
    stop_words = set(stopwords.words("english"))
    punct_chars = set(punctuation)

    tokenized = []
    for text in texts:
        words = []
        for token in word_tokenize(text):
            # Drop numbers and tokens consisting solely of punctuation.
            # Checking every character (rather than `token in punctuation`, a
            # substring test on a string) also removes multi-character tokens
            # such as '...' or '!?'.
            if token.isdigit() or all(ch in punct_chars for ch in token):
                continue
            token = token.lower()
            if token not in stop_words:
                words.append(token)
        tokenized.append(words)

    return tokenized

In [7]:
texts_processed = preprocess_corpus(texts)

# Show the first two sentences before and after preprocessing, with their label.
for idx in (0, 1):
    if idx:
        print('-'*50)   # separator between the two examples
    print('Original Texts :', texts[idx])
    print('Processed Texts :', texts_processed[idx])
    print('Sentiment :', sentiment[category[idx]])

Original Texts : So there is no way for me to plug it in here in the US unless I go by a converter.
Processed Texts : ['way', 'plug', 'us', 'unless', 'go', 'converter']
Sentiment : Negative
--------------------------------------------------
Original Texts : Good case, Excellent value.
Processed Texts : ['good', 'case', 'excellent', 'value']
Sentiment : Positive


## Embeddings and Feature vector

In [8]:
# Show the vocabulary index of each token of the first processed sentence.
# `key_to_index` maps a word to its row number in the embedding matrix (not to
# the embedding itself); words missing from the vocabulary are printed alone.
for word in texts_processed[0]:
    word = word.lower()
    # An explicit lookup with a default replaces a bare `except:`, which would
    # also have hidden unrelated errors (typos, interrupts, ...).
    vocab_index = w2v_model.key_to_index.get(word)
    if vocab_index is None:
        print(word)
    else:
        print(word,'-->',vocab_index)

way --> 132
plug --> 6735
us --> 164
unless --> 2211
go --> 152
converter --> 35011


In [9]:
# Create one feature vector per sentence by averaging its word embeddings
def features_embeddings(list_of_lists, model=None):
    """Average the word vectors of each tokenized sentence.

    Parameters
    ----------
    list_of_lists : iterable of list of str
        Tokenized sentences.
    model : optional
        Embedding lookup supporting ``token in model``, ``model[token]`` and
        ``model.vector_size``. Defaults to the notebook-level ``w2v_model``.

    Returns
    -------
    list of numpy.ndarray
        One vector of length ``model.vector_size`` per sentence, in input
        order. A sentence with no in-vocabulary token (including an empty
        sentence) gets its own all-zero vector.
    """
    if model is None:
        model = w2v_model

    # Length of every feature vector
    dimension = model.vector_size

    features = []
    for tokens in list_of_lists:
        # Running sum of the embeddings found for this sentence
        total = np.zeros(dimension)
        # Exact integer count of in-vocabulary tokens. Starting from an epsilon
        # instead would bias every average and make the empty case undetectable.
        known = 0
        for token in tokens:
            if token in model:
                total += model[token]
                known += 1

        # Divide only when something was summed; otherwise `total` is already
        # a fresh zero vector (not shared with any other sentence).
        if known:
            total /= known
        features.append(total)

    return features

In [10]:
# Turn every processed sentence into its averaged-embedding feature vector.
train_vectors = features_embeddings(texts_processed)
# Each sentence must yield exactly one vector, otherwise features and labels
# would be misaligned in the train/test split.
assert len(train_vectors) == len(texts_processed)
print(len(train_vectors))
# Shape of the last vector; [-1] avoids hard-coding the dataset size.
train_vectors[-1].shape

3000


(300,)

## Build Classification Model

In [11]:
# Hold out 20% of the sentences for testing; stratifying on the labels keeps
# the class proportions the same in both parts.
X_train, X_test, y_train, y_test = train_test_split(
    train_vectors,
    category,
    test_size=0.2,
    random_state=42,
    stratify=category,
)
tuple(map(len, (X_train, X_test, y_train, y_test)))

(2400, 600, 2400, 600)

### Support Vector Classifier

In [12]:
from sklearn.svm import SVC
SVC = SVC(kernel= 'poly', random_state=42, degree= 3) 
%time SVC.fit(X_train, y_train) # train the model(timing it with an IPython "magic command")

print("Accuracy: ", SVC.score(X_test, y_test))

y_pred = SVC.predict(X_test) 

print(classification_report(y_pred, y_test, target_names = ['Positive','Negative']))

CPU times: user 617 ms, sys: 11 ms, total: 628 ms
Wall time: 632 ms
Accuracy:  0.845
              precision    recall  f1-score   support

    Positive       0.86      0.84      0.85       307
    Negative       0.83      0.85      0.84       293

    accuracy                           0.84       600
   macro avg       0.84      0.85      0.84       600
weighted avg       0.85      0.84      0.85       600



### Nu-Support Vector Classification.

In [13]:
nuSVC = NuSVC(kernel= 'poly', random_state=453) 
%time nuSVC.fit(X_train, y_train) # train the model(timing it with an IPython "magic command")

print("Accuracy: ", nuSVC.score(X_test, y_test))

y_pred1 = nuSVC.predict(X_test) 

print(classification_report(y_pred1, y_test, target_names = ['Positive','Negative']))

CPU times: user 731 ms, sys: 7.04 ms, total: 738 ms
Wall time: 740 ms
Accuracy:  0.8433333333333334
              precision    recall  f1-score   support

    Positive       0.85      0.84      0.84       304
    Negative       0.84      0.85      0.84       296

    accuracy                           0.84       600
   macro avg       0.84      0.84      0.84       600
weighted avg       0.84      0.84      0.84       600



In [14]:
logReg = LogisticRegression(class_weight ='balanced', random_state=42) 
#The “balanced” mode uses the values of y to automatically adjust weights inversely proportional to class frequencies in the input data as 
#n_samples / (n_classes * np.bincount(y)).


%time logReg.fit(X_train, y_train) # train the model(timing it with an IPython "magic command")

print("Accuracy: ", logReg.score(X_test, y_test))

y_pred2 = logReg.predict(X_test) 

print(classification_report(y_pred2, y_test, target_names = ['Positive','Negative']))

CPU times: user 143 ms, sys: 79.4 ms, total: 222 ms
Wall time: 65.2 ms
Accuracy:  0.825
              precision    recall  f1-score   support

    Positive       0.81      0.84      0.82       291
    Negative       0.84      0.82      0.83       309

    accuracy                           0.82       600
   macro avg       0.82      0.83      0.82       600
weighted avg       0.83      0.82      0.83       600



### Best model: Support Vector Classifier (polynomial kernel), with 84.5% test accuracy