In [11]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [12]:
# !pip install sentence-transformers
# !pip install contractions
# !pip install gibberish_detector

In [13]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize,sent_tokenize

import contractions
from gibberish_detector import detector

import re
from tqdm import tqdm
import pkg_resources
import itertools

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

Detector = detector.create_from_model('/content/drive/MyDrive/Sentiment Score/gibberish-detector.model')

thresh = 0.5
labels = ['Functionality', 'User Experience', 'Performance', 'Customer Support', 'Subscription', 'Feature Request']

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [14]:
lemmatizer = WordNetLemmatizer()

#Text Cleaning
def max_seq(sent, n=2):
    sent = re.sub(' +', ' ', sent)
    clean_word_list = []
    for text in sent.split(' '):
        result = []
        for k, g in itertools.groupby(text):
            result.extend(list(g)[:n])
        result = ''.join(result)
        clean_word_list.append(result)
    return ' '.join(clean_word_list)

def clean_text(txt):
    if isinstance(txt, type(np.nan)):
        txt = ''
    txt = txt.lower()
    txt = max_seq(txt)        
    txt = contractions.fix(txt, slang=True)        
    txt = re.sub(r'[^a-zA-Z0-9 ]', ' ', txt)
    txt = [word for word in word_tokenize(txt) if not Detector.is_gibberish(word)]
    return ' '.join(txt)

In [15]:
#Classifier Structure
class Net(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 16),
            nn.BatchNorm1d(16),
            nn.LeakyReLU(),
            nn.Linear(16,8),
            nn.BatchNorm1d(8),
            nn.LeakyReLU(),
            nn.Linear(8,6)
        )
        
    def forward(self, x):
        x = self.layers(x)
        return torch.sigmoid(x)

In [16]:
#loading the classifier
filepath = '/content/drive/MyDrive/Reviews_data/labelled_reviews/models/clf.pth'
net = Net(model.encode(['']).shape[1])
net.load_state_dict(torch.load(filepath))

<All keys matched successfully>

In [20]:
package = 'net.smartlogic.indiagst'

df = pd.read_csv('/content/drive/MyDrive/Sentiment Score/' + package + '.csv')
corpus = df['en_content'].apply(clean_text) #clean text

#encode vectors
vectors = model.encode(corpus.tolist())
vectors = torch.tensor(vectors).float()

#predict labels
outputs = net(vectors)
outputs = (outputs>thresh).float()

In [18]:
for i, label in enumerate(labels):
    df[label] = outputs[:,i].int()

df.head()

Unnamed: 0,reviewId,content,rating,thumbsUpCount,reviewCreatedVersion,at,en_content,sr_combined_scaled,polarity score,Functionality,User Experience,Performance,Customer Support,Subscription,Feature Request
0,3660a911-7d41-4f85-a0e1-e1a03540b8af,Never thought that such a simple tax calculato...,5,3,1.0,2016-03-25 17:36:23,Never thought that such a simple tax calculato...,3.976,0.3976,1,1,0,0,0,1
1,e518df76-fe20-4e1d-951d-28ff5a60c4b3,It is really helpful when u shop and try to ca...,5,3,1.0,2016-03-29 10:19:55,It is really helpful when u shop and try to ca...,5.413,0.5413,1,0,0,0,0,1
2,11fb3700-f078-4b1c-a5c2-3f8b31d7c7d6,"How can u bring GST calculator now, when GST i...",1,3,,2016-10-17 17:48:50,"How can u bring GST calculator now, when GST i...",0.0,0.2235,1,0,0,0,0,1
3,7c5b51d6-380a-4984-b426-4ff1b0bcdd46,The app is as simple as it can be without any ...,5,5,1.3,2016-10-23 09:13:22,The app is as simple as it can be without any ...,7.07,0.707,0,1,0,0,0,1
4,9f28ccdb-1f7c-4a57-ba39-3dcd5b4e6f97,Danyvad,5,5,1.3,2016-11-07 05:53:33,compassion,4.588,0.4588,0,0,0,0,0,0


In [21]:
# df.to_csv('/content/drive/MyDrive/Sentiment Score/' + package + '.csv', index=False)