In [34]:
#mount drive
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [35]:
# !pip install sentence-transformers
# !pip install contractions
# !pip install gibberish_detector

In [36]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.metrics import confusion_matrix, plot_confusion_matrix

import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize,sent_tokenize

import contractions
from gibberish_detector import detector

import re
from tqdm import tqdm
import pkg_resources
import itertools

import torch
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader

from sentence_transformers import SentenceTransformer

model = SentenceTransformer('all-MiniLM-L6-v2')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('punkt')

Detector = detector.create_from_model('/content/drive/MyDrive/Sentiment_Analysis/gibberish-detector.model')

thresh = 0.5
labels = ['Functionality', 'User Experience', 'Performance', 'Customer Support', 'Subscription', 'Feature Request']

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [40]:
lemmatizer = WordNetLemmatizer()

#Text Cleaning
def max_seq(sent, n=2):
    sent = re.sub(' +', ' ', sent)
    clean_word_list = []
    for text in sent.split(' '):
        result = []
        for k, g in itertools.groupby(text):
            result.extend(list(g)[:n])
        result = ''.join(result)
        clean_word_list.append(result)
    return ' '.join(clean_word_list)

def clean_text(txt):
    if isinstance(txt, type(np.nan)):
        txt = ''
    txt = txt.lower()
    txt = max_seq(txt)        
    txt = contractions.fix(txt, slang=True)        
    txt = re.sub(r'[^a-zA-Z0-9 ]', ' ', txt)
    txt = [word for word in word_tokenize(txt) if not Detector.is_gibberish(word)]
    return ' '.join(txt)

In [41]:
#Classifier Structure
class Net(nn.Module):
    def __init__(self, input_size):
        super().__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_size, 16),
            nn.BatchNorm1d(16),
            nn.LeakyReLU(),
            nn.Linear(16,8),
            nn.BatchNorm1d(8),
            nn.LeakyReLU(),
            nn.Linear(8,6)
        )
        
    def forward(self, x):
        x = self.layers(x)
        return torch.sigmoid(x)

In [42]:
#loading the classifier
filepath = '/content/drive/MyDrive/Reviews_data/labelled_reviews/models/clf.pth'
net = Net(model.encode(['']).shape[1])
net.load_state_dict(torch.load(filepath))

<All keys matched successfully>

In [43]:
df = pd.read_csv('/content/drive/MyDrive/Dashboard/Data/com.cama.app.huge80sclock.csv')
corpus = df['en_content'].apply(clean_text) #clean text

#encode vectors
vectors = model.encode(corpus.tolist())
vectors = torch.tensor(vectors).float()

#predict labels
outputs = net(vectors)
outputs = (outputs>thresh).float()

In [44]:
for i, label in enumerate(labels):
    df[label] = outputs[:,i].int()

df.head()

Unnamed: 0,reviewId,content,rating,thumbsUpCount,reviewCreatedVersion,at,en_content,sr_combined_scaled,polarity score,Functionality,User Experience,Performance,Customer Support,Subscription,Feature Request
0,e0a68a14-5434-41a9-9bf7-c0d9ddebb34a,Cool,5,0,1.1,2018-08-25 19:23:16,Cool,3.182,0.3182,0,0,0,0,0,0
1,6978712c-2eba-4d8e-a2d1-fa88918b538d,Togliete la status bar del cellulare in alto....,2,0,1.2,2018-09-23 12:31:36,Remove the cell phone status bar at the top ....,0.0,0.2023,0,1,0,1,0,1
2,b5ae1ff1-e7f8-4f6e-92f6-3fed68c63140,Bella app,5,1,,2018-10-04 12:55:42,Beautiful app,5.994,0.5994,0,1,1,0,0,0
3,8c545564-0f31-4917-b8c4-20734a6e7fa5,Molto bello e carino,5,1,1.3.3,2018-10-10 18:28:10,Very nice and cute,7.474,0.7474,0,1,0,0,0,0
4,d3015bbd-fa05-4d67-b2bd-0fc725b0289e,"App accettabile, sarebbe gradito che il piccol...",3,0,1.3.3,2018-10-12 18:16:06,"Acceptable app, it would be appreciated if the...",0.0,0.8442,0,0,0,0,0,1


In [45]:
#df.to_csv('/content/drive/MyDrive/Dashboard/Data/com.cama.app.huge80sclock.predicted.csv', index=False)