In [61]:
import os
import csv
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient

In [62]:
# Azure AI Language connection settings.
# The subscription key is a secret, so it is read from the environment instead
# of being stored in the notebook. NOTE(review): the key that used to be
# hardcoded here has been exposed and should be rotated in the Azure portal.
endpoint = os.environ.get(
    "LANGUAGE_ENDPOINT",
    "https://paulberindeiemihai.cognitiveservices.azure.com/",
)
key = os.environ.get("LANGUAGE_KEY")
if not key:
    raise RuntimeError(
        "Set the LANGUAGE_KEY environment variable to your Azure AI Language key "
        "before running this notebook."
    )

# Client used by the sentiment-analysis cell below.
client = TextAnalyticsClient(endpoint=endpoint, credential=AzureKeyCredential(key))

In [64]:
# Load the labelled reviews from ./data/reviews_mixed.csv.
# The first CSV row holds the column names; every following row is read as
# [text, label].
crtDir = os.getcwd()
fileName = os.path.join(crtDir, 'data', 'reviews_mixed.csv')

# newline='' is what the csv module asks for, so that line breaks embedded in
# quoted fields are handled by the reader rather than by the file object.
with open(fileName, newline='') as csv_file:
    csv_reader = csv.reader(csv_file, delimiter=',')
    # Header row (empty list if the file itself is empty).
    dataNames = next(csv_reader, [])
    # Blank lines come back from csv.reader as [] and would break the column
    # indexing below, so keep only rows that have both a text and a label.
    data = [row for row in csv_reader if len(row) >= 2]

# Work on the first 100 samples only.
inputs = [row[0] for row in data[:100]]
outputs = [row[1] for row in data[:100]]
# sorted() keeps the label order stable between kernel restarts
# (iteration order of a set of strings is not).
labelNames = sorted(set(outputs))

print(inputs[:2])
print(labelNames[:2])

['The rooms are extremely small, practically only a bed.', 'Room safe did not work.']
['negative', 'positive']


In [30]:
# Split inputs into batches of 10
# (presumably to stay within the service's per-request document limit — confirm
# against the Azure AI Language limits for sentiment analysis).
batch_size = 10
batches = [inputs[i:i + batch_size] for i in range(0, len(inputs), batch_size)]

# Process each batch separately.
# Results are paired with the text they were computed for *before* failed
# documents are dropped; otherwise a single error would shift every later
# text/sentiment pair when printing.
docs = []
doc_texts = []
for batch in batches:
    result = client.analyze_sentiment(batch, show_opinion_mining=True)
    for text, doc in zip(batch, result):
        if doc.is_error:
            continue
        doc_texts.append(text)
        docs.append(doc)

# Visualize the sentiment for each successfully analysed document
print("Let's visualize the sentiment of each of these documents")
for text, doc in zip(doc_texts, docs):
    print(f"Document text: {text}")
    print(f"Overall sentiment: {doc.sentiment}")

Let's visualize the sentiment of each of these documents
Document text: The rooms are extremely small, practically only a bed.
Overall sentiment: negative
Document text: Room safe did not work.
Overall sentiment: negative
Document text: Mattress very comfortable.
Overall sentiment: positive
Document text: Very uncomfortable, thin mattress, with plastic cover that rustles every time you move.
Overall sentiment: negative
Document text: No bathroom in room
Overall sentiment: negative
Document text: The bed was soooo comfy.
Overall sentiment: positive
Document text: someone must have been smoking in the room next door.
Overall sentiment: neutral
Document text: The bed is very comfortable.
Overall sentiment: positive
Document text: Very spacious rooms, quiet and very comfortable.
Overall sentiment: positive
Document text: For 3 people in a bedroom the sofa bed is a bit unconfortable.
Overall sentiment: negative
Document text: Lights in the common room were too dim.
Overall sentiment: negati

Împărțire date (antrenament și test)

In [65]:
# prepare data for training and testing: random 80% / 20% split of the samples

import numpy as np

# Fixed seed so the split is the same on every run.
np.random.seed(5)
noSamples = len(inputs)
indexes = list(range(noSamples))
# Draw 80% of the indexes, without repetition, for training.
trainSample = np.random.choice(indexes, int(0.8 * noSamples), replace=False)
# Set membership is O(1); testing `i in trainSample` on the NumPy array would
# rescan the whole array for every index.
trainSampleSet = set(trainSample.tolist())
testSample = [i for i in indexes if i not in trainSampleSet]

trainInputs = [inputs[i] for i in trainSample]
trainOutputs = [outputs[i] for i in trainSample]
testInputs = [inputs[i] for i in testSample]
testOutputs = [outputs[i] for i in testSample]

print(trainInputs[:3])

['Just to give you an idea: the shutters of the windows were not working, did not go neither up or down - just hanging down only one side and the other up....', 'and hip and CLEAN!', "Toilet paper wasn't replaced everyday!"]


Extragere caracteristici


In [66]:
# extract some features from the raw text

# representation 1: Bag of Words (one count per vocabulary word per document)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()

# The vocabulary is learnt from the training texts only; the test texts are
# encoded with that same vocabulary.
trainFeatures = vectorizer.fit_transform(trainInputs)
testFeatures = vectorizer.transform(testInputs)

# vocabulary size
print("vocab size: ", len(vectorizer.vocabulary_),  " words")
# number of reviews (samples) in the training set
print("traindata size: ", len(trainInputs), " reviews")
# shape of feature matrix
print("trainFeatures shape: ", trainFeatures.shape)

# vocabulary from the train data
print('some words of the vocab: ', vectorizer.get_feature_names_out()[-20:])
# extracted features: slice the sparse matrix first so only the 3 displayed
# rows are converted to a dense array
print('some features: ', trainFeatures[:3].toarray())

vocab size:  341  words
traindata size:  80  emails
trainFeatures shape:  (80, 341)
some words of the vocab:  ['was' 'wasn' 'water' 'we' 'wear' 'well' 'were' 'wet' 'which' 'whole'
 'window' 'windows' 'winter' 'with' 'work' 'working' 'workout' 'would'
 'you' 'your']
some features:  [[0 0 0 ... 0 1 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [70]:
# representation 2: tf-idf features - word granularity
# NOTE: this cell rebinds `vectorizer`, `trainFeatures` and `testFeatures`,
# replacing the bag-of-words versions created earlier.
from sklearn.feature_extraction.text import TfidfVectorizer
# max_features=50 caps the vocabulary at 50 terms
vectorizer = TfidfVectorizer(max_features=50)

# Fit on the training texts only, then encode the test texts with the same vocabulary.
trainFeatures = vectorizer.fit_transform(trainInputs)
testFeatures = vectorizer.transform(testInputs)

# vocabulary from the train data
print('vocab: ', vectorizer.get_feature_names_out()[:10])
# extracted features: slice the sparse matrix first so only the 3 displayed
# rows are converted to a dense array
print('features: ', trainFeatures[:3].toarray())

vocab:  ['all' 'and' 'are' 'at' 'bathroom' 'bed' 'bit' 'clean' 'cold'
 'comfortable']
features:  [[0.         0.14203772 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.4980966  0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.33735761 0.19218537 0.19992677 0.19992677 0.         0.
  0.         0.         0.         0.         0.         0.
  0.31496244 0.         0.17372323 0.         0.         0.43886705
  0.         0.         0.         0.19218537 0.23237012 0.
  0.21943352 0.19992677]
 [0.         0.81777684 0.         0.         0.         0.
  0.         0.57553543 0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.    

In [68]:
# representation 3: dense word embeddings taken from a pre-trained word2vec model

import gensim

# The GoogleNews binary is expected under ./models, relative to the working directory
crtDir = os.getcwd()
modelPath = os.path.join(crtDir, 'models', 'GoogleNews-vectors-negative300.bin')

# Read the pre-trained vectors from the binary word2vec file
word2vecModel300 = gensim.models.KeyedVectors.load_word2vec_format(
    modelPath,
    binary=True,
)

# Quick sanity checks: nearest neighbours of 'support', then the raw vector for 'house'
supportNeighbours = word2vecModel300.most_similar('support')
print(supportNeighbours)
houseVector = word2vecModel300["house"]
print("vec for house: ", houseVector)

ModuleNotFoundError: No module named 'gensim'

In [69]:
# Look up the embedding of a single word, guarding against words that are not
# in the model's vocabulary.
word = "comfy"
# key_to_index is a dict, so this is a hash lookup; testing membership in the
# index_to_key list would scan the whole vocabulary.
if word in word2vecModel300.key_to_index:
    print(f"vec for {word}: ", word2vecModel300[word])
else:
    print("word was not found!")

NameError: name 'word2vecModel300' is not defined

In [55]:
# Named Entity Recognition (NER)
import spacy

# Load the spaCy pipeline 'en_core_web_sm' into the module-level name `nlp`,
# which the feature-extraction functions below call via nlp.pipe(...).
# The model must be downloaded first: python -m spacy download en_core_web_sm
nlp = spacy.load('en_core_web_sm')

# Named-entity extraction helper
def extract_named_entities(texts):
    """Return, for each text, the list of entity strings found in it.

    Args:
        texts: iterable of raw text strings, passed to the module-level
            ``nlp`` pipeline.

    Returns:
        A list with one entry per input text, in input order; each entry is
        the list of ``ent.text`` values of that document (empty when the
        document has no entities).
    """
    # The pipeline is run with the "parser" component disabled.
    parsed_docs = nlp.pipe(texts, disable=["parser"])
    return [[entity.text for entity in parsed.ents] for parsed in parsed_docs]

# Entities for every training document
ner_features = extract_named_entities(trainInputs)

# Report them one document per line, numbering documents from 1
for doc_number, found_entities in enumerate(ner_features, start=1):
    print(f"Document {doc_number} named entities: {found_entities}")

Document 1 named entities: ['only one']
Document 2 named entities: ['CLEAN']
Document 3 named entities: ['Toilet']
Document 4 named entities: []
Document 5 named entities: []
Document 6 named entities: []
Document 7 named entities: []
Document 8 named entities: []
Document 9 named entities: []
Document 10 named entities: ['4']
Document 11 named entities: []
Document 12 named entities: ['the day']
Document 13 named entities: []
Document 14 named entities: []
Document 15 named entities: []
Document 16 named entities: []
Document 17 named entities: ['Lift']
Document 18 named entities: []
Document 19 named entities: ['a minute']
Document 20 named entities: []
Document 21 named entities: []
Document 22 named entities: ['Stiff']
Document 23 named entities: []
Document 24 named entities: ['the A/C']
Document 25 named entities: ['free coffee & tea']
Document 26 named entities: []
Document 27 named entities: ['winter']
Document 28 named entities: []
Document 29 named entities: []
Document 30 na

In [56]:
# Part-of-Speech (POS) Tagging
def extract_pos_tags(texts):
    """Return, for each text, its tokens paired with their part-of-speech tag.

    Args:
        texts: iterable of raw text strings, passed to the module-level
            ``nlp`` pipeline.

    Returns:
        A list with one entry per input text, in input order; each entry is a
        list of ``(token.text, token.pos_)`` tuples for that document.
    """
    # The pipeline is run with the "parser" and "ner" components disabled.
    parsed_docs = nlp.pipe(texts, disable=["parser", "ner"])
    return [[(tok.text, tok.pos_) for tok in parsed] for parsed in parsed_docs]

# (token, POS tag) pairs for every training document
pos_features = extract_pos_tags(trainInputs)

# Report them one document per line, numbering documents from 1
for doc_number, tagged_tokens in enumerate(pos_features, start=1):
    print(f"Document {doc_number} POS tags: {tagged_tokens}")

Document 1 POS tags: [('Just', 'ADV'), ('to', 'PART'), ('give', 'VERB'), ('you', 'PRON'), ('an', 'DET'), ('idea', 'NOUN'), (':', 'PUNCT'), ('the', 'DET'), ('shutters', 'NOUN'), ('of', 'ADP'), ('the', 'DET'), ('windows', 'NOUN'), ('were', 'AUX'), ('not', 'PART'), ('working', 'VERB'), (',', 'PUNCT'), ('did', 'AUX'), ('not', 'PART'), ('go', 'VERB'), ('neither', 'CCONJ'), ('up', 'ADP'), ('or', 'CCONJ'), ('down', 'ADV'), ('-', 'PUNCT'), ('just', 'ADV'), ('hanging', 'VERB'), ('down', 'ADV'), ('only', 'ADV'), ('one', 'NUM'), ('side', 'NOUN'), ('and', 'CCONJ'), ('the', 'DET'), ('other', 'ADJ'), ('up', 'ADV'), ('....', 'PUNCT')]
Document 2 POS tags: [('and', 'CCONJ'), ('hip', 'NOUN'), ('and', 'CCONJ'), ('CLEAN', 'PROPN'), ('!', 'PUNCT')]
Document 3 POS tags: [('Toilet', 'NOUN'), ('paper', 'NOUN'), ('was', 'AUX'), ("n't", 'PART'), ('replaced', 'VERB'), ('everyday', 'ADV'), ('!', 'PUNCT')]
Document 4 POS tags: [('The', 'DET'), ('water', 'NOUN'), ('pressure', 'NOUN'), ('was', 'AUX'), ('not', 'PART