In [1]:
from azure.ai.textanalytics import TextAnalyticsClient
from azure.core.credentials import AzureKeyCredential
import pandas as pd

In [2]:
key = open("./access_key/cognitive_api_key.txt").read()
endpoint = open("./access_url/cognitive_api_url.txt").read()

In [3]:
def authenticate_client():
    ta_credential = AzureKeyCredential(key)
    text_analytics_client = TextAnalyticsClient(
            endpoint=endpoint, 
            credential=ta_credential)
    return text_analytics_client

client = authenticate_client()

In [7]:
def sentiment_analysis_example(client):

    documents = ["I had the best day of my life. I wish you were there with me."]
    response = client.analyze_sentiment(documents=documents)[0]
    print("Document Sentiment: {}".format(response.sentiment))
    print("Overall scores: positive={0:.2f}; neutral={1:.2f}; negative={2:.2f} \n".format(
        response.confidence_scores.positive,
        response.confidence_scores.neutral,
        response.confidence_scores.negative,
    ))
    for idx, sentence in enumerate(response.sentences):
        print("Sentence: {}".format(sentence.text))
        print("Sentence {} sentiment: {}".format(idx+1, sentence.sentiment))
        print("Sentence score:\nPositive={0:.2f}\nNeutral={1:.2f}\nNegative={2:.2f}\n".format(
            sentence.confidence_scores.positive,
            sentence.confidence_scores.neutral,
            sentence.confidence_scores.negative,
        ))
          
sentiment_analysis_example(client)

Document Sentiment: positive
Overall scores: positive=1.00; neutral=0.00; negative=0.00 

Sentence: I had the best day of my life.
Sentence 1 sentiment: positive
Sentence score:
Positive=1.00
Neutral=0.00
Negative=0.00

Sentence: I wish you were there with me.
Sentence 2 sentiment: neutral
Sentence score:
Positive=0.21
Neutral=0.77
Negative=0.02



In [8]:
def send_request(document):
    if len(document) > 10:
        all_doc_send=[]
        response=[]
        range_10 = [i for i in range(0,len(document)+1,10)]
        for i in range(1,len(range_10)):
            all_doc_send.append(document[range_10[i - 1 ]:range_10[i]])
        for doc in all_doc_send:
            response.extend(client.analyze_sentiment(documents=doc))
    else:
        response = client.analyze_sentiment(documents=document)
    return response

def format_response(response):
    for resp in response:
        print("Document Sentiment: {}".format(resp.sentiment))
        print("Overall scores: positive={0:.2f}; neutral={1:.2f}; negative={2:.2f} \n".format(
            resp.confidence_scores.positive,
            resp.confidence_scores.neutral,
            resp.confidence_scores.negative,
        ))
        
def get_pred(resp,get_neutral=False):
    positive = resp.confidence_scores.positive
    neutral = resp.confidence_scores.neutral
    negative = resp.confidence_scores.negative
    if get_neutral :
        if (positive > neutral) & (positive > negative):
            return "good"
        elif (negative > neutral) & (negative > positive):
            return "bad"
        else:
            return "neutral"
    else:
        if positive > negative:
            return "good"
        else:
            return "bad"
    
def get_response_prediction(response,get_neutral=False):
    dic={
        "good_score":[],
        "neutral_score":[],
        "bad_score":[],
        "prediction":[],
    }
    for resp in response:
        dic["good_score"].append(resp.confidence_scores.positive)
        dic["neutral_score"].append(resp.confidence_scores.neutral)
        dic["bad_score"].append(resp.confidence_scores.negative)
        dic["prediction"].append(get_pred(resp,get_neutral=get_neutral))
    return dic

# Benchmark azure cognitive

In [9]:
test_sample = pd.read_csv("./data/sample_test_dataset.csv",index_col=0)
print("Dimensionnalité :",test_sample.shape)
test_sample.head()

Dimensionnalité : (4000, 2)


Unnamed: 0,text,sentiment
0,Chillin @ the beach with my girl brit-brit wat...,good
1,@indraherlambang really wanna say that ure lik...,good
2,"@ambermac Gotta love Brad Pitt's performance, ...",good
3,Just realized that Matthew Sweet &amp; Susanna...,good
4,"@marksphone sorry, meant that to come from my ...",good


In [10]:
test_sample["sentiment"].value_counts()

good    2000
bad     2000
Name: sentiment, dtype: int64

## Test avec 10 tweet

In [11]:
document_send = test_sample.iloc[:10]["text"].tolist()
y_true = test_sample.iloc[:10]["sentiment"].tolist()
for sentence in document_send:
    print("-",sentence)
    print()

- Chillin @ the beach with my girl brit-brit watchin the laker game n finally enjoyin this cali weather 

- @indraherlambang really wanna say that ure like Ryan Seacrest (Indo version). It's a compliment. Hope u'll be as succes as him! 

- @ambermac Gotta love Brad Pitt's performance, too. Subtitles or not. 

- Just realized that Matthew Sweet &amp; Susanna Hoffs have a second volume of &quot;Under the Covers&quot; streeting this summer.  Yay!

- @marksphone sorry, meant that to come from my personal acct! people first 

- ftw. displacement. instantaneous. What the fuck is terminal velocity ? 

- is chillin in bed watching dog the bounty hunter! 

- Mmmmmmm having my mom's pho right now 

- Getting sushi with @nicurnmama 

- @ajsouthern DELICIOUS!!  Thanks! 



In [12]:
format_response(send_request(document_send))

Document Sentiment: positive
Overall scores: positive=0.95; neutral=0.05; negative=0.00 

Document Sentiment: positive
Overall scores: positive=0.97; neutral=0.03; negative=0.00 

Document Sentiment: mixed
Overall scores: positive=0.50; neutral=0.06; negative=0.44 

Document Sentiment: positive
Overall scores: positive=0.80; neutral=0.18; negative=0.02 

Document Sentiment: negative
Overall scores: positive=0.00; neutral=0.00; negative=1.00 

Document Sentiment: mixed
Overall scores: positive=0.49; neutral=0.01; negative=0.50 

Document Sentiment: neutral
Overall scores: positive=0.33; neutral=0.66; negative=0.01 

Document Sentiment: positive
Overall scores: positive=0.68; neutral=0.27; negative=0.05 

Document Sentiment: positive
Overall scores: positive=0.70; neutral=0.25; negative=0.05 

Document Sentiment: positive
Overall scores: positive=0.99; neutral=0.01; negative=0.00 



In [13]:
dic_response = get_response_prediction(send_request(document_send))
print(dic_response["prediction"])

['good', 'good', 'good', 'good', 'bad', 'bad', 'good', 'good', 'good', 'good']


In [15]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report,roc_auc_score
def benchmark(dic_response,y_true):
    y_pred = dic_response["prediction"]
    print("accuracy score :",accuracy_score(y_true,y_pred))
    print(classification_report(y_true,y_pred))
    return pd.DataFrame(confusion_matrix(y_true,y_pred),columns=["good","bad"],index=["good","bad"])

In [16]:
y_pred = dic_response["prediction"]

In [17]:
print("accuracy score :",accuracy_score(y_true,y_pred))
# print(classification_report(y_true,y_pred))
# pd.DataFrame(confusion_matrix(y_true,y_pred),columns=["good","bad"],index=["good","bad"])

accuracy score : 0.8


## tous les tweets

Le coût de ce service est de 0.87€ pour 1000 requêtes, avec ce test nous somme à 4000 requêtes sois un total de 3.48€

In [18]:
document_send = test_sample["text"].tolist()
y_true = test_sample["sentiment"].tolist()

In [19]:
dic_response = get_response_prediction(send_request(document_send))

In [20]:
benchmark(dic_response,y_true)

accuracy score : 0.70975
              precision    recall  f1-score   support

         bad       0.74      0.64      0.69      2000
        good       0.68      0.78      0.73      2000

    accuracy                           0.71      4000
   macro avg       0.71      0.71      0.71      4000
weighted avg       0.71      0.71      0.71      4000



Unnamed: 0,good,bad
good,1276,724
bad,437,1563


In [24]:
dic = {"bad":0,"good":1}
y_test = [dic[y] for y in y_true]

In [27]:
print("AUROC score :",roc_auc_score(y_test, dic_response["good_score"]))

AUROC score : 0.743618625


Se modèle est entrainer principalement sur un échantillon large et non spécialisé, la détéction de sentiment des tweets ici est assez médiocre avec une accuracy de seulement 0.7 et une AUROC de seulement 0.74 ( calcul de l'auroc car le service cognitive mesure le degré de sentiment positif, négatif et neutre, la valeur AUROC ici prend plus de sens )