In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import numpy as np
import plotly.express as px
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

In [None]:
%%capture

from datasets import load_dataset

#always fetch a fresh copy of the data rather than reusing a previously downloaded one
mode = 'force_redownload'

#load the three pre-defined splits of the 'funpedia' configuration of 'md_gender_bias'
train_data = load_dataset('md_gender_bias', name = 'funpedia', split = 'train', download_mode = mode)
test_data = load_dataset('md_gender_bias', name = 'funpedia', split = 'test', download_mode = mode)
val_data = load_dataset('md_gender_bias', name = 'funpedia', split = 'validation', download_mode = mode)

In [None]:
#show the summary of each split (train, test, validation), one after the other
print(train_data, test_data, val_data, sep='\n')

Dataset({
    features: ['text', 'gender', 'title', 'persona'],
    num_rows: 23897
})
Dataset({
    features: ['text', 'gender', 'title', 'persona'],
    num_rows: 2938
})
Dataset({
    features: ['text', 'gender', 'title', 'persona'],
    num_rows: 2984
})


In [None]:
#turn each loaded split into a pandas DataFrame, keeping the same variable names
train_data, test_data, val_data = map(pd.DataFrame, (train_data, test_data, val_data))

In [None]:
#share of the rows held by each of the three pre-made splits, as a percentage rounded to 2 d.p.
total_nrows = sum(len(split) for split in (train_data, test_data, val_data))
print(f'percentage data that is TRAIN: {round((len(train_data) / total_nrows) * 100, 2)}')
print(f'percentage data that is TEST: {round((len(test_data) / total_nrows) * 100, 2)}')
print(f'percentage data that is VALIDATION: {round((len(val_data) / total_nrows) * 100, 2)}')

percentage data that is TRAIN: 80.14
percentage data that is TEST: 9.85
percentage data that is VALIDATION: 10.01


In [None]:
#combine pre-split datasets into one then apply train test split function twice
#pd.concat is used because DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0;
#ignore_index=True gives the combined frame a fresh 0..n-1 index
unified_data = pd.concat([train_data, test_data, val_data], ignore_index=True)
print('nrows raw unified dataset:', len(unified_data))

nrows raw unified dataset: 29819


In [None]:
#drop all columns except text and label
#take an explicit copy: the result is modified in place by later cells, and writing to a
#column selection of another frame can raise pandas' SettingWithCopyWarning
unified_data = unified_data[['text', 'gender']].copy()

In [None]:
#lowercase both columns so that duplicates can be compared regardless of case
unified_data[['text', 'gender']] = unified_data[['text', 'gender']].apply(
    lambda column: column.str.lower()
)

In [None]:
#count the rows whose 'text' repeats an earlier row
print('Number of duplicates:', int(unified_data.duplicated(subset=['text']).sum()))
#remove the repeats in place, keeping the first occurrence of each text
unified_data.drop_duplicates(subset=['text'], keep='first', inplace = True)
#count again to confirm that no repeated text is left
print('Number of duplicates:', int(unified_data.duplicated(subset=['text']).sum()))

Number of duplicates: 1
Number of duplicates: 0


In [None]:
#remove, in place, every row that has a missing value in any column
unified_data.dropna(axis=0, how='any', inplace = True)

In [None]:
#keep only the rows whose label is not 'gender-neutral' (whether to drop this class is still an open question)
is_not_neutral = unified_data['gender'].ne('gender-neutral')
unified_data = unified_data[is_not_neutral]

In [None]:
#fix the class imbalance by randomly undersampling the majority class ('male')
from imblearn.under_sampling import RandomUnderSampler

#text and label are each passed to the sampler as a single-column 2D array
X, y = (np.array(unified_data[column]).reshape(-1, 1) for column in ('text', 'gender'))

#random_state is fixed so the same rows are selected on every run
undersample = RandomUnderSampler(sampling_strategy='majority', random_state= 0)
X_undersample, y_undersample = undersample.fit_resample(X, y)

In [None]:
#collapse both resampled arrays to 1D so they can be recombined into a new undersampled 'unified_data' dataframe
X_undersample, y_undersample = (
    resampled.flatten() for resampled in (X_undersample, y_undersample)
)

In [None]:
#both arrays should report the same 1D shape
print(X_undersample.shape, y_undersample.shape, sep='\n')

(8914,)
(8914,)


In [None]:
#rebuild 'unified_data' from the undersampled texts and labels
unified_data = pd.DataFrame(
    dict(text=X_undersample, gender=list(y_undersample)),
    columns=['text', 'gender'],
)

In [None]:
#add a placeholder 'len_text' column (all zeros) and preview the first rows
unified_data = unified_data.assign(len_text=0)

unified_data.head()

Unnamed: 0,text,gender,len_text
0,danielle frenkel is a high jumper born in israel,female,0
1,tania mihailuk is a politician who was born in...,female,0
2,juliet taylor is a woman who works as a castin...,female,0
3,margaret caroline rudd was born in britain. sh...,female,0
4,retta scott was an american artist who died in...,female,0


In [None]:
#length of each text in characters, computed with the vectorised .str accessor
#(an earlier row-by-row Python loop did the same job but took too long and was dropped)
unified_data['len_text'] = unified_data['text'].str.len()

In [None]:
#trial run of 'train_test_split': texts are the inputs, gender labels are the targets
X, y = unified_data['text'], unified_data['gender']

#20% of the rows go to the test partition; 'stratify' ensures same class proportions for each split
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size = 0.2,
    stratify = y,
    random_state = 0,
)

In [None]:
#convert two classes to new binary id column: 'male' -> 0, 'female' -> 1
#an explicit mapping assigned back to the frame replaces the chained
#`unified_data['gender_id'].replace(..., inplace=True)`, which is not guaranteed to write
#through to the frame (FutureWarning in pandas 2.x, no effect under Copy-on-Write)
#note: any label other than 'male'/'female' would become NaN here
unified_data['gender_id'] = unified_data['gender'].map({'male': 0, 'female': 1})
print(unified_data[unified_data['gender'] == 'male'].head(3))
print(unified_data[unified_data['gender'] == 'female'].head(3))

                                                   text gender  len_text  \
4457  holy moly! josh brown guest starred in the sho...   male        99   
4458  jim colver has been serving the residents of a...   male        74   
4459  born in cleveland, ford was luckily taken by t...   male       104   

      gender_id  
4457          0  
4458          0  
4459          0  
                                                text  gender  len_text  \
0   danielle frenkel is a high jumper born in israel  female        48   
1  tania mihailuk is a politician who was born in...  female        56   
2  juliet taylor is a woman who works as a castin...  female        56   

   gender_id  
0          1  
1          1  
2          1  


In [None]:
#independent (deep) copy used for the sentiment experiments, so 'unified_data' itself is left untouched
other_unified = unified_data.copy(deep=True)

### <font color='red'>AI006 - Output Sentiment Analysis From Pretrained Model</font>

- https://towardsdatascience.com/the-most-favorable-pre-trained-sentiment-classifiers-in-python-9107c06442c6

#### Vader

In [None]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Shared analyzer: created on the first call and reused afterwards, instead of
# constructing a new SentimentIntensityAnalyzer for every sentence.
_vader_analyzer = None


#calculate the negative, positive, neutral and compound scores, plus verbal evaluation
def sentiment_vader(sentence, verbose=True):
    """Score a sentence with VADER and label its overall sentiment.

    Parameters
    ----------
    sentence : str
        Text to analyse.
    verbose : bool, default True
        When True, print a legend naming the returned fields on every call.
        Pass False to silence it, e.g. when scoring many rows.

    Returns
    -------
    tuple
        ``(negative, neutral, positive, compound, overall_sentiment)``. The
        first four are the 'neg', 'neu', 'pos' and 'compound' entries of
        ``polarity_scores``. ``overall_sentiment`` is "Positive" when
        compound >= 0.05, "Negative" when compound <= -0.05, else "Neutral".
    """
    global _vader_analyzer
    if _vader_analyzer is None:
        _vader_analyzer = SentimentIntensityAnalyzer()

    sentiment_dict = _vader_analyzer.polarity_scores(sentence)
    negative = sentiment_dict['neg']
    neutral = sentiment_dict['neu']
    positive = sentiment_dict['pos']
    compound = sentiment_dict['compound']

    if compound >= 0.05:
        overall_sentiment = "Positive"
    elif compound <= -0.05:
        overall_sentiment = "Negative"
    else:
        overall_sentiment = "Neutral"

    if verbose:
        print('negative, neutral, positive, compound, overall_sentiment')
    return negative, neutral, positive, compound, overall_sentiment

In [None]:
#try the VADER scorer on the rows labelled 0, 1 and 2
for i in range(3):
    print(
        'TEXT:', other_unified.loc[i, 'text'], '-->',
        '\nSENTIMENT:', sentiment_vader(other_unified.loc[i, 'text']),
        '\n', '-'*30,
    )

negative, neutral, positive, compound, overall_sentiment
TEXT: danielle frenkel is a high jumper born in israel --> 
SENTIMENT: (0.0, 1.0, 0.0, 0.0, 'Neutral') 
 ------------------------------
negative, neutral, positive, compound, overall_sentiment
TEXT: tania mihailuk is a politician who was born in australia --> 
SENTIMENT: (0.0, 1.0, 0.0, 0.0, 'Neutral') 
 ------------------------------
negative, neutral, positive, compound, overall_sentiment
TEXT: juliet taylor is a woman who works as a casting director --> 
SENTIMENT: (0.0, 1.0, 0.0, 0.0, 'Neutral') 
 ------------------------------
negative, neutral, positive, compound, overall_sentiment
TEXT: margaret caroline rudd was born in britain. she was a notorious female forger. --> 
SENTIMENT: (0.195, 0.805, 0.0, -0.4404, 'Negative') 
 ------------------------------
negative, neutral, positive, compound, overall_sentiment
TEXT: retta scott was an american artist who died in 1990 --> 
SENTIMENT: (0.286, 0.714, 0.0, -0.5574, 'Negative') 


#### TextBlob

In [None]:
from textblob import TextBlob


def sentiment_textblob(row):
    """Return the TextBlob ``(polarity, subjectivity)`` pair for a piece of text.

    A legend naming the two returned fields is printed on every call.
    """
    polarity, subjectivity = TextBlob(row).sentiment

    print('polarity, subjectivity')
    return polarity, subjectivity

In [None]:
#try the TextBlob scorer on the rows labelled 0, 1 and 2
for i in range(3):
    print(
        'TEXT:', other_unified.loc[i, 'text'], '-->',
        '\nSENTIMENT:', sentiment_textblob(other_unified.loc[i, 'text']),
        '\n', '-'*30,
    )

polarity, subjectivity
TEXT: danielle frenkel is a high jumper born in israel --> 
SENTIMENT: (0.16, 0.5399999999999999) 
 ------------------------------
polarity, subjectivity
TEXT: tania mihailuk is a politician who was born in australia --> 
SENTIMENT: (0.0, 0.0) 
 ------------------------------
polarity, subjectivity
TEXT: juliet taylor is a woman who works as a casting director --> 
SENTIMENT: (0.0, 0.0) 
 ------------------------------
polarity, subjectivity
TEXT: margaret caroline rudd was born in britain. she was a notorious female forger. --> 
SENTIMENT: (0.0, 0.16666666666666666) 
 ------------------------------
polarity, subjectivity
TEXT: retta scott was an american artist who died in 1990 --> 
SENTIMENT: (0.0, 0.0) 
 ------------------------------
polarity, subjectivity
TEXT: cicely mary barker was a fantasy illustrator who depicted fairies and flowers --> 
SENTIMENT: (0.0, 0.0) 
 ------------------------------


#### Happy Transformer

In [None]:
try:
    #sentencepiece causing error 'legacy-install-failure': https://stackoverflow.com/questions/71575380/sentencepiece-library-is-not-being-installed-in-the-system
    from happytransformer import HappyTextClassification
except ImportError:
    #only a failed import is reported as the known install problem; any other failure
    #(model download, classification) is no longer hidden behind this message
    print('install error, sentencepiece not ocmpatible with Python 3.10 seemingly')
else:
    happy_tc = HappyTextClassification(model_type="DISTILBERT", model_name="distilbert-base-uncased-finetuned-sst-2-english", num_labels=2)

    def sentiment_happy_transformer(text):
        """Classify `text` with the Happy Transformer model and print the sentiment and its score.

        Both the generic 'LABEL_0'/'LABEL_1' names and 'NEGATIVE'/'POSITIVE' are accepted.
        NOTE(review): this SST-2 checkpoint is expected to return 'NEGATIVE'/'POSITIVE';
        confirm against the model config. Any other label is reported as neutral.
        """
        result = happy_tc.classify_text(text)
        label = str(result.label).upper()
        if label in ('LABEL_1', 'POSITIVE'):
            print('positive sentiment:', result.score)
        elif label in ('LABEL_0', 'NEGATIVE'):
            print('negative sentiment:', result.score)
        else:
            print('neutral sentiment:', result.score)

    #classify the rows labelled 0, 1 and 2
    for i in range(0, 2 + 1):
        sentiment_happy_transformer(other_unified['text'][i])

install error, sentencepiece not ocmpatible with Python 3.10 seemingly


#### Amazon Comprehend

In [None]:
#https://dev.classmethod.jp/articles/comprehend-operations-using-python-boto3/

from functools import lru_cache

import boto3


@lru_cache(maxsize=None)
def _comprehend_client(region_name):
    """Create the boto3 'comprehend' client for a region once and reuse it on later calls."""
    return boto3.client('comprehend', region_name=region_name)


def detect_sentiment(text, language_code='en', region_name='eu-west-2'):
    """Run Amazon Comprehend sentiment detection on a piece of text.

    Parameters
    ----------
    text : str
        Text to analyse.
    language_code : str, default 'en'
        Passed to the service as ``LanguageCode``.
    region_name : str, default 'eu-west-2'
        AWS region of the Comprehend endpoint.

    Returns
    -------
    dict
        The unmodified response of ``comprehend.detect_sentiment``.
    """
    comprehend = _comprehend_client(region_name)
    response = comprehend.detect_sentiment(Text=text, LanguageCode=language_code)
    return response

In [None]:
#send the rows labelled 0, 1 and 2 to Amazon Comprehend and print each full response
for i in range(3):
    print(
        'TEXT:', other_unified.loc[i, 'text'], '-->',
        '\nANALYSIS:\n', detect_sentiment(other_unified.loc[i, 'text']),
        '\n', '-'*30,
    )

TEXT: danielle frenkel is a high jumper born in israel --> 
ANALYSIS:
 {'Sentiment': 'NEUTRAL', 'SentimentScore': {'Positive': 0.07776756584644318, 'Negative': 0.017122093588113785, 'Neutral': 0.9037699103355408, 'Mixed': 0.0013403829652816057}, 'ResponseMetadata': {'RequestId': '4c755e42-a77a-4283-9b23-03e1fe7cf4ef', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '4c755e42-a77a-4283-9b23-03e1fe7cf4ef', 'content-type': 'application/x-amz-json-1.1', 'content-length': '164', 'date': 'Mon, 18 Jul 2022 12:14:44 GMT'}, 'RetryAttempts': 0}} 
 ------------------------------
TEXT: tania mihailuk is a politician who was born in australia --> 
ANALYSIS:
 {'Sentiment': 'NEUTRAL', 'SentimentScore': {'Positive': 0.0014963001012802124, 'Negative': 0.042056210339069366, 'Neutral': 0.9535102248191833, 'Mixed': 0.0029372223652899265}, 'ResponseMetadata': {'RequestId': '0777f138-1083-46b1-8f42-38622f802744', 'HTTPStatusCode': 200, 'HTTPHeaders': {'x-amzn-requestid': '0777f138-1083-46b1-8f42-

#### text2emotion

In [None]:
#text2emotion is built on top of NLTK and needs the NLTK data downloads to work; fetching them
#kept failing with an SSL CERTIFICATE VERIFY FAILED error, so the data was downloaded manually
#and stored in the relevant directory
import text2emotion as te

#print the emotion scores for the rows labelled 0, 1 and 2
for i in range(3):
    print(
        'TEXT:', other_unified.loc[i, 'text'], '-->',
        '\nEMOTIONS:', te.get_emotion(other_unified.loc[i, 'text']),
        '\n', '-'*30,
    )

TEXT: danielle frenkel is a high jumper born in israel --> 
EMOTIONS: {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.5, 'Fear': 0.5} 
 ------------------------------
TEXT: tania mihailuk is a politician who was born in australia --> 
EMOTIONS: {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 1.0, 'Fear': 0.0} 
 ------------------------------
TEXT: juliet taylor is a woman who works as a casting director --> 
EMOTIONS: {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.5, 'Sad': 0.0, 'Fear': 0.5} 
 ------------------------------
TEXT: margaret caroline rudd was born in britain. she was a notorious female forger. --> 
EMOTIONS: {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 1.0, 'Fear': 0.0} 
 ------------------------------
TEXT: retta scott was an american artist who died in 1990 --> 
EMOTIONS: {'Happy': 0.0, 'Angry': 0.0, 'Surprise': 0.0, 'Sad': 0.0, 'Fear': 1.0} 
 ------------------------------
TEXT: cicely mary barker was a fantasy illustrator who depicted fairies and flowe