In [3]:
import re
import spacy
import numpy
import pandas
import unicodedata

from matplotlib import pyplot

from nltk.tokenize import TweetTokenizer

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, recall_score, precision_score, confusion_matrix, ConfusionMatrixDisplay, \
    classification_report, accuracy_score
from sklearn.model_selection import train_test_split


In [4]:
# Regular expressions used by processDataset to pull structured parts out of a Tweet's text.

# Hyperlinks: either an http:// / https:// URL (first capture group) or a bare "www." address
# that runs up to the next whitespace (second capture group). Because the pattern has two
# capture groups, re.findall returns one 2-tuple per match in which only one element is non-empty.
# NOTE(review): inside the third character class, `$-_` is a character RANGE (from `$` to `_`),
# not three literal characters, so it accepts more symbols than the ones listed - confirm intended.
HYPERLINKS_REGEX = r'(http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+)|(www\.[^ \s]+)'
# Hashtags: a '#' followed by one or more word characters (letters, digits or underscore).
HASHTAGS_REGEX = r'#\w+'
# Mentions: an '@' followed by one or more word characters (letters, digits or underscore).
MENTIONS_REGEX = r'@\w+'
# Numbers: one or more consecutive ASCII digits.
NUMERIC_REGEX = r'[0-9]+'

In [5]:
class Stopwords:
    """
    A utility class designed to store all the essential information related to Greek stopwords.

    The stopwords are read from a UTF-8 text file that holds one stopword per line.
    """
    def __init__(self, path):
        """
        Load the stopwords stored in the file located at `path`.

        Line terminators are stripped and blank lines are ignored, so the
        empty string is never registered as a stopword.
        """
        # The context manager closes the file even if reading it raises
        with open(path, "r", encoding="utf8") as stopwordFile:
            strippedLines = (line.rstrip("\r\n") for line in stopwordFile)
            self.__stopwords = {stopword for stopword in strippedLines if stopword}

    def have(self, word):
        """
        Return True if `word` is one of the loaded stopwords, False otherwise.

        The comparison is exact: no normalization is applied to `word`.
        """
        return word in self.__stopwords

In [6]:
class Tweet:
    """
    Record holding what the pipeline knows about one Tweet: its id, party,
    sentiment, original and processed text, plus the links, hashtags and
    mentions collected from it.
    """
    def __init__(self):
        # Scalar fields start out unset
        self.__id = None
        self.__party = None
        self.__sentiment = None
        self.__originalText = None
        self.__text = None
        # Collected items; each instance gets its own lists
        self.__links = []
        self.__hashtags = []
        self.__mentions = []

    # ----- identifier -----

    def setID(self, id):
        self.__id = id

    def getID(self):
        return self.__id

    # ----- hyperlinks -----

    def addLinks(self, links):
        """Append every element of `links` to the stored hyperlinks."""
        self.__links.extend(links)

    def getLinks(self):
        return self.__links

    def getTotalLinks(self):
        """Number of hyperlinks stored so far."""
        return len(self.getLinks())

    # ----- hashtags -----

    def addHashtags(self, hashtags):
        """Append every element of `hashtags` to the stored hashtags."""
        self.__hashtags.extend(hashtags)

    def getHashtags(self):
        return self.__hashtags

    def getTotalHashtags(self):
        """Number of hashtags stored so far."""
        return len(self.getHashtags())

    # ----- mentions -----

    def addMentions(self, mentions):
        """Append every element of `mentions` to the stored mentions."""
        self.__mentions.extend(mentions)

    def getMentions(self):
        return self.__mentions

    def getTotalMentions(self):
        """Number of mentions stored so far."""
        return len(self.getMentions())

    # ----- party -----

    def setParty(self, party):
        self.__party = party

    def getParty(self):
        return self.__party

    # ----- sentiment -----

    def setSentiment(self, sentiment):
        self.__sentiment = sentiment

    def getSentiment(self):
        return self.__sentiment

    def isPositive(self):
        """True when the stored sentiment equals 0."""
        return self.getSentiment() == 0

    def isNegative(self):
        """True when the stored sentiment equals 1."""
        return self.getSentiment() == 1

    # ----- text -----

    def setText(self, text):
        self.__text = text

    def getText(self):
        return self.__text

    def setOriginalText(self, originalText):
        self.__originalText = originalText

    def getOriginalText(self):
        return self.__originalText


In [7]:
# Sentiment label -> numeric class id, and the inverse lookup (class id -> label)
classes = dict(POSITIVE=0, NEGATIVE=1, NEUTRAL=2)
reverseClasses = {code: label for label, code in classes.items()}

# spaCy pipeline loaded from the "el_core_news_lg" package
nlp = spacy.load("el_core_news_lg")

# The stopword list is loaded from Greek-Stopwords.txt; the file is essential,
# so consult the README section about it before running the notebook
greekStopwords = Stopwords("Greek-Stopwords.txt")

# NLTK tokenizer configured to keep the original case and to shorten elongated character runs
tokenizer = TweetTokenizer(reduce_len=True, preserve_case=True)

# Party -> numeric identifier registry, filled in by prepareDataset
encounteredParties = {}


def removeAccents(token):
    """
    Return `token` without its accents.

    The token is decomposed (Unicode NFD) so that every accent becomes a separate
    combining mark, and all characters of category 'Mn' are then left out.
    """
    decomposed = unicodedata.normalize('NFD', token)
    keptCharacters = [symbol for symbol in decomposed if unicodedata.category(symbol) != 'Mn']
    return ''.join(keptCharacters)


def hasSpecialCharacters(token):
    """
    Report whether `token` contains at least one character that is not alphanumeric.

    An empty token has no such character, so the result is False.
    """
    return not all(character.isalnum() for character in token)


def removeSpecialCharacters(token):
    """
    Return `token` reduced to its alphanumeric characters, in their original order.
    """
    return "".join(character for character in token if character.isalnum())


def normalize(token):
    """
    Bring the given token into its normalized form.

    Three steps are applied, in this order:

    1. Conversion to uppercase

    2. Removal of accents

    3. Collapsing every run of identical consecutive characters into one character

    The order is significant: in a word such as Αγέεεέλη the accented and unaccented
    variants of a letter only form a single run once the accents have been removed.
    """
    uppercased = token.upper()
    unaccented = removeAccents(uppercased)
    # (.)\1+ matches a character followed by one or more copies of itself
    return re.sub(r'(.)\1+', r'\1', unaccented)


def _stripTrailingSigma(word):
    """
    Return `word` without its final Σ, provided the word is longer than one character.

    Words such as ΤΣΙΠΡΑ and ΤΣΙΠΡΑΣ share the same semantic value, and hashtags such as
    #ΓΑΤΑ_ΤΕΛΟΣ and #ΓΑΤΑ_ΤΕΛΟ refer to the same topic, so both forms are mapped to one.
    """
    if len(word) > 1 and word.endswith("Σ"):
        return word[:-1]
    return word


def processDataset(path, isTestingSet):
    """
    A function designed to preprocess the given dataset.
    The full processing routine is documented in the supplied report.

    Parameters:
        path: location of a CSV file with the columns 'New_ID', 'Party' and 'Text',
              plus 'Sentiment' unless `isTestingSet` is True.
        isTestingSet: when True, the 'Sentiment' column is not read and the
                      sentiment of every Tweet is left unset.

    Returns:
        A list with one Tweet object per row of the CSV file, in file order.

    Raises:
        KeyError: if a 'Sentiment' value is not a key of `classes`.
    """
    dataframe = pandas.read_csv(path, encoding="utf-8")
    tweets = []

    for _, row in dataframe.iterrows():

        tweet = Tweet()

        # Store the Tweet's id
        tweet.setID(row['New_ID'])

        # Store Tweet's party
        tweet.setParty(row['Party'])

        # Store Tweet's sentiment as long as it doesn't belong to the testing set
        if not isTestingSet:
            tweet.setSentiment(classes[row['Sentiment']])

        # An empty 'Text' cell is read by pandas as NaN (a float), which would make the
        # regex calls below raise a TypeError. Such a row is handled as an empty Tweet.
        text = row['Text']
        if not isinstance(text, str):
            text = "" if pandas.isna(text) else str(text)

        # Store Tweet's original text
        tweet.setOriginalText(text)

        # Find all hyperlinks in the Tweet's text and store them.
        # The pattern has two capture groups, so every match is a tuple in which
        # only one element is non-empty; joining the tuple yields the matched link.
        links = re.findall(HYPERLINKS_REGEX, text)
        links = ["".join(group) for group in links]
        tweet.addLinks(links)

        # Eliminate all hyperlinks from the Tweet's text
        text = re.sub(HYPERLINKS_REGEX, " ", text)

        # Find all hashtags in the Tweet's text; they are refined and stored further below
        hashtags = re.findall(HASHTAGS_REGEX, text)

        # Eliminate all hashtags from the Tweet's text
        text = re.sub(HASHTAGS_REGEX, " ", text)

        # Find all mentions in the Tweet's text and store them.
        mentions = re.findall(MENTIONS_REGEX, text)
        tweet.addMentions(mentions)

        # Eliminate all mentions from the Tweet's text
        text = re.sub(MENTIONS_REGEX, " ", text)

        # Eliminate all numeric characters from the Tweet's text
        text = re.sub(NUMERIC_REGEX, " ", text)

        # Tokenize the Tweet's modified text using the nltk TweetTokenizer
        # Join the tokens produced and then tokenize the produced string using spaCy
        # This decision was made after observations on how the spaCy model handles tokenization
        tweetTokens = nlp(" ".join(tokenizer.tokenize(text)))

        refinedTokens = []

        for token in tweetTokens:

            # If the corresponding token is identified as a stopword, eliminate it.
            if greekStopwords.have(normalize(token.text)):
                continue

            # If the corresponding token contains a special character, eliminate it.
            if hasSpecialCharacters(token.text):
                continue

            # Lemmatize, normalize and drop the concluding Σ character of the token
            refinedToken = _stripTrailingSigma(normalize(token.lemma_))

            # Store the refined token
            refinedTokens.append(refinedToken)

        refinedHashtags = []

        for hashtag in hashtags:

            # Remove the # character, normalize the hashtag and
            # eliminate any special characters (e.g. underscores) from it
            refinedHashtag = removeSpecialCharacters(normalize(hashtag[1:]))

            # A hashtag made up solely of special characters (e.g. #___) is empty
            # at this point and carries no information, so it is skipped
            if not refinedHashtag:
                continue

            # Eliminate the concluding Σ character, if present
            refinedHashtag = _stripTrailingSigma(refinedHashtag)

            # Store the refined hashtag in the refined tokens list
            refinedTokens.append(refinedHashtag)

            # Store the refined hashtag in the refined hashtags list
            refinedHashtags.append(refinedHashtag)

        # Unify all refined tokens into a single string and store it
        tweet.setText(" ".join(refinedTokens))

        # Store all the Tweet's refined hashtags for potential future use
        tweet.addHashtags(refinedHashtags)
        tweets.append(tweet)

    return tweets


def prepareDataset(tweets, vectorizer, useParty, useLinks, isTrainingSet):
    """
    Convert processed Tweets into a dense feature matrix with the matching labels and ids.

    Parameters:
        tweets: the Tweet objects to convert.
        vectorizer: object offering fit_transform / transform; it is fitted on the
                    Tweets' text when `isTrainingSet` is True and only applied otherwise.
        useParty: when True, a column holding the numeric identifier of each Tweet's
                  party is appended (0 for a party missing from `encounteredParties`).
        useLinks: when True, a column is appended that holds 1 for Tweets
                  with at least one hyperlink and 0 for the rest.
        isTrainingSet: when True, the vectorizer is fitted and new parties
                       are registered in the module-level `encounteredParties`.

    Returns:
        A tuple (instances, labels, ids): the feature matrix as a numpy array, the list
        of the Tweets' sentiments and the list of the Tweets' ids, all in the same order.
    """
    ids = [tweet.getID() for tweet in tweets]
    corpus = [tweet.getText() for tweet in tweets]
    labels = [tweet.getSentiment() for tweet in tweets]

    if isTrainingSet:
        # Store all encountered parties and assign a unique numerical identifier to each one.
        # The identifier is derived from the size of the registry rather than from a counter
        # that restarts on every call: the registry outlives the call, so a restarted counter
        # would hand out identifiers that are already taken by previously registered parties.
        for tweet in tweets:
            party = tweet.getParty()
            if party not in encounteredParties:
                encounteredParties[party] = len(encounteredParties) + 1

        featureMatrix = vectorizer.fit_transform(corpus)

    else:
        featureMatrix = vectorizer.transform(corpus)

    instances = featureMatrix.toarray()

    # In the case where the Tweet's party is included as a feature
    # if its party has been encountered before set the feature's value to the
    # unique numerical identifier associated with the Tweet's party, otherwise set it to 0
    if useParty:
        parties = [encounteredParties.get(tweet.getParty(), 0) for tweet in tweets]
        instances = numpy.hstack((instances, numpy.array(parties).reshape(-1, 1)))

    # In the case where the presence of hyperlinks in a Tweet is included as a feature
    # if the Tweet has at least one hyperlink set the feature's value to 1, otherwise set it to 0
    if useLinks:
        links = [1 if tweet.getTotalLinks() > 0 else 0 for tweet in tweets]
        instances = numpy.hstack((instances, numpy.array(links).reshape(-1, 1)))

    return instances, labels, ids


In [8]:
def evaluateModel(model, instances, labels, average):
    """
    Score the predictions of `model` for `instances` against `labels`.

    Returns the tuple (F1, Recall, Precision, Accuracy); the first three
    scores are computed with the given `average` mode.
    """
    predicted = model.predict(instances)

    # The three averaged scores share one call signature, accuracy takes no averaging mode
    averagedScores = [
        scorer(labels, predicted, average=average)
        for scorer in (f1_score, recall_score, precision_score)
    ]

    return (*averagedScores, accuracy_score(labels, predicted))

In [9]:
# Preprocess the three splits; sentiments are read for the training and validation files only
trainingTweets = processDataset("Data/train_set.csv", isTestingSet=False)
validationTweets = processDataset("Data/valid_set.csv", isTestingSet=False)
testingTweets = processDataset("Data/test_set.csv", isTestingSet=True)

In [None]:
# Experiment switches: extra features, vectorizer family and vocabulary size
useParty = True
useLinks = False
bow = True
maxFeatures = 1600

# Pick the vectorizer family (BOW or TF-IDF) from the `bow` switch.
# A custom whitespace tokenizer is supplied and lowercasing is disabled, so the
# tokens are exactly the whitespace-separated pieces of the preprocessed text.
vectorizer = (CountVectorizer if bow else TfidfVectorizer)(
    max_features=maxFeatures,
    tokenizer=lambda text: text.split(),
    lowercase=False,
)

# The vectorizer is fitted on the training split ...
trainingInstances, trainingLabels, _ = prepareDataset(
    trainingTweets, vectorizer, useParty, useLinks, isTrainingSet=True)

# ... and only applied to the validation and testing splits
validationInstances, validationLabels, _ = prepareDataset(
    validationTweets, vectorizer, useParty, useLinks, isTrainingSet=False)

testingInstances, _, testingIds = prepareDataset(
    testingTweets, vectorizer, useParty, useLinks, isTrainingSet=False)


def printEssentials():
    """
    Print the vectorizer-related settings of the current experiment,
    as read from the module-level variables, followed by an empty line.
    """
    lines = [
        "# VECTORIZER #",
        "vectorizer: BOW" if bow else "vectorizer: TF-IDF",
        f"useParty: {useParty}",
        f"useLinks: {useLinks}",
        f"maxFeatures: {maxFeatures}",
    ]
    print("\n".join(lines))
    print()


# Model parameters as suggested by Optuna.
# modelScaling passes them to LogisticRegression as max_iter, C, solver, penalty and multi_class.
iterations = 10000
C = 0.00252187198163273
solver = "saga"
penalty = "l2"
multiClass = "multinomial"

# Evaluation parameter: averaging mode handed to the F1, Recall and Precision scorers
average = "weighted"

# Fixed seed so that repeated runs draw the same random training subsets
randomState = 420


def modelScaling(trainingInstances, trainingLabels, validationInstances, validationLabels, testingInstances,
                 testingIds):
    """
    Train Logistic Regression models on growing portions of the training set, plot the
    learning curves, report on the final model and write its predictions for the testing set.

    A fresh model is fitted for every portion from 10% to 100% (in steps of 10%) and scored
    both on the data it was fitted on and on the validation set. The model of the last step,
    which is fitted on the complete training set, is used for the confusion matrices, the
    classification reports and the predictions written to submission.csv.

    The hyperparameters (iterations, penalty, C, solver, multiClass), the averaging mode
    (average) and the seed (randomState) are read from the module-level variables.

    Parameters:
        trainingInstances, trainingLabels: features and sentiments of the training set.
        validationInstances, validationLabels: features and sentiments of the validation set.
        testingInstances, testingIds: features and ids of the testing set.

    Returns:
        None. The results are printed, plotted and saved to submission.csv.
    """
    sampleSize = []
    trainingMetrics = []
    validationMetrics = []
    model = None
    start = 0.1
    end = 1.1
    step = 0.1
    steps = numpy.arange(start, end, step)
    sentimentCodes = [0, 1, 2]
    sentimentLabels = ["POSITIVE", "NEGATIVE", "NEUTRAL"]

    for i in steps:

        print(f"Progress: {(i * 100):.2f} %")

        # The saga solver shuffles the data, so the model is seeded
        # as well; otherwise the reported scores change from run to run
        model = LogisticRegression(max_iter=iterations,
                                   penalty=penalty, C=C,
                                   solver=solver, multi_class=multiClass,
                                   random_state=randomState)

        # Train the model on the complete training set or on a (i * 100) percentage of it.
        # The values produced by arange are subject to floating point rounding, so the last
        # step is recognized with a tolerance instead of relying on it being exactly 1.0.
        if i >= 1.0 or numpy.isclose(i, 1.0):
            X, Y = trainingInstances, trainingLabels
        else:
            X, _, Y, _ = train_test_split(trainingInstances, trainingLabels,
                                          train_size=i, random_state=randomState)

        # Train the model
        model.fit(X, Y)

        # Evaluate the model on the data it was trained on
        metrics = evaluateModel(model, X, Y, average=average)
        trainingMetrics.append(metrics)

        # Evaluate the model on the validation data
        metrics = evaluateModel(model, validationInstances, validationLabels, average=average)
        validationMetrics.append(metrics)

        sampleSize.append(i)

    # Evaluate the fully trained model on the validation set
    valLogMetrics = evaluateModel(model, validationInstances, validationLabels, average=average)

    # Evaluate the fully trained model on the training set
    trnLogMetrics = evaluateModel(model, trainingInstances, trainingLabels, average=average)

    # Print and plot each of the following metrics
    METRICS = ["F1", "RECALL", "PRECISION", "ACCURACY"]
    for position, metric in enumerate(METRICS):
        print(f"Training - {metric}: {trnLogMetrics[position]}")
        print(f"Validation - {metric}: {valLogMetrics[position]}")
        print()
        trainingMetric = [value[position] for value in trainingMetrics]
        validationMetric = [value[position] for value in validationMetrics]
        pyplot.plot(sampleSize, trainingMetric, label="Training", marker='o')
        pyplot.plot(sampleSize, validationMetric, label="Validation", marker='x')
        pyplot.xlabel('Data Percentage')
        pyplot.ylabel(metric)
        pyplot.grid(True)
        pyplot.legend()
        pyplot.show()

    # Calculate and display the Confusion Matrix related to the validation set
    predictions = model.predict(validationInstances)
    confusionMatrix = confusion_matrix(validationLabels, predictions, labels=sentimentCodes)
    confusionMatrixDisplay = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=sentimentLabels)
    confusionMatrixDisplay.plot()
    pyplot.show()

    # Calculate and print the classification report related to the validation set.
    # The class codes are passed explicitly, as for the confusion matrix, so that the
    # three target names still line up when a class is absent from labels and predictions.
    classificationReport = classification_report(validationLabels, predictions,
                                                 labels=sentimentCodes, target_names=sentimentLabels)
    print("# Validation - Classification Report #")
    print(classificationReport)
    print()

    # Calculate and display the Confusion Matrix related to the training set
    predictions = model.predict(trainingInstances)
    confusionMatrix = confusion_matrix(trainingLabels, predictions, labels=sentimentCodes)
    confusionMatrixDisplay = ConfusionMatrixDisplay(confusion_matrix=confusionMatrix, display_labels=sentimentLabels)
    confusionMatrixDisplay.plot()
    pyplot.show()

    # Calculate and print the classification report related to the training set
    classificationReport = classification_report(trainingLabels, predictions,
                                                 labels=sentimentCodes, target_names=sentimentLabels)
    print("# Training - Classification Report #")
    print(classificationReport)
    print()

    # Make the predictions on the testing set and save them as instructed
    testPredictions = model.predict(testingInstances)
    testPredictions = [reverseClasses[prediction] for prediction in testPredictions]
    values = list(zip(testingIds, testPredictions))
    dataframe = pandas.DataFrame(values, columns=['Id', 'Predicted'])
    dataframe.to_csv('submission.csv', index=False)


# Print each parameter used for the corresponding experiment
# Print each parameter used for the corresponding experiment:
# first the vectorizer settings, then the model and evaluation settings
printEssentials()
print(
    "# MODEL-SCALING #",
    f"iterations: {iterations}",
    f"penalty: {penalty}",
    f"C: {C}",
    f"solver: {solver}",
    f"multiClass: {multiClass}",
    f"average: {average}",
    f"randomState: {randomState}",
    "",
    sep="\n",
)

# Run the experiment on the feature matrices prepared above
modelScaling(trainingInstances, trainingLabels,
             validationInstances, validationLabels,
             testingInstances, testingIds)


In [11]:
import optuna


def objective(trial):
    """
    Optuna objective: build the features and a Logistic Regression model from the
    values suggested by `trial` and score the model on the validation set.

    Returns the weighted F1 score on the validation set, or 0 when the weighted F1 score
    on the training set exceeds it by more than the threshold, so that configurations
    with a larger gap between the two scores are never selected as the best trial.
    """

    # Definition of the BOW vectorizer search space
    useParty = trial.suggest_categorical('useParty', [True, False])
    useLinks = trial.suggest_categorical('useLinks', [True, False])
    maxFeatures = trial.suggest_int(name="Features", low=1500, high=2000, step=100)
    vectorizer = CountVectorizer(max_features=maxFeatures,
                                 tokenizer=lambda text: text.split(), lowercase=False)

    trainingInstances, trainingLabels, _ = prepareDataset(tweets=trainingTweets,
                                                          vectorizer=vectorizer,
                                                          useParty=useParty,
                                                          useLinks=useLinks,
                                                          isTrainingSet=True)

    validationInstances, validationLabels, _ = prepareDataset(tweets=validationTweets,
                                                              vectorizer=vectorizer,
                                                              useParty=useParty,
                                                              useLinks=useLinks,
                                                              isTrainingSet=False)

    iterations = 10000

    # Definition of the Logistic Regression model search space
    C = trial.suggest_float('C', low=1e-4, high=1e-2, log=True)
    multiClass = trial.suggest_categorical('multi_class', ['multinomial', 'ovr'])
    penalty = trial.suggest_categorical('penalty', ['l2', 'l1'])

    model = LogisticRegression(max_iter=iterations,
                               penalty=penalty, C=C,
                               solver="saga", multi_class=multiClass)
    model.fit(trainingInstances, trainingLabels)

    predictions = model.predict(validationInstances)
    valF1 = f1_score(validationLabels, predictions, average="weighted")

    predictions = model.predict(trainingInstances)
    trainF1 = f1_score(trainingLabels, predictions, average="weighted")

    # Largest accepted difference between the training and the validation F1 score
    threshold = 0.01

    if trainF1 - valF1 <= threshold:
        return valF1

    else:
        return 0


# Number of configurations to evaluate. Without a limit the search would not stop on its own.
# NOTE(review): 100 is a placeholder, the original cell did not specify a trial count.
numberOfTrials = 100

# The objective function has its own name: it used to be called `study` as well, so creating
# the Study object overwrote it, and optimize() was called without any objective to evaluate.
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=numberOfTrials)
parameters = study.best_params
print(f"Best Parameters: {parameters}")