# Importing the necessary Libraries

In [1]:
from tensorflow.keras.preprocessing.text import Tokenizer
import re
import numpy as np


# Reading the datasets

In [2]:
def read_file(file_list):
    '''
    Reads tab-separated tweet files into a dictionary keyed by file path.

    Each non-blank line is split on tabs and its first three fields are read as
    <tweet id>, <sentiment label>, <tweet text>.

    Parameters:
        file_list: iterable of paths to the text files to read (opened as UTF-8).

    Returns:
        dict mapping each path to a dict with three parallel lists:
            'tweet'     - the tweet text (third field)
            'sentiment' - one-hot label: 'positive' -> [0, 1, 0],
                          'negative' -> [0, 0, 1], any other label -> [1, 0, 0]
            'ids'       - the tweet id (first field) converted to int

    Raises:
        ValueError: if the first field of a line is not an integer.
        IndexError: if a non-blank line has fewer than three tab-separated fields.
    '''
    one_hot_by_label = {'positive': [0, 1, 0], 'negative': [0, 0, 1]}
    other_label = [1, 0, 0]

    dataset = {}
    for path in file_list:
        tweet = []
        tweetgts = []
        tweetid = []
        with open(path, encoding='utf8') as file:
            for line in file:
                # Strip only the line terminator. Slicing off the last character
                # would corrupt a final line that has no trailing newline.
                line = line.rstrip('\n')
                # Blank lines (e.g. a stray empty line at the end of the file)
                # carry no record and would otherwise fail on int('').
                if not line.strip():
                    continue
                contents = line.split('\t')
                tweetid.append(int(contents[0]))
                # Copy the template so every row owns an independent list.
                tweetgts.append(list(one_hot_by_label.get(contents[1], other_label)))
                tweet.append(contents[2])
        dataset[path] = {'tweet': tweet, 'sentiment': tweetgts, 'ids': tweetid}
    return dataset
# Load every split up front; each file name becomes a key of `dataset`.
dataset = read_file([
    'twitter-training-data.txt',
    'twitter-dev-data.txt',
    'twitter-test1.txt',
    'twitter-test2.txt',
    'twitter-test3.txt',
])

# Text Pre-processing

In [3]:
def cleanup_text(texts):
    '''
    Pre-processes tweets and returns the cleaned tweets.

    For every tweet, in order:
      1. unwraps &quot;...&quot; and drops &amp;
      2. replaces emoticons with TOKEMOTICON
      3. lower-cases the text (the TOKEMOTICON placeholder is restored afterwards)
      4. replaces URLs, @mentions, #hashtags and pound amounts with
         TOKURL, TOKMENTION, TOKTAG and TOKPOUND
      5. replaces every character that is not an ASCII letter with a space
      6. collapses runs of spaces into a single space

    Parameters:
        texts: iterable of tweet strings.

    Returns:
        list of cleaned strings, one per input tweet, in the same order.
        Leading/trailing single spaces are not stripped.
    '''
    cleaned_text = []
    for text in texts:
        # remove ugly &quot and &amp
        # (raw replacement template: "\g" is not a valid string escape)
        text = re.sub(r"&quot;(.*?)&quot;", r"\g<1>", text)
        text = re.sub(r"&amp;", "", text)

        # replace emoticon
        # NOTE(review): the class [\(\)\\\D|\*\$] contains \D (any non-digit),
        # so any non-digit character followed by ':', ';' or '=' counts as an
        # emoticon here; a literal 'D' may have been intended - confirm.
        text = re.sub(
            r"(^| )(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)",
            r"\g<1>TOKEMOTICON",
            text,
        )

        # Lower-casing also lower-cases the placeholder inserted above,
        # so put it back in upper case.
        text = text.lower()
        text = text.replace("tokemoticon", "TOKEMOTICON")

        # replace url
        text = re.sub(
            r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?",
            "TOKURL",
            text,
        )

        # replace mention
        text = re.sub(r"@[\w]+", "TOKMENTION", text)

        # replace hashtag
        text = re.sub(r"#[\w]+", "TOKTAG", text)

        # replace pound amounts (e.g. £20)
        text = re.sub(r"\£\d+", "TOKPOUND", text)

        # remove punctuation (this also turns newlines into spaces)
        text = re.sub("[^a-zA-Z0-9]", " ", text)

        # remove digits
        text = re.sub("[0-9]", " ", text)

        # Collapse multiple spaces LAST: the two substitutions above insert
        # spaces, so collapsing any earlier leaves runs of spaces in the result.
        text = re.sub(r" +", " ", text)

        cleaned_text.append(text)
    return cleaned_text

# Tokenizing the tweets

## To get the accuracy for a different test set, change the test-set file name ('twitter-test3.txt') in the cell below and in the NumPy conversion cell

In [4]:
# Clean the training tweets and the tweets of the split being evaluated.
cleaned_tweets = cleanup_text(dataset['twitter-training-data.txt']['tweet'])
v_clean_tweets = cleanup_text(dataset['twitter-test3.txt']['tweet'])

# The vocabulary is fitted on the training tweets only.
tokenizer = Tokenizer(oov_token='<oov>', num_words=5000)
tokenizer.fit_on_texts(cleaned_tweets)
word_index = tokenizer.word_index
print(len(word_index))

# Encode both splits with the same fitted tokenizer.
train_tokenized_sentence, valid_tokenized_sentence = (
    tokenizer.texts_to_sequences(tweets)
    for tweets in (cleaned_tweets, v_clean_tweets)
)

35402


# Padding the list

In [5]:
def padding(seq, max_len = 45):
    '''
    Makes every tweet sequence exactly max_len items long.

    Shorter sequences are right-padded with 0; longer sequences are truncated
    at the end so that all rows have the same length and can be stacked into a
    rectangular array.

    Parameters:
        seq: iterable of token sequences (e.g. lists of ints).
        max_len: length of every returned sequence (default 45).

    Returns:
        list of lists, one per input sequence, each of length max_len.
        The input sequences are not modified.
    '''
    pad_value = 0
    padded = []
    for tokens in seq:
        # Truncate first: without this, a sequence longer than max_len gets a
        # negative pad size, is returned unchanged, and the rows become ragged.
        kept = list(tokens)[:max_len]
        padded.append(kept + [pad_value] * (max_len - len(kept)))
    return padded
# Pad both splits with the default sequence length.
train_padded_seq, valid_padded_seq = map(
    padding, (train_tokenized_sentence, valid_tokenized_sentence)
)

# Converting to NumPy arrays, which are easier to feed to the classifier

In [6]:
# Features: the padded token-id rows as 2-D arrays.
train_tweet = np.asarray(train_padded_seq)
valid_tweet = np.asarray(valid_padded_seq)

# Labels: collapse each one-hot row to the index of its 1
# (np.argmax already returns an ndarray).
train_sentiment = np.argmax(dataset['twitter-training-data.txt']['sentiment'], axis=1)
valid_sentiment = np.argmax(dataset['twitter-test3.txt']['sentiment'], axis=1)


# Support Vector Machine (SVM) Classifier

In [7]:
from sklearn.svm import SVC
def accuracy(y_pred, y_actual):
    '''
    Returns the percentage (0-100) of predictions that equal the true labels.

    Parameters:
        y_pred: sequence of predicted labels.
        y_actual: sequence of true labels, same length as y_pred.

    Returns:
        float percentage of positions where y_pred[i] == y_actual[i];
        0.0 when both sequences are empty.

    Raises:
        ValueError: if the two sequences differ in length.
    '''
    total = len(y_pred)
    # A length mismatch would otherwise either raise IndexError or silently
    # ignore the extra labels and report a misleading score.
    if total != len(y_actual):
        raise ValueError(
            'y_pred and y_actual must have the same length '
            '(got ' + str(total) + ' and ' + str(len(y_actual)) + ')'
        )
    # Nothing to score: avoid dividing by zero.
    if total == 0:
        return 0.0
    matches = sum(1 for predicted, actual in zip(y_pred, y_actual) if predicted == actual)
    return (matches / total) * 100

# Train a support vector classifier on the padded token-id rows
# (train_tweet) against the integer class labels (train_sentiment).
svc = SVC(gamma='auto')
# Kept as the cell's last expression so the notebook displays the fitted estimator.
svc.fit(train_tweet, train_sentiment)

SVC(gamma='auto')

In [8]:
# Predict labels for the evaluated split and report the percentage of exact matches.
y_pred = svc.predict(valid_tweet)
print(f'Support Vector Machine (SVM) accuracy on test data - {accuracy(y_pred, valid_sentiment)} %')

Support Vector Machine (SVM) accuracy on test data - 41.40395124001681 %
