# Assignment Two:  Sentiment Classification

For this exercise you will be using the "SemEval 2017 task 4" corpus provided on the module website, available through the following link: https://warwick.ac.uk/fac/sci/dcs/teaching/material/cs918/semeval-tweets.tar.bz2 You will focus particularly on Subtask A, i.e. classifying the overall sentiment of a tweet as positive, negative or neutral.

You are requested to produce a standalone Python program or Jupyter notebook for coursework submission. The input to your program is the SemEval data downloaded. Note that TAs need to run your program on their own machine by using the original SemEval data. As such, don’t submit a Python program that takes as input some preprocessed files.

#### Import necessary packages
You may import more packages here.

In [1]:
# Import necessary packages
import re
from os.path import join
import numpy as np
import pandas as pd
import nltk
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.tokenize import RegexpTokenizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

In [2]:
# Define test sets
testsets = ['twitter-test1.txt', 'twitter-test2.txt', 'twitter-test3.txt']

In [3]:
# Skeleton: Evaluation code for the test sets
def read_test(testset):
    '''
    readin the testset and return a dictionary
    :param testset: str, the file name of the testset to compare
    '''
    id_gts = {}
    with open(testset, 'r', encoding='utf8') as fh:
        for line in fh:
            fields = line.split('\t')
            tweetid = fields[0]
            gt = fields[1]

            id_gts[tweetid] = gt

    return id_gts


def confusion(id_preds, testset, classifier):
    '''
    print the confusion matrix of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    gts = []
    for m, c1 in id_gts.items():
        if c1 not in gts:
            gts.append(c1)

    gts = ['positive', 'negative', 'neutral']

    conf = {}
    for c1 in gts:
        conf[c1] = {}
        for c2 in gts:
            conf[c1][c2] = 0

    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'
        conf[pred][gt] += 1

    print(''.ljust(12) + '  '.join(gts))

    for c1 in gts:
        print(c1.ljust(12), end='')
        for c2 in gts:
            if sum(conf[c1].values()) > 0:
                print('%.3f     ' % (conf[c1][c2] / float(sum(conf[c1].values()))), end='')
            else:
                print('0.000     ', end='')
        print('')

    print('')


def evaluate(id_preds, testset, classifier):
    '''
    print the macro-F1 score of {'positive', 'netative'} between preds and testset
    :param id_preds: a dictionary of predictions formated as {<tweetid>:<sentiment>, ... }
    :param testset: str, the file name of the testset to compare
    :classifier: str, the name of the classifier
    '''
    id_gts = read_test(testset)

    acc_by_class = {}
    for gt in ['positive', 'negative', 'neutral']:
        acc_by_class[gt] = {'tp': 0, 'fp': 0, 'tn': 0, 'fn': 0}

    catf1s = {}

    ok = 0
    for tweetid, gt in id_gts.items():
        if tweetid in id_preds:
            pred = id_preds[tweetid]
        else:
            pred = 'neutral'

        if gt == pred:
            ok += 1
            acc_by_class[gt]['tp'] += 1
        else:
            acc_by_class[gt]['fn'] += 1
            acc_by_class[pred]['fp'] += 1

    catcount = 0
    itemcount = 0
    macro = {'p': 0, 'r': 0, 'f1': 0}
    micro = {'p': 0, 'r': 0, 'f1': 0}
    semevalmacro = {'p': 0, 'r': 0, 'f1': 0}

    microtp = 0
    microfp = 0
    microtn = 0
    microfn = 0
    for cat, acc in acc_by_class.items():
        catcount += 1

        microtp += acc['tp']
        microfp += acc['fp']
        microtn += acc['tn']
        microfn += acc['fn']

        p = 0
        if (acc['tp'] + acc['fp']) > 0:
            p = float(acc['tp']) / (acc['tp'] + acc['fp'])

        r = 0
        if (acc['tp'] + acc['fn']) > 0:
            r = float(acc['tp']) / (acc['tp'] + acc['fn'])

        f1 = 0
        if (p + r) > 0:
            f1 = 2 * p * r / (p + r)

        catf1s[cat] = f1

        n = acc['tp'] + acc['fn']

        macro['p'] += p
        macro['r'] += r
        macro['f1'] += f1

        if cat in ['positive', 'negative']:
            semevalmacro['p'] += p
            semevalmacro['r'] += r
            semevalmacro['f1'] += f1

        itemcount += n

    micro['p'] = float(microtp) / float(microtp + microfp)
    micro['r'] = float(microtp) / float(microtp + microfn)
    micro['f1'] = 2 * float(micro['p']) * micro['r'] / float(micro['p'] + micro['r'])

    semevalmacrof1 = semevalmacro['f1'] / 2

    print(testset + ' (' + classifier + '): %.3f' % semevalmacrof1)

#### Load training set, dev set and testing set
Here, you need to load the training set, the development set and the test set. For better classification results, you may need to preprocess tweets before sending them to the classifiers.

In [8]:
# Load training set, dev set and testing set
data = {}
tweetids = {}
tweetgts = {}
tweets = {}

for dataset in ['twitter-training-data.txt'] + testsets:
    data[dataset] = []
    tweets[dataset] = []
    tweetids[dataset] = []
    tweetgts[dataset] = []



import re


def cleanup_text(texts):
    cleaned_text = []
    for text in texts:
        # remove ugly &quot and &amp
        text = re.sub(r"&quot;(.*?)&quot;", "\g<1>", text)
        text = re.sub(r"&amp;", "", text)

        # replace emoticon
        text = re.sub(
            r"(^| )(\:\w+\:|\<[\/\\]?3|[\(\)\\\D|\*\$][\-\^]?[\:\;\=]|[\:\;\=B8][\-\^]?[3DOPp\@\$\*\\\)\(\/\|])(?=\s|[\!\.\?]|$)",
            "\g<1>TOKEMOTICON",
            text,
        )

        text = text.lower()
        text = text.replace("tokemoticon", "TOKEMOTICON")

        # replace url
        text = re.sub(
            r"(http|ftp|https)://([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:/~+#-]*[\w@?^=%&/~+#-])?",
            "TOKURL",
            text,
        )

        # replace mention
        text = re.sub(r"@[\w]+", "TOKMENTION", text)

        # replace hashtag
        text = re.sub(r"#[\w]+", "TOKHASHTAG", text)

        # replace dollar
        text = re.sub(r"\$\d+", "TOKDOLLAR", text)

        # remove punctuation
        text = re.sub("[^a-zA-Z0-9]", " ", text)

        # remove multiple spaces
        text = re.sub(r" +", " ", text)

        # remove newline
        text = re.sub(r"\n", " ", text)
        
        #Remove Digits
        text= re.sub('[0-9\n]',' ',text)

        cleaned_text.append(text)
    return cleaned_text


            
# #function for pre-processing the data
# def pre_process_data(senti_data):
    
  
#     senti_data['processed_text'] = senti_data['tweets']
#    # calling all the functions for pre-processing that were defined earlier
#     senti_data['processed_text'] = senti_data['processed_text'].apply(lambda x : x.lower())
#     senti_data['processed_text'] = np.vectorize(spec_punc_remove)(senti_data['processed_text'])
#     senti_data['processed_text'] = np.vectorize(contract_convert)(senti_data['processed_text'])
#     senti_data['processed_text'] = np.vectorize(remove_single_char)(senti_data['processed_text'])
    
#     senti_data['processed_text'] = np.vectorize(remove_space)(senti_data['processed_text'])

#     senti_data['processed_text'] = np.vectorize(replace_url)(senti_data['processed_text'])
    
#     senti_data['processed_text'] = np.vectorize(replace_mentions)(senti_data['processed_text'])
#     senti_data['processed_text'] = np.vectorize(remove_numbers)(senti_data['processed_text'])

#     senti_data['processed_text'] = np.vectorize(replace_hashtags)(senti_data['processed_text'])      

#     senti_data['processed_text'] = np.vectorize(remove_punct)(senti_data['processed_text'])       
#     senti_data['processed_text'] = senti_data['processed_text'].str.strip()    
#     senti_data['processed_text'] = np.vectorize(replace_Long)(senti_data['processed_text'])  
#     senti_data['processed_text'] = np.vectorize(remove_digit)(senti_data['processed_text']) 
#     #Adding exta features namely, positive, negative , and Abusive words
#     senti_data['Abusive words'] = senti_data['processed_text'].apply(lambda x: len(re.findall(abs_words_pattern,x)))
#     senti_data['positive words'] = senti_data['processed_text'].apply(lambda x: len(re.findall(pos_words_pattern,x)))
#     senti_data['negative words'] = senti_data['processed_text'].apply(lambda x: len(re.findall(neg_words_pattern,x)))
    
    
#     return senti_data 
# # write code to read in the datasets here

#Loading the training data as Dictionary
data = {}
with open('twitter-training-data.txt', 'r', encoding='utf-8') as file:
     for lines in file:
            col_val = lines.split('\t')
            tweetids = col_val[0]
            tweetgts = col_val[1]
            tweets = col_val[2].strip()
            data[tweetids] = tweetgts, tweets 
#Converting dic file to dataframe
data = pd.DataFrame.from_dict(data,orient='index',columns = ['tweetgts','tweets'])
data['tweetids'] = data.index
data.reset_index(drop=True, inplace=True)
#calling the function for pre-processing the training data
data = cleanup_text(data)

#feature extraction using Tf-idf for test data
def feature_test(data, vocab):
    tfidf_vectoriser = TfidfVectorizer(min_df=5, ngram_range=(1,3), vocabulary=vocab)
  
    tfidf = tfidf_vectoriser.fit_transform(cleanup_text(data['tweets']))
    return tfidf


#feature extraction using tf-idf for train data
def feature_train(data):
    tfidf_vectoriser = TfidfVectorizer(min_df=5, ngram_range=(1,3))

    tfidf = tfidf_vectoriser.fit_transform(cleanup_text(data['tweets']))

    pickle.dump(tfidf_vectoriser.vocabulary_, open('TF-IDF_Vocabulary.pkl', 'wb'))

    return tfidf


#x_train and y_train 
x_train = feature_train(data)
y_train = data['tweetgts']

# Logistic Regression
logistic_regression = LogisticRegression()
logistic_regression.fit(x_train, y_train)
pickle.dump(logistic_regression, open('LogisticRegression.pkl', 'wb'))

#Support vector machine
SVM = LinearSVC()
SVM.fit(x_train, y_train)
pickle.dump(SVM, open('SVM.pkl', 'wb'))


TypeError: list indices must be integers or slices, not str

#### Build sentiment classifiers
You need to create your own classifiers (at least 3 classifiers). For each classifier, you can choose between the bag-of-word features and the word-embedding-based features. Each classifier has to be evaluated over 3 test sets. Make sure your classifier produce consistent performance across the test sets. Marking will be based on the performance over all 5 test sets (2 of them are not provided to you).

In [11]:
# Buid traditional sentiment classifiers. An example classifier name 'svm' is given
# in the code below. You should replace the other two classifier names
# with your own choices. For features used for classifier training, 
# the 'bow' feature is given in the code. But you could also explore the 
# use of other features.
for classifier in ['Support Vector Machine', 'Logistic Regression']:
    for features in ['bow', '<feature-2-name>']:
        # Skeleton: Creation and training of the classifiers
        if classifier == 'Support Vector Machine':
            print('Training ' + classifier)
            model = pickle.load(open('SVM.pkl','rb'))
            vocab = pickle.load(open('TF-IDF_Vocabulary.pkl','rb'))
        elif classifier == 'Logistic Regression':
            print('Training ' + classifier)
            model = pickle.load(open('LogisticRegression.pkl','rb'))
            vocab = pickle.load(open('TF-IDF_Vocabulary.pkl','rb'))
        else:
            print('Unknown classifier name' + classifier)
            continue

        # Predition performance of the classifiers
    for testset in testsets:
        id_preds = {}
        if(classifier == "Support Vector Machine") | (classifier == "Logistic Regression") :
            #write the prediction and the evaluation code here
            testset_name = testset
            testset_path = join('semeval-tweets', testset_name)
            #loading test data and calling feature_test function for feature extraction on the test data
            data_test = pd.read_csv(testset, sep="\t", names=['tweetids', 'tweetgts', 'tweets'], dtype={'tweetids':'object','tweetgts':'object','tweets':'object'})                
            data_test = pre_process_data(data_test)
     
            features_extract = feature_test(data_test, vocab)

        
            id_preds = dict(zip(data_test['tweetids'],model.predict(features_extract)))
            confusion(id_preds,testset, classifier)
            evaluate(id_preds, testset, classifier)


Training Support Vector Machine
Training Support Vector Machine
            positive  negative  neutral
positive    0.697     0.059     0.244     
negative    0.151     0.639     0.210     
neutral     0.243     0.149     0.608     

twitter-test1.txt (Support Vector Machine): 0.577
            positive  negative  neutral
positive    0.759     0.037     0.204     
negative    0.157     0.672     0.172     
neutral     0.333     0.097     0.571     

twitter-test2.txt (Support Vector Machine): 0.634
            positive  negative  neutral
positive    0.733     0.056     0.211     
negative    0.203     0.554     0.243     
neutral     0.292     0.136     0.572     

twitter-test3.txt (Support Vector Machine): 0.550
Training Logistic Regression
Training Logistic Regression
            positive  negative  neutral
positive    0.739     0.054     0.207     
negative    0.164     0.735     0.102     
neutral     0.229     0.161     0.610     

twitter-test1.txt (Logistic Regression): 0.563
 