# Classification and Vector Semantics

CS2731 Homework 3, Fall 2020

## Author
[Fangzheng Guo](fag24@pitt.edu)

### Task 1: Global experimental framework set up, make appropriate cross-validation splits.

In [102]:
# import
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [103]:
# import
from scipy.stats import ttest_ind
from imblearn.over_sampling import RandomOverSampler
import nltk
nltk.download('stopwords')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/fguo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [104]:
# split(80% training, 20% testing)
def get_data():
    df = pd.read_excel("SFUcorpus.xlsx")
    X = df['comment_text']
    Y = df['is_constructive']

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, train_size = 0.8, random_state = 42)
    
    # drop empty lines
    X_train.dropna(axis=0, how='any', inplace=True)
    X_test.dropna(axis=0, how='any', inplace=True)
    
    return X_train, X_test, Y_train, Y_test

### Task 2: 
#### Extract and preprocess the comment text to determine the vocabulary set.
for preprocessing, my approach includes:
- remove all symbols (for example '.', '?') in text, convert all characters to lower-case
- remove stopwords (useless words) in text
- use Porter Stemmer to do stemming for every word left in text

In [105]:
stop_words = set(stopwords.words('english'))
def preprocessing(comment, remove_stopwords, stemming):
    comment = re.sub('[^a-z0-9\s]', '', comment.lower())
    word_list = comment.split(' ')
    if remove_stopwords:
        word_list = [word for word in word_list if word not in stop_words]
    if stemming:
        porter_stemmer = PorterStemmer()
        for i in range(len(word_list)):
            word_list[i] = porter_stemmer.stem(word_list[i])
    return ' '.join(word_list)

#### Train a logistic regression classifier using bag of words, record the performance of your logistic regression classifier using cross-validation.

In [106]:
def process_bag_of_words(remove_stopwords, stemming):
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    
    count_vectorizer = CountVectorizer()
    X_train_bag_of_words = count_vectorizer.fit_transform(X_train)
    
    ## majority-vote classifier
    majority = DummyClassifier(strategy='most_frequent', random_state=None, constant=None)

    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    
    scores_majority = cross_val_score(majority, X_train_bag_of_words, Y_train, cv=5)
    scores_lgm_BOW = cross_val_score(lgm, X_train_bag_of_words, Y_train, cv=5)
    
    statistic, p_value = ttest_ind(scores_lgm_BOW, scores_majority)
    
    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_BOW))
    print("p value: ", '%.20f' % p_value)

In [107]:
process_bag_of_words(remove_stopwords = False, stemming = False)

average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8392377894812784
p value:  0.00000001319950393971


### Task 3: two improvements to logistic regression classifier
#### Sparse vector semantic representation.


In [108]:
def process_tf_idf(remove_stopwords, stemming):
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    
    tf_idf_vectorizer = TfidfVectorizer()
    X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
    
    ## majority-vote classifier
    majority = DummyClassifier(strategy='most_frequent', random_state=None, constant=None)

    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    
    scores_majority = cross_val_score(majority, X_train_tf_idf, Y_train, cv=5)
    scores_lgm_TFIDF = cross_val_score(lgm, X_train_tf_idf, Y_train, cv=5)
    statistic, p_value = ttest_ind(scores_lgm_TFIDF, scores_majority)
    
    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_TFIDF))
    print("p value: ", '%.20f' % p_value)

In [109]:
process_tf_idf(remove_stopwords = False, stemming = False)

average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8596118434387678
p value:  0.00000001508479227388


#### Dense vector semantic representation.

Use pre-trained word vectors [GloVe](https://nlp.stanford.edu/projects/glove/) to get the dense vector of comments in training set.

In [110]:
def read_glove():
    res = dict()
    with open('glove.6B.50d.txt', encoding = 'utf-8') as file:
        for line in file:
            values = line.split(' ')
            word = values[0]
            vectors = np.asarray(values[1:], dtype = 'float32')
            res[word] = vectors
    return res

In [111]:
vector_map = read_glove()

In [112]:
def process_word2vec(remove_stopwords, stemming):
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    
    dense_vectors = []
    for comment in X_train:
        dense_vector = np.zeros(50)
        word_count = 0
        tokens = word_tokenize(comment)
        for word in tokens:
            if word in vector_map:
                dense_vector += map[word]
                word_count += 1
        if word_count == 0:
            dense_vectors.append(dense_vector)
        else:
            dense_vectors.append(dense_vector / word_count)
    
    X_train_word2vec = pd.DataFrame(dense_vectors)
    
    ## majority-vote classifier
    majority = DummyClassifier(strategy='most_frequent', random_state=None, constant=None)

    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    
    scores_majority = cross_val_score(majority, X_train_word2vec, Y_train, cv=5)
    scores_lgm_word2vec = cross_val_score(lgm, X_train_word2vec, Y_train, cv=5)
    statistic, p_value = ttest_ind(scores_lgm_word2vec, scores_majority)

    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_word2vec))
    print("p value: ", '%.20f' % p_value)

In [113]:
process_word2vec(remove_stopwords=False, stemming=False)

average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.6856825076181543
p value:  0.00080723766474663798


#### Cross-validation performance testing
Test the performance of three logistic regression models, with/without preprocessing steps. Please see the table in report.

In [114]:
process_bag_of_words(remove_stopwords = False, stemming = False)
process_bag_of_words(remove_stopwords = True, stemming = False)
process_bag_of_words(remove_stopwords = False, stemming = True)
process_bag_of_words(remove_stopwords = True, stemming = True)

process_tf_idf(remove_stopwords = False, stemming = False)
process_tf_idf(remove_stopwords = True, stemming = False)
process_tf_idf(remove_stopwords = False, stemming = True)
process_tf_idf(remove_stopwords = True, stemming = True)

process_word2vec(remove_stopwords=False, stemming=False)
process_word2vec(remove_stopwords=True, stemming=False)
process_word2vec(remove_stopwords=False, stemming=True)
process_word2vec(remove_stopwords=True, stemming=True)

average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8392377894812784
p value:  0.00000001319950393971
average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8176518735334838
p value:  0.00000000677103120742
average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.844035337242898
p value:  0.00000011121276604964
average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8380473991775486
p value:  0.00000004297654161048
average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8596118434387678
p value:  0.00000001508479227388
average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.77560575922166
p value:  0.00000280960933344991
average accuracy of majority model:  0.5251789020925447
average acc

#### Testing "best" classifier on test set
From the result above, we know that for this specific case, TF-IDF model without removing stopwords and without stemming has the best performance. Then we test its performance on test set.

In [115]:
def test_tf_idf():
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords = False, stemming = False))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords = False, stemming = False))
    
    tf_idf_vectorizer = TfidfVectorizer()

    X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
    
    X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
    
    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    lgm.fit(X_train_tf_idf, Y_train)

    Y_result = lgm.predict(X_test_tf_idf)
    
    score = accuracy_score(Y_test, Y_result)
    print("accuracy score of logistic regression model: ", score)

In [116]:
test_tf_idf()

accuracy score of logistic regression model:  0.7990430622009569


### Task 4: Experiment on whether balancing dataset helps
Question: will balancing training set help to improve the performance?
#### Use RandomOverSampler to balance the training set

In [117]:
def balance(train, test):
    randomOverSampler = RandomOverSampler(random_state=42)
    return randomOverSampler.fit_sample(train, test)   

In [118]:
def process_bag_of_words_balanced(remove_stopwords, stemming):
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    
    count_vectorizer = CountVectorizer()
    X_train_bag_of_words = count_vectorizer.fit_transform(X_train)
    
    ## majority-vote classifier
    majority = DummyClassifier(strategy='most_frequent', random_state=None, constant=None)

    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    
    X_train_balanced, Y_train_balanced = balance(X_train_bag_of_words, Y_train)
    
    scores_majority = cross_val_score(majority, X_train_bag_of_words, Y_train, cv=5)
    scores_lgm_BOW = cross_val_score(lgm, X_train_balanced, Y_train_balanced, cv=5)
    
    statistic, p_value = ttest_ind(scores_lgm_BOW, scores_majority)
    
    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_BOW))
    print("p value: ", '%.20f' % p_value)

In [119]:
process_bag_of_words_balanced(remove_stopwords=False, stemming=False)

average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8595872518286312
p value:  0.00000000045436593501


In [120]:
def process_tf_idf_balanced(remove_stopwords, stemming):
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    
    tf_idf_vectorizer = TfidfVectorizer()
    X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
    
    ## majority-vote classifier
    majority = DummyClassifier(strategy='most_frequent', random_state=None, constant=None)

    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    
    X_train_balanced, Y_train_balanced = balance(X_train_tf_idf, Y_train)
    
    scores_majority = cross_val_score(majority, X_train_tf_idf, Y_train, cv=5)
    scores_lgm_TFIDF = cross_val_score(lgm, X_train_balanced, Y_train_balanced, cv=5)
    statistic, p_value = ttest_ind(scores_lgm_TFIDF, scores_majority)
    
    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_TFIDF))
    print("p value: ", '%.20f' % p_value)

In [121]:
process_tf_idf_balanced(remove_stopwords=False, stemming=False)

average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8652168234064787
p value:  0.00000000012032765678


In [122]:
def process_word2vec_balanced(remove_stopwords, stemming):
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    
    dense_vectors = []
    for comment in X_train:
        dense_vector = np.zeros(50)
        word_count = 0
        tokens = word_tokenize(comment)
        for word in tokens:
            if word in vector_map:
                dense_vector += map[word]
                word_count += 1
        if word_count == 0:
            dense_vectors.append(dense_vector)
        else:
            dense_vectors.append(dense_vector / word_count)
    
    X_train_word2vec = pd.DataFrame(dense_vectors)
    
    ## majority-vote classifier
    majority = DummyClassifier(strategy='most_frequent', random_state=None, constant=None)

    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    
    X_train_balanced, Y_train_balanced = balance(X_train_word2vec, Y_train)
    
    scores_majority = cross_val_score(majority, X_train_word2vec, Y_train, cv=5)
    scores_lgm_word2vec = cross_val_score(lgm, X_train_balanced, Y_train_balanced, cv=5)
    statistic, p_value = ttest_ind(scores_lgm_word2vec, scores_majority)

    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_word2vec))
    print("p value: ", '%.20f' % p_value)

In [123]:
process_word2vec_balanced(remove_stopwords=False, stemming=False)

average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.6744905956112853
p value:  0.00016852175814685281


#### Test TF-IDF classifier (trained on balanced dataset) on test set

In [124]:
def test_balanced_tf_idf():
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords = False, stemming = False))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords = False, stemming = False))
    
    tf_idf_vectorizer = TfidfVectorizer()

    X_train_tf_idf = tf_idf_vectorizer.fit_transform(X_train)
    
    X_test_tf_idf = tf_idf_vectorizer.transform(X_test)
    
    X_train_balanced, Y_train_balanced = balance(X_train_tf_idf, Y_train)
    
    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    lgm.fit(X_train_balanced, Y_train_balanced)

    Y_result = lgm.predict(X_test_tf_idf)
    
    score = accuracy_score(Y_test, Y_result)
    print("accuracy score of logistic regression model: ", score)

In [125]:
test_balanced_tf_idf()

accuracy score of logistic regression model:  0.7894736842105263
