# Classification and Vector Semantics

CS2731 Homework 3, Fall 2020

## Author
[Fangzheng Guo](fag24@pitt.edu)

### Step 1: Global experimental framework set up, make appropriate cross-validation splits.

In [31]:
# import
import pandas as pd
import re
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score

In [32]:
# import
from scipy.stats import ttest_ind
from imblearn.over_sampling import RandomOverSampler
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to /Users/fguo/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/fguo/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [33]:
# split(80% training, 20% testing)
def get_data():
    df = pd.read_excel("SFUcorpus.xlsx")
    X = df['comment_text']
    Y = df['is_constructive']

    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, train_size = 0.8, random_state = 42)
    
    # drop empty lines
    X_train.dropna(axis=0, how='any', inplace=True)
    X_test.dropna(axis=0, how='any', inplace=True)
    
    return X_train, X_test, Y_train, Y_test

### Step 2: 
#### Extract and preprocess the comment text to determine the vocabulary set.
for preprocessing, my approach includes:
- remove all symbols (for example '.', '?') in text, convert all characters to lower-case
- remove stopwords (useless words) in text
- use Porter Stemmer to do stemming for every word left in text

In [34]:
stop_words = set(stopwords.words('english'))
def preprocessing(comment, remove_stopwords, stemming):
    comment = re.sub('[^a-z0-9\s]', '', comment.lower())
    word_list = comment.split(' ')
    if stopwords:
        word_list = [word for word in word_list if word not in stop_words]
    if stemming:
        porter_stemmer = PorterStemmer()
        for word in word_list:
            word = porter_stemmer.stem(word)
    return ' '.join(word_list)

#### Train a logistic regression classifier using bag of words, record the performance of your logistic regression classifier using cross-validation.

In [35]:
def process_bag_of_words(remove_stopwords, stemming):
    X_train, X_test, Y_train, Y_test = get_data()
    
    X_train = X_train.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    X_test = X_test.apply(lambda comment: preprocessing(comment, remove_stopwords, stemming))
    
    count_vectorizer = CountVectorizer()
    X_train_bag_of_words = count_vectorizer.fit_transform(X_train)
    
    ## majority-vote classifier
    majority = DummyClassifier(strategy='most_frequent', random_state=None, constant=None)

    ## logistic regression
    lgm = LogisticRegression(C=10, random_state=0, solver='newton-cg', multi_class='multinomial')
    
    scores_majority = cross_val_score(majority, X_train_bag_of_words, Y_train, cv=5)
    scores_lgm_BOW = cross_val_score(lgm, X_train_bag_of_words, Y_train, cv=5)
    print("average accuracy of majority model: ", np.mean(scores_majority))
    print("average accuracy of logistic regression model: ", np.mean(scores_lgm_BOW))

In [36]:
process_bag_of_words(remove_stopwords = False, stemming = False)

average accuracy of majority model:  0.5251789020925447
average accuracy of logistic regression model:  0.8176518735334838
