In [1]:
# Import libraries
import nltk
import sklearn
import pandas as pd
import numpy as np
import csv, re
import string
import codecs
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer
from spacy.lang.en import English
from nltk.stem import PorterStemmer
from tqdm import tqdm
from sklearn.linear_model import LogisticRegression
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import jsonlines
import spacy
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

In [2]:
# Read the myPersonality CSV export into a DataFrame
path_file = r"data/wcpr_mypersonality.csv"
df = pd.read_csv(path_file, encoding='latin-1')

# Mirror every row into a JSON Lines file, one record per line
with jsonlines.open('myPersonality.jsonl', 'w') as writer:
    for _, row in df.iterrows():
        record = row.to_dict()
        writer.write(record)

# Read the JSON Lines records back into memory
with jsonlines.open('myPersonality.jsonl', 'r') as reader:
    data = [item for item in reader]

# Rebuild the DataFrame from the reloaded records
df = pd.DataFrame(data)

FileNotFoundError: [Errno 2] No such file or directory: 'data/wcpr_mypersonality.csv'

In [None]:
# Show the column labels of the loaded frame (the cell's last expression is displayed)
df.columns

In [None]:
import csv
import pandas as pd
from io import StringIO

def read_and_clean_lines(df):
    """Extract two positional columns from every row of ``df`` as strings.

    The frame is serialized to CSV text in memory and parsed back with
    ``csv.reader``, so every returned value is a string. Column position 1
    is collected as the status text and column position 8 as the label
    (presumably the neuroticism flag -- confirm against the dataset schema).

    Returns:
        A ``(lines, neurotic_flags)`` tuple of two equally long lists.
    """
    print("\nReading and cleaning text from dataframe")

    # Serialize without the index so positions 1 and 8 refer to frame columns.
    buffer = StringIO(df.to_csv(index=False))
    rows = csv.reader(buffer)
    next(rows, None)  # discard the header row

    records = [(row[1], row[8]) for row in rows]
    lines = [text for text, _ in records]
    neurotic_flags = [flag for _, flag in records]

    print("Read {} status posts.".format(len(lines)))
    print("Read {} labels".format(len(neurotic_flags)))
    return lines, neurotic_flags

In [None]:
# Read a set of stoplist words from filename, assuming it contains one word per line

def load_stopwords(filename):
    """Load a stoplist file (one word per line) and return the words as a set.

    Args:
        filename: path to a text file containing one stopword per line.

    Returns:
        A set of stopword strings. Surrounding whitespace (including a
        trailing carriage return from Windows line endings) is stripped and
        blank lines are skipped, so the set never contains an empty string.

    Bytes that are not valid ASCII are silently dropped while decoding.
    """
    with open(filename, 'r', encoding='ascii', errors='ignore') as fp:
        raw_lines = fp.read().splitlines()

    # Strip each entry so "the\r" or " the " still matches the token "the".
    stripped = (entry.strip() for entry in raw_lines)
    return {word for word in stripped if word}

In [None]:
# NOTE(review): `path_file` is assigned the dataset path "data/wcpr_mypersonality.csv" in the
# data-loading cell, so this call parses the CSV dataset itself as the stopword list.
# TODO: confirm the intended stoplist file (one word per line) and pass its path here instead.
stop_words = load_stopwords(path_file)

Split dataset

In [None]:
# Split dataset into training items/labels
# X_train and y_train  are the training items and labels, respectively
# X_test  and y_test   are the test items and labels, respectively
def split_training_set(lines, labels, test_size=0.2, random_seed=42):
    """Split documents and labels into stratified, shuffled train/test portions.

    Args:
        lines: sequence of documents.
        labels: sequence of labels aligned with ``lines``; also used to
            stratify the split so both portions keep the label proportions.
        test_size: fraction (or count) of items held out for testing.
        random_seed: seed forwarded to ``train_test_split`` so the split is
            reproducible from run to run.

    Returns:
        ``(X_train, X_test, y_train, y_test)``.
    """
    X_train, X_test, y_train, y_test = train_test_split(
        lines,
        labels,
        test_size=test_size,
        stratify=labels,
        shuffle=True,
        random_state=random_seed,  # previously the seed was accepted but never used
    )
    print("Training set label counts: {}".format(Counter(y_train)))
    print("Test set     label counts: {}".format(Counter(y_test)))
    return X_train, X_test, y_train, y_test

In [None]:
# Read the dataset in, then split it into training and test portions:
# X_train / X_test hold the documents, y_train / y_test hold the matching labels.
lines, neurotic_flags = read_and_clean_lines(df)
X_train, X_test, y_train, y_test = split_training_set(lines, neurotic_flags)

Pass the stopword list loaded above (a list of frequently used words) as an argument to `CountVectorizer`.
`CountVectorizer` tokenizes the text (tokenization means dividing sentences into words) and performs very basic preprocessing: it removes punctuation marks and converts all words to lowercase.

In [None]:
# Function to read train(X) data use stopwords, lowercase and ngram_range as arguments
def convert_text_into_features(X, stopwords_arg, analyzefn="word", range=(1,2)):
    """Fit a CountVectorizer on ``X`` and return the count matrix and the vectorizer.

    Args:
        X: iterable of documents (strings) to fit on.
        stopwords_arg: stopwords handed to ``CountVectorizer(stop_words=...)``;
            may be ``None``, the string ``'english'``, a list, or any other
            iterable of words (e.g. a set), which is converted to a list.
        analyzefn: value for the ``analyzer`` argument -- ``"word"``, ``"char"``,
            ``"char_wb"`` or a callable. NOTE(review): per the scikit-learn
            documentation, ``stop_words`` and ``ngram_range`` are not applied
            when the analyzer is a callable -- confirm this is intended.
        range: ``(min_n, max_n)`` forwarded as ``ngram_range``.

    Returns:
        ``(X_features, training_vectorizer)``: the fitted document-term matrix
        and the fitted vectorizer, which can later ``transform`` test data.
    """
    # Recent scikit-learn releases validate stop_words as 'english', a list or
    # None, so other collections (such as a set) must be converted first.
    if stopwords_arg is not None and not isinstance(stopwords_arg, (str, list)):
        stopwords_arg = list(stopwords_arg)

    training_vectorizer = CountVectorizer(stop_words=stopwords_arg,
                                          analyzer=analyzefn,
                                          lowercase=True,
                                          ngram_range=range)
    X_features = training_vectorizer.fit_transform(X)
    return X_features, training_vectorizer

In [None]:
# Functions to tokenize and normalize
def whitespace_tokenizer(line):
    """Split ``line`` on runs of whitespace and return the resulting pieces."""
    pieces = line.split()
    return pieces

def normalize_tokens(tokenlist):
    """Lowercase and clean a list of token strings.

    Tokens with no non-whitespace character (including empty strings) and
    tokens starting with "@" are dropped. Each remaining token is lowercased
    and has every "_" replaced by "+".

    Args:
        tokenlist: iterable of token strings.

    Returns:
        A new list with the normalized tokens, in their original order.
    """
    normalized_tokens = []
    for token in tokenlist:
        # Raw-string pattern: the non-raw '[^\s]' is an invalid escape sequence
        # that newer Python versions warn about.
        if re.search(r'\S', token) is None:
            continue  # whitespace-only token
        if token.startswith("@"):
            continue  # handle
        normalized_tokens.append(token.lower().replace('_', '+'))
    return normalized_tokens

In [None]:
# Functions for ngram and filtering stopwords
def ngrams(tokens, n):
    """Return every contiguous window of ``n`` items from ``tokens``, in order.

    Each window is a slice of ``tokens`` (so a list input yields lists).
    """
    window_count = len(tokens) - n + 1
    windows = []
    for start in range(window_count):
        windows.append(tokens[start:start + n])
    return windows

def filter_punctuation_bigrams(ngrams):
    """Keep only the 2-grams in which neither token occurs in ``string.punctuation``.

    Membership is tested against the punctuation string itself, so a token
    counts as punctuation when it is a substring of ``string.punctuation``.
    """
    kept = []
    for pair in ngrams:
        if pair[0] in string.punctuation or pair[1] in string.punctuation:
            continue
        kept.append(pair)
    return kept

# Drops the 3-grams that contain a punctuation token and returns the ones that remain
def filter_punctuation_trigrams(ngrams):
    """Keep only the 3-grams in which no token occurs in ``string.punctuation``.

    Membership is tested against the punctuation string itself, so a token
    counts as punctuation when it is a substring of ``string.punctuation``.
    """
    kept = []
    for triple in ngrams:
        if (triple[0] in string.punctuation
                or triple[1] in string.punctuation
                or triple[2] in string.punctuation):
            continue
        kept.append(triple)
    return kept

# Drops the 2-grams that contain a stopword and returns the ones that remain
def filter_stopword_bigrams(ngrams, stopwords):
    """Keep only the 2-grams in which neither token is in ``stopwords``."""
    result = []
    for pair in ngrams:
        if pair[0] in stopwords or pair[1] in stopwords:
            continue
        result.append(pair)
    return result

# Drops the 3-grams that contain a stopword and returns the ones that remain
def filter_stopword_trigrams(ngrams, stopwords):
    """Keep only the 3-grams in which none of the three tokens is in ``stopwords``."""
    result = []
    for triple in ngrams:
        if triple[0] in stopwords or triple[1] in stopwords or triple[2] in stopwords:
            continue
        result.append(triple)
    return result

Input:
lines     - a raw text corpus, where each element in the list is a string
stopwords - a set of strings that are stopwords
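remove_stopword_ngrams - True or False; when True, bigrams and trigrams that contain a stopword are dropped
applied_ngrams - the n-gram sizes to use; trigram features are included only when this contains 3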

Output:  a corresponding list converting the raw strings to space-separated features

The extracted features should include the non-stopword, non-punctuation unigrams,
plus the bigram features that were counted in collect_bigram_counts from the previous assignment
(and, optionally, trigram features), each represented as underscore_separated tokens.

In [None]:
# Function to convert the raw strings to space-separated features 

def convert_lines_to_feature_strings(lines, stopwords, remove_stopword_ngrams=True, applied_ngrams=(2, 3)):
    """Convert raw text documents into space-separated feature strings.

    Each document is tokenized with spaCy's English tokenizer, normalized with
    ``normalize_tokens`` and rendered as a single string containing its
    unigram features followed by the requested n-gram features, where the
    tokens of an n-gram are joined with "_".

    Args:
        lines: iterable of raw text strings, one per document.
        stopwords: collection of stopword strings (tested with ``in``).
        remove_stopword_ngrams: when True, bigrams/trigrams containing any
            stopword are dropped.
        applied_ngrams: n-gram sizes to emit in addition to unigrams. Sizes 2
            (bigrams) and 3 (trigrams) are recognized; a size that is absent
            is neither computed nor emitted. The default emits both.

    Returns:
        A list with one feature string per input document, in input order.
    """
    print(" Converting from raw text to unigram and bigram features")
    if remove_stopword_ngrams:
        print(" Includes filtering stopword bigrams")

    print(" Initializing")
    nlp = English(parser=False)
    use_bigrams = 2 in applied_ngrams
    use_trigrams = 3 in applied_ngrams
    all_features = []
    print(" Iterating through documents extracting ngram features")

    for line in tqdm(lines):

        # Get spacy tokenization and normalize the tokens
        spacy_tokens = [token.orth_ for token in nlp(line)]
        normalized_tokens = normalize_tokens(spacy_tokens)

        # Unigram features: exclude stopwords and punctuation strings (e.g. '.' or ',')
        unigrams = [token for token in normalized_tokens
                    if token not in stopwords and token not in string.punctuation]
        feature_groups = [" ".join(unigrams)]

        # Bigram features, rendered as underscore-joined tokens
        if use_bigrams:
            bigrams = filter_punctuation_bigrams(ngrams(normalized_tokens, 2))
            if remove_stopword_ngrams:
                bigrams = filter_stopword_bigrams(bigrams, stopwords)
            feature_groups.append(" ".join("_".join(bigram) for bigram in bigrams))

        # Trigram features, rendered as underscore-joined tokens
        if use_trigrams:
            trigrams = filter_punctuation_trigrams(ngrams(normalized_tokens, 3))
            if remove_stopword_ngrams:
                trigrams = filter_stopword_trigrams(trigrams, stopwords)
            feature_groups.append(" ".join("_".join(trigram) for trigram in trigrams))

        # Groups are separated by a single space, even when a group is empty
        all_features.append(" ".join(feature_groups))

    return all_features

Roll your own feature extraction.

In [None]:
# Call convert_lines_to_feature_strings() to turn each document into a single
# whitespace-separated string of features that will now represent the document.
print("Creating feature strings for training data")
X_train_feature_strings = convert_lines_to_feature_strings(X_train, stop_words, applied_ngrams=[2, 3])

print("Creating feature strings for test data")
X_test_documents = convert_lines_to_feature_strings(X_test,  stop_words, applied_ngrams=[2, 3])

# Call CountVectorizer with whitespace-based tokenization as the analyzer, so that it uses exactly your features,
# but without doing any of its own analysis/feature-extraction.
# whitespace_tokenizer is passed positionally and therefore binds to the `analyzefn` parameter.
# NOTE(review): with a callable analyzer, scikit-learn documents that stop_words and ngram_range
# are not applied, so stop_words presumably has no effect on this call -- confirm.
X_features_train, training_vectorizer = convert_text_into_features(X_train_feature_strings, stop_words, whitespace_tokenizer)
        

In [None]:
# Apply the vectorizer fitted on the training data to the test feature strings, producing the
# test-set feature vectors. transform() (not fit_transform()) is used here, so the vectorizer
# is not refitted on the test documents.
X_test_features =  training_vectorizer.transform(X_test_documents)

logistic regression

In [None]:
# Create a logistic regression classifier (liblinear solver) and fit it on the
# featurized training data and the matching training labels
lr_classifier = LogisticRegression(solver='liblinear')
lr_classifier.fit(X_features_train, y_train)

In [None]:
# Classify the test data and see how well you perform.
# scikit-learn metric functions take (y_true, y_pred) in that order; passing them
# reversed leaves accuracy unchanged but swaps the reported precision and recall.

print("Classifying test data...")
predicted_labels = lr_classifier.predict(X_test_features)
print('Accuracy  = {}'.format(metrics.accuracy_score(y_test, predicted_labels)))
for label in ['n', 'y']:
    print('Precision for label {} = {}'.format(label, metrics.precision_score(y_test, predicted_labels, pos_label=label)))
    print('Recall    for label {} = {}'.format(label, metrics.recall_score(y_test, predicted_labels, pos_label=label)))

In [None]:
 # Graph the confusion matrix to show accuracies

print("Generating plots...")
# metrics.plot_confusion_matrix was deprecated and later removed from scikit-learn;
# ConfusionMatrixDisplay.from_estimator is its replacement. Fall back to the old
# function only on installations that predate the replacement.
confusion_display = getattr(metrics, "ConfusionMatrixDisplay", None)
if confusion_display is not None and hasattr(confusion_display, "from_estimator"):
    confusion_display.from_estimator(lr_classifier, X_test_features, y_test, normalize='true')
else:
    metrics.plot_confusion_matrix(lr_classifier, X_test_features, y_test, normalize='true')
plt.show()