In [27]:
import sklearn
import pandas as pd
import numpy as np
import csv, re
import string
import codecs
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import jsonlines

In [28]:
# Load data into a pandas dataframe

data = pd.read_csv('/data/wcpr_mypersonality.csv',encoding='mac-roman')

In [29]:
data.columns

Index(['#AUTHID', 'STATUS', 'sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN', 'cEXT',
       'cNEU', 'cAGR', 'cCON', 'cOPN', 'DATE', 'NETWORKSIZE', 'BETWEENNESS',
       'NBETWEENNESS', 'DENSITY', 'BROKERAGE', 'NBROKERAGE', 'TRANSITIVITY'],
      dtype='object')

In [30]:
X = data['STATUS']
y = data['cNEU']

In [31]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=42)

In [32]:
# Extract features using vectorizer
#the training and testing data were vectorized by CountVectorizer and TfidfVectorizer objects. 
#This transformation is needed, because the ML algorithms work only with numerical data. 
#CountVectorizer creates a dictionary containing the occurrence number of tokens, 
#while TfidfVectorizer generates a dictionary with the tf-idf values of tokens.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Initialize count vectorizer
count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')

# create count train and test variables
count_train = count_vectorizer.fit_transform(X_train, y_train)
count_test = count_vectorizer.transform(X_test)

# Initialize TfidfVectorizer
count_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

# create count train and test variables
tdidf_train = count_vectorizer.fit_transform(X_train, y_train)
tdidf_test = count_vectorizer.transform(X_test)

In [33]:
# Train a Logistic Regression classifier
clf = LogisticRegression(solver='liblinear')

# Train model for count vectorizer
clf.fit(count_train, y_train)

# Train model for TDIDFvectorizer
clf.fit(tdidf_train, y_train)

In [34]:
# Classify the test data and see how well the vectorizers perform

print("Classifying test data...")
# Run predict on count test data
predicted_labels_count = clf.predict(count_test)
print('Logistic Regression Count Accuracy  = {}'.format(metrics.accuracy_score(predicted_labels_count,  y_test)))

# Run predict on tdidf test data
predicted_labels_tdidf = clf.predict(tdidf_test)
print('Logistic Regression TDIDF Accuracy  = {}'.format(metrics.accuracy_score(predicted_labels_tdidf,  y_test)))

Classifying test data...
Logistic Regression Count Accuracy  = 0.6224798387096774
Logistic Regression TDIDF Accuracy  = 0.6461693548387096


In [36]:
# Report for both vectorizers
report_count = classification_report(predicted_labels_count, y_test)
print(f"Classification report for CountVectorizer:\n{report_count}")

report_tdidf = classification_report(predicted_labels_tdidf, y_test)
print(f"Classification report for tdidfVectorizer:\n{report_tdidf}")

Classification report for CountVectorizer:
              precision    recall  f1-score   support

           n       0.80      0.66      0.73      1489
           y       0.33      0.49      0.40       495

    accuracy                           0.62      1984
   macro avg       0.56      0.58      0.56      1984
weighted avg       0.68      0.62      0.64      1984

Classification report for tdidfVectorizer:
              precision    recall  f1-score   support

           n       0.93      0.65      0.77      1776
           y       0.17      0.60      0.26       208

    accuracy                           0.65      1984
   macro avg       0.55      0.63      0.51      1984
weighted avg       0.85      0.65      0.71      1984

