In [11]:
import sklearn
import pandas as pd
import numpy as np
import csv, re
import string
import codecs
import nltk
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import GridSearchCV
import sklearn.metrics as metrics
import matplotlib.pyplot as plt
import jsonlines

In [2]:
# Load data into a pandas dataframe

data = pd.read_csv('data/wcpr_mypersonality.csv',encoding='mac-roman')

In [3]:
data.columns

Index(['#AUTHID', 'STATUS', 'sEXT', 'sNEU', 'sAGR', 'sCON', 'sOPN', 'cEXT',
       'cNEU', 'cAGR', 'cCON', 'cOPN', 'DATE', 'NETWORKSIZE', 'BETWEENNESS',
       'NBETWEENNESS', 'DENSITY', 'BROKERAGE', 'NBROKERAGE', 'TRANSITIVITY'],
      dtype='object')

In [4]:
X = data['STATUS']
y = data['cNEU']

In [5]:
from sklearn.model_selection import train_test_split

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, shuffle=True, random_state=42)

In [6]:
# Extract features using vectorizer
#the training and testing data were vectorized by CountVectorizer and TfidfVectorizer objects. 
#This transformation is needed, because the ML algorithms work only with numerical data. 
#CountVectorizer creates a dictionary containing the occurrence number of tokens, 
#while TfidfVectorizer generates a dictionary with the tf-idf values of tokens.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

# Initialize count vectorizer
count_vectorizer = CountVectorizer(analyzer='word', stop_words='english')

# create count train and test variables
count_train = count_vectorizer.fit_transform(X_train, y_train)
count_test = count_vectorizer.transform(X_test)

# Initialize TfidfVectorizer
count_vectorizer = TfidfVectorizer(analyzer='word', stop_words='english')

# create count train and test variables
tdidf_train = count_vectorizer.fit_transform(X_train, y_train)
tdidf_test = count_vectorizer.transform(X_test)

In [12]:
# Define the hyperparameters to search over
param_grid = {'C': [0.01, 0.1, 1, 10], 'penalty': ['l1', 'l2']}

# Create a grid search object
grid_search = GridSearchCV(LogisticRegression(solver='liblinear'), param_grid, cv=5)

# Fit the grid search object to the training data
grid_search.fit(count_train, y_train)

# Print the best hyperparameters found by the grid search
print("Best hyperparameters: ", grid_search.best_params_)

# Create a new logistic regression object with the best hyperparameters and fit it to the entire training data
best_logreg = LogisticRegression(**grid_search.best_params_, solver='liblinear')
best_logreg.fit(count_train, y_train)

# Evaluate the performance of the final model on the test data
test_accuracy = best_logreg.score(count_test, y_test)
print("Test accuracy: ", test_accuracy)

Best hyperparameters:  {'C': 0.1, 'penalty': 'l2'}
Test accuracy:  0.6370967741935484


In [16]:
# Classify the test data and see how well the vectorizers perform

print("Classifying test data...")
# Run predict on count test data
predicted_labels_count = best_logreg.predict(count_test)
print('Logistic Regression Accuracy  = {}'.format(metrics.accuracy_score(predicted_labels_count,  y_test)))


Classifying test data...
Logistic Regression Accuracy  = 0.6370967741935484


In [17]:
# Report for both vectorizers
report_count = classification_report(predicted_labels_count, y_test)
print(f"Classification report after parameter tuning:\n{report_count}")

Classification report after parameter tuning:
              precision    recall  f1-score   support

           n       0.96      0.64      0.77      1856
           y       0.10      0.59      0.17       128

    accuracy                           0.64      1984
   macro avg       0.53      0.62      0.47      1984
weighted avg       0.90      0.64      0.73      1984

