In [1]:
# Project folder on the mounted Google Drive. Kept absolute (under the
# '/content/gdrive' mount point used by drive.mount) so that
# os.chdir(current_dir) gives the same result no matter what the working
# directory currently is; a relative path only resolves on the first run and
# breaks when the chdir cell is executed again.
current_dir = "/content/gdrive/My Drive/Colab Notebooks/Thesis/gold_standard"

import os
import glob
from google.colab import drive
import pandas as pd
import re
import numpy as np
from sklearn.linear_model import SGDRegressor
from sklearn.svm import SVR
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import LeaveOneOut
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.pipeline import Pipeline
import pickle
from collections import Counter


from sklearn.linear_model import (
    LinearRegression,
    Ridge,
    Lasso,
    ElasticNet,
    PoissonRegressor,
    LogisticRegression,)

from sklearn.ensemble import RandomForestRegressor ,GradientBoostingRegressor, BaggingRegressor, AdaBoostRegressor, HistGradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor



# Mount Google Drive at /content/gdrive so files stored on Drive can be read
# through the regular filesystem. `current_dir` is expected to point at a
# folder below this mount point.
drive.mount('/content/gdrive')


Mounted at /content/gdrive


In [2]:
# Make the project folder the working directory so that files opened by bare
# file name in later cells are looked up there.
os.chdir(current_dir)

In [3]:
# Load the pre-computed dataset from the working directory.
# NOTE(review): presumably a pandas DataFrame (it is indexed by column name and
# with .iloc further down) — confirm against the notebook that wrote the file.
# SECURITY: pickle.load executes arbitrary code embedded in the file; only load
# pickles that were produced by a trusted source.
with open('enriched_dataframe.pkl', 'rb') as file:
    data  = pickle.load(file)

In [84]:
class TextRegressionModel:
    """Leave-one-out cross-validation of a text regression model.

    The text column is turned into a document-term matrix by ``vectorizer`` and
    ``model`` is re-fitted once per held-out row. Predictions are collected in
    ``results_df``; for models exposing ``coef_`` the non-zero coefficients of
    every fold are collected in ``feature_importance_list``.

    Parameters
    ----------
    train_data : table indexed by column name whose columns support ``.iloc``
        (used as a pandas DataFrame).
    target_column : name of the numeric column to predict.
    text_column : name of the column holding the raw text.
    vectorizer : object with ``fit_transform`` / ``get_feature_names_out``;
        defaults to ``CountVectorizer()``.
    model : estimator with ``fit`` / ``predict``; defaults to ``Ridge()``.
    name_column : name of the column identifying each sample in the reports
        (defaults to ``'name'``).
    """

    def __init__(self, train_data, target_column, text_column, vectorizer = None, model=None, name_column='name'):
        self.train_data = train_data
        self.target_column = target_column
        self.text_column = text_column
        self.name_column = name_column
        self.model = Ridge() if model is None else model
        self.vectorizer = CountVectorizer() if vectorizer is None else vectorizer
        self.results_df = None
        self.data_list = []
        self.feature_importance_list = []
        # Filled in by vectorize_text(); declared here so the attribute always exists.
        self.feature_names = None

    def vectorize_text(self):
        """Fit the vectorizer on the text column and return the document-term matrix.

        Also stores the vocabulary in ``self.feature_names``.

        NOTE(review): the vectorizer is fitted on *all* rows, including the row
        that is later held out in each LOOCV fold, so vocabulary (and IDF
        weights for a TF-IDF vectorizer) are learned from the test sample too.
        """
        X = self.vectorizer.fit_transform(self.train_data[self.text_column])
        self.feature_names = self.vectorizer.get_feature_names_out()
        return X

    def _nonzero_coefficients(self):
        """Return ``{token: coefficient}`` for the fitted model's non-zero coefficients.

        Returns ``None`` when the model exposes no ``coef_`` (e.g. tree models
        or non-linear kernels) or when its coefficients do not line up
        one-to-one with the vocabulary.
        """
        # getattr with a default also covers estimators whose ``coef_`` is a
        # property raising AttributeError.
        coefficients = getattr(self.model, "coef_", None)
        if coefficients is None:
            return None
        # Some estimators return a sparse and/or 2-D (1, n_features) coef_;
        # flatten so each token is paired with a scalar.
        if hasattr(coefficients, "toarray"):
            coefficients = coefficients.toarray()
        coefficients = np.ravel(coefficients)
        if coefficients.shape[0] != len(self.feature_names):
            return None
        return {
            token: importance
            for token, importance in zip(self.feature_names, coefficients)
            if abs(importance) > 0
        }

    def perform_loocv(self):
        """Run leave-one-out cross-validation and populate ``results_df``.

        Each call starts from empty result lists, so calling it again replaces
        (rather than appends to) the previous results.
        """
        self.data_list = []
        self.feature_importance_list = []

        X = self.vectorize_text()
        y = self.train_data[self.target_column]
        names = self.train_data[self.name_column]
        # Opt-in flag a model object may carry to request dense input.
        needs_dense = getattr(self.model, "requires_dense", False)

        for train_index, test_index in LeaveOneOut().split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y.iloc[train_index], y.iloc[test_index]
            names_test = names.iloc[test_index]

            if needs_dense:
                X_train = X_train.toarray()
                X_test = X_test.toarray()

            self.model.fit(X_train, y_train)
            y_pred = self.model.predict(X_test)

            for name, predicted, actual in zip(names_test, y_pred, y_test):
                self.data_list.append({'name': name, 'predicted': predicted, 'actual': actual, 'difference': predicted - actual})

            fold_coefficients = self._nonzero_coefficients()
            if fold_coefficients is not None:
                self.feature_importance_list.append({
                    'test_sample_name': names_test.iloc[0],
                    'feature_importance': fold_coefficients,
                })

        self.results_df = pd.DataFrame(self.data_list)
        self.results_df['MAE'] = self.results_df['difference'].abs()
        return

    def get_coefficients(self):
        """Print, per token, in how many folds it had a non-zero coefficient.

        Tokens are listed from most to least frequently selected. For each
        token the average coefficient and the per-fold coefficients (largest
        magnitude first) are printed. Returns ``None``.
        """
        total_models = len(self.feature_importance_list)
        if total_models == 0:
            print("No coefficients recorded: run perform_loocv() with a model that exposes coef_.")
            return

        # Number of folds in which each token received a non-zero coefficient.
        token_frequency = Counter(
            token
            for split_info in self.feature_importance_list
            for token in split_info['feature_importance']
        )

        # Stable sort: ties keep first-seen order.
        top_tokens = sorted(token_frequency, key=token_frequency.get, reverse=True)

        for token in top_tokens:
            # (held-out sample, coefficient) for every fold that used the token.
            token_importances = [
                (split_info['test_sample_name'], split_info['feature_importance'][token])
                for split_info in self.feature_importance_list
                if token in split_info['feature_importance']
            ]
            average_importance = sum(importance for _, importance in token_importances) / len(token_importances)

            print(f"\nToken: {token} \nTop performing feature in {token_frequency[token]}/{total_models} models | Average Strength: {average_importance}")

            sorted_token_importances = sorted(token_importances, key=lambda x: abs(x[1]), reverse=True)
            for test_sample, importance in sorted_token_importances:
                print(f"Test Sample: {test_sample},  Strength: {importance}")
        return

    def evaluate_model(self):
        """Summarise the LOOCV predictions.

        Returns a dict with a description string (``info``), the per-sample
        table sorted by absolute error rendered as text (``results``) and the
        overall R^2, MSE, RMSE and MAE.

        Raises
        ------
        RuntimeError
            If ``perform_loocv()`` has not been run yet.
        """
        if self.results_df is None:
            raise RuntimeError("perform_loocv() must be called before evaluate_model().")

        actual = self.results_df['actual']
        predicted = self.results_df['predicted']
        overall_r2 = r2_score(actual, predicted)
        overall_mse = mean_squared_error(actual, predicted)
        overall_rmse = np.sqrt(overall_mse)
        overall_mae = mean_absolute_error(actual, predicted)
        ranked_results = self.results_df.sort_values(by='MAE', ascending=True)
        return {
            'info': f"LOOCV Results for {self.model} {self.vectorizer}",
            'results': ranked_results[['name', 'actual', 'predicted', 'MAE']].to_string(index=False),
            'overall_r2': overall_r2,
            'overall_mse': overall_mse,
            'overall_rmse': overall_rmse,
            'overall_mae': overall_mae,
        }



In [85]:
# `train_data` is a second name for the object loaded into `data`
# (plain assignment: no copy is made).
train_data = data

In [93]:
# Fixed seed for the estimators that use randomness, so that re-running the
# notebook reproduces the same LOOCV scores.
RANDOM_STATE = 42

# Text representations to compare: unigram / bigram, raw counts / TF-IDF.
vectorizers = [
    CountVectorizer(ngram_range=(1,1)),
    TfidfVectorizer(ngram_range=(1,1)),
    CountVectorizer(ngram_range=(2,2)),
    TfidfVectorizer(ngram_range=(2,2)),
]

# Regressors to compare. Every model is paired with every vectorizer above.
models = [
    LinearRegression(),
    Lasso(),
    ElasticNet(),
    Ridge(),
    DecisionTreeRegressor(random_state=RANDOM_STATE),
    RandomForestRegressor(random_state=RANDOM_STATE),
    KNeighborsRegressor(),
    SVR(kernel='linear'),
    SVR(kernel="rbf"),
    PoissonRegressor(),
    #GradientBoostingRegressor(),
    BaggingRegressor(random_state=RANDOM_STATE),
    AdaBoostRegressor(random_state=RANDOM_STATE),
    #HistGradientBoostingRegressor()
]


In [94]:
# Evaluate every model/vectorizer pairing with LOOCV and keep the metric
# dictionaries of the runs whose overall R^2 exceeds 0.5.
winners = []
for model in models:
  for vectorizer in vectorizers:
    candidate = TextRegressionModel(
        train_data,
        target_column='score',
        text_column='clean_text',
        vectorizer=vectorizer,
        model=model,
    )
    candidate.perform_loocv()
    metrics = candidate.evaluate_model()
    if metrics["overall_r2"] > 0.5:
      winners.append(metrics)




In [None]:
info:
LOOCV Results for ElasticNet() CountVectorizer()
results:
    name  actual  predicted      MAE
bernardo    35.0  35.144126 0.144126
ridgeway    19.0  19.714079 0.714079
    gacy    27.0  26.265473 0.734527
  kemper    26.0  24.874667 1.125333
   marsh    35.8  34.036246 1.763754
   bundy    34.0  36.648760 2.648760
  dahmer    23.0  26.873856 3.873856
mitchell    34.0  38.616285 4.616285
   wayne    40.0  34.122665 5.877335
    carl    36.0  44.118066 8.118066
overall_r2:
0.637832007578986
overall_mse:
14.922625092518496
overall_rmse:
3.8629813735660825
overall_mae:
2.9616120845826357


info:
LOOCV Results for Lasso() CountVectorizer()
results:
    name  actual  predicted      MAE
bernardo    35.0  35.248499 0.248499
  kemper    26.0  25.351551 0.648449
ridgeway    19.0  19.808522 0.808522
   bundy    34.0  34.957588 0.957588
    gacy    27.0  26.011289 0.988711
   marsh    35.8  33.493328 2.306672
  dahmer    23.0  26.826342 3.826342
mitchell    34.0  38.772289 4.772289
   wayne    40.0  34.201087 5.798913
    carl    36.0  44.077803 8.077803
overall_r2:
0.6489502456457403
overall_mse:
14.464513658511176
overall_rmse:
3.803224113631903
overall_mae:
2.843378831244703


info:
LOOCV Results for ElasticNet() CountVectorizer(ngram_range=(2, 2))
results:
    name  actual  predicted      MAE
  dahmer    23.0  23.701694 0.701694
mitchell    34.0  35.901163 1.901163
    gacy    27.0  29.036768 2.036768
   bundy    34.0  31.901030 2.098970
  kemper    26.0  23.802226 2.197774
bernardo    35.0  32.501906 2.498094
   marsh    35.8  33.112114 2.687886
    carl    36.0  39.586108 3.586108
ridgeway    19.0  23.829672 4.829672
   wayne    40.0  34.271181 5.728819
overall_r2:
0.757395999757643
overall_mse:
9.996158184385981
overall_rmse:
3.1616701574304016
overall_mae:
2.826694832770637


info:
LOOCV Results for Lasso() CountVectorizer(ngram_range=(2, 2))
results:
    name  actual  predicted      MAE
  dahmer    23.0  22.784588 0.215412
   bundy    34.0  34.412090 0.412090
    gacy    27.0  28.884616 1.884616
bernardo    35.0  32.898529 2.101471
mitchell    34.0  36.460480 2.460480
   marsh    35.8  33.307327 2.492673
  kemper    26.0  23.364392 2.635608
    carl    36.0  39.410318 3.410318
ridgeway    19.0  22.469374 3.469374
   wayne    40.0  34.497808 5.502192
overall_r2:
0.8025926496279935
overall_mse:
8.133893501788009
overall_rmse:
2.8519981594994075
overall_mae:
2.4584234565094314

In [95]:
# Print each retained run, lowest overall R^2 first: every metric name on its
# own line followed by its value, with blank lines between runs.
for run in sorted(winners, key=lambda entry: entry["overall_r2"]):
  for field, value in run.items():
    print(f"{field}:")
    print(value)
  print("\n")

info:
LOOCV Results for ElasticNet() CountVectorizer()
results:
    name  actual  predicted      MAE
bernardo    35.0  35.144126 0.144126
ridgeway    19.0  19.714079 0.714079
    gacy    27.0  26.265473 0.734527
  kemper    26.0  24.874667 1.125333
   marsh    35.8  34.036246 1.763754
   bundy    34.0  36.648760 2.648760
  dahmer    23.0  26.873856 3.873856
mitchell    34.0  38.616285 4.616285
   wayne    40.0  34.122665 5.877335
    carl    36.0  44.118066 8.118066
overall_r2:
0.637832007578986
overall_mse:
14.922625092518496
overall_rmse:
3.8629813735660825
overall_mae:
2.9616120845826357


info:
LOOCV Results for Lasso() CountVectorizer()
results:
    name  actual  predicted      MAE
bernardo    35.0  35.248499 0.248499
  kemper    26.0  25.351551 0.648449
ridgeway    19.0  19.808522 0.808522
   bundy    34.0  34.957588 0.957588
    gacy    27.0  26.011289 0.988711
   marsh    35.8  33.493328 2.306672
  dahmer    23.0  26.826342 3.826342
mitchell    34.0  38.772289 4.772289
   wayne

In [101]:
# Inspect which unigram tokens the Lasso model keeps (non-zero coefficient)
# across the LOOCV folds.
lasso_check = TextRegressionModel(
    train_data,
    target_column='score',
    text_column='clean_text',
    vectorizer=CountVectorizer(),
    model=Lasso(),
)
lasso_check.perform_loocv()
lasso_check.get_coefficients()


Token: was 
Top performing feature in 10/10 models | Average Strength: -0.28791459779137324
Test Sample: mitchell,  Strength: -0.3415503895369341
Test Sample: gacy,  Strength: -0.32519851656160603
Test Sample: bundy,  Strength: -0.315799205600534
Test Sample: kemper,  Strength: -0.31545620478839653
Test Sample: dahmer,  Strength: -0.3129697038410047
Test Sample: bernardo,  Strength: -0.31120015395030975
Test Sample: ridgeway,  Strength: -0.2956437338466488
Test Sample: marsh,  Strength: -0.28972556709169117
Test Sample: wayne,  Strength: -0.2064881919409591
Test Sample: carl,  Strength: -0.1651143107556484

Token: like 
Top performing feature in 9/10 models | Average Strength: 0.01914764540030129
Test Sample: wayne,  Strength: 0.04281761751233612
Test Sample: bundy,  Strength: 0.04028041719449961
Test Sample: carl,  Strength: -0.03627253961572249
Test Sample: kemper,  Strength: 0.0321273749518544
Test Sample: ridgeway,  Strength: 0.027116296014817874
Test Sample: dahmer,  Strength: 0.

In [None]:
Token: and and
Top performing feature in 9/10 models | Average Strength: -0.29357909154040557
Test Sample: bernardo,  Strength: -0.4435550291168784
Test Sample: gacy,  Strength: -0.3764228605323173
Test Sample: dahmer,  Strength: -0.3676888028632868
Test Sample: carl,  Strength: -0.3587643419616539
Test Sample: bundy,  Strength: -0.3376406755373605
Test Sample: mitchell,  Strength: -0.2970501974800098
Test Sample: marsh,  Strength: -0.24605658787025422
Test Sample: ridgeway,  Strength: -0.11153288272254681
Test Sample: kemper,  Strength: -0.1035004457793427


In [103]:
# Same inspection as for unigrams, but with bigram counts as features.
lasso_check = TextRegressionModel(
    train_data,
    target_column='score',
    text_column='clean_text',
    vectorizer=CountVectorizer(ngram_range=(2, 2)),
    model=Lasso(),
)
lasso_check.perform_loocv()
lasso_check.get_coefficients()


Token: it was 
Top performing feature in 10/10 models | Average Strength: -1.0246022544218585
Test Sample: kemper,  Strength: -1.1466792659839617
Test Sample: carl,  Strength: -1.1075503017551414
Test Sample: mitchell,  Strength: -1.0624943341600304
Test Sample: bundy,  Strength: -1.0532740567648367
Test Sample: dahmer,  Strength: -1.0406091481261253
Test Sample: gacy,  Strength: -1.022153274012203
Test Sample: marsh,  Strength: -1.004597789704983
Test Sample: bernardo,  Strength: -1.000131276562792
Test Sample: ridgeway,  Strength: -0.9379491739182455
Test Sample: wayne,  Strength: -0.8705839232302678

Token: and and 
Top performing feature in 9/10 models | Average Strength: -0.29357909154040557
Test Sample: bernardo,  Strength: -0.4435550291168784
Test Sample: gacy,  Strength: -0.3764228605323173
Test Sample: dahmer,  Strength: -0.3676888028632868
Test Sample: carl,  Strength: -0.3587643419616539
Test Sample: bundy,  Strength: -0.3376406755373605
Test Sample: mitchell,  Strength: -0