# Answering research question 1

In [1]:
import pandas as pd
from collections import Counter

## Loading data

In [2]:
data_pan15_train = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\raw_data\PAN_15_training.csv')
data_pan15_test = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\raw_data\PAN_15_test.csv')

In [3]:
combined_pan15_train = data_pan15_train.groupby('author').agg({
    'text': ' '.join,
    'gender': 'first',
    'age': 'first'
}).reset_index()
combined_pan15_test = data_pan15_test.groupby('author').agg({
    'text': ' '.join,
    'gender': 'first',
    'age': 'first'
}).reset_index()

In [4]:
df_gpt = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\gpt1.csv')
df_gemini = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\gemini1.csv')

## Variable creation

In [5]:
import re
from nltk.probability import FreqDist
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import statistics


# Character-based features
def character_count(text):
    return len(text)

def alphabetic_ratio(text):
    alphabetic = sum(c.isalpha() for c in text)
    return alphabetic/len(text)

def uppercase_ratio(text):
    upper = sum(c.isupper() for c in text)
    return upper/len(text)

def digit_ratio(text):
    digit = sum(c.isdigit() for c in text)
    return digit/len(text)

def whitespace_ratio(text):
    whitespace = sum(c.isspace() for c in text)
    return whitespace/len(text)

def tab_ratio(text):
    tabs = text.count('\t')
    return tabs/len(text)

def letter_ratio(text, letter):
    text = text.lower()
    letter_count = text.count(letter)
    return letter_count/len(text)

def specialcharacter_ratio(text, character):
    spec_count = text.count(character)
    return spec_count/len(text)

# Word-based features
def number_words(text):
    words = re.findall(r'\b\w+\b', text)
    return len(words)

def word_length(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0

    total_length = sum(len(word) for word in words)
    return total_length/num_words

def vocabulary_richness(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0
    
    num_uniq_words = len(set(words))
    return num_uniq_words/num_words

def long_words(text):
    words = re.findall(r'\b\w+\b', text)
    long_words_list = [word for word in words if len(word) > 6]
    return len(long_words_list)/len(words)

def short_words(text):
    words = re.findall(r'\b\w+\b', text)
    short_words_list = [word for word in words if 1 <= len(word) <= 3]
    return len(short_words_list)/len(words)

def legomena(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    legomena = [word for word in freq if freq[word] == 1]
    return len(legomena)/len(words)

def dislegomena(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    dislegomena = [word for word in freq if freq[word] == 2]
    return len(dislegomena)/len(words)

def yules_k(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    Vi = FreqDist(freq.values())
    K = 10**4 * ((-N + sum(i**2 * Vi[i] for i in Vi))/N**2)
    return K

def simpson_d(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    if N < 2:
        return 0
    D = sum(fr * (fr - 1) / (N * (N - 1)) for fr in freq.values())
    return D

def sichel_s(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    dislegomena = [word for word in freq if freq[word] == 2]
    S = len(dislegomena)/len(freq.values())
    return S

def honores_r(text):
    words = re.findall(r'\b\w+\b', text.lower())
    if not words:
        return 0
    freq = FreqDist(words)
    N = len(words)
    V = len(freq.values())
    legomena = [word for word in freq if freq[word] == 1]
    unique_count_ratio = len(legomena) / V if V > 0 else 0
    if unique_count_ratio == 1 or N == 0:
        return 0
    R = (100*np.log(N)/(1-(len(legomena)/V)))
    return R

def entropy(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    E = -sum((fr / N) * np.log(fr/N) for fr in freq.values())
    return E

# Syntatic features
def punctuations_ratio(text, punctuation):
    punctuation_list = re.findall(punctuation, text)
    return len(punctuation_list)/len(text)

# Structural features
def lines(text):
    return len(text.split('\n'))

def sentences(text):
    return len(sent_tokenize(text))

def paragraphs(text):
    return len([par for par in text.split('\n\n') if par.strip()])

def sentence_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(sent_tokenize(par)) for par in pars])

def words_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(re.findall(r'\b\w+\b', par)) for par in pars])

def chars_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(par) for par in pars])

def words_sentences(text):
    sents = sent_tokenize(text)
    return statistics.mean([len(word_tokenize(sentence)) for sentence in sents])

def uppercase_start(text):
    sents = sent_tokenize(text)
    return (sum(1 for sentence in sents if sentence[0].isupper()) / len(sents))

def extract_features(dataframe, text_column):
    features = pd.DataFrame()

    # Character-based features
    features['total_characters'] = dataframe[text_column].apply(character_count)
    features['ratio_alphabetic'] = dataframe[text_column].apply(alphabetic_ratio)
    features['ratio_uppercase'] = dataframe[text_column].apply(uppercase_ratio)
    features['ratio_digit'] = dataframe[text_column].apply(digit_ratio)
    features['ratio_whitespace'] = dataframe[text_column].apply(whitespace_ratio)
    features['ratio_tabspace'] = dataframe[text_column].apply(tab_ratio)
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    for letter in letters:
        features[letter+'_frequency'] = dataframe[text_column].apply(letter_ratio, args=(letter,))
    special_characters = ['~', '@', '#', '$', '%', '^', '&', '*', '-', '_', '=', '+', '>', '<', '[', ']', '{', '}', '/', '\\', '|']
    for character in special_characters:
        features[character+'_frequency'] = dataframe[text_column].apply(specialcharacter_ratio, args=(character,))
    
    # Word-based features
    features['total_words'] = dataframe[text_column].apply(number_words)
    features['word_length'] = dataframe[text_column].apply(word_length)
    features['vocabulary_richness'] = dataframe[text_column].apply(vocabulary_richness)
    features['long_words'] = dataframe[text_column].apply(long_words)
    features['short_words'] = dataframe[text_column].apply(short_words)
    features['hapax_legomena'] = dataframe[text_column].apply(legomena)
    features['hapax_dislegomena'] = dataframe[text_column].apply(dislegomena)
    features['yules_k'] = dataframe[text_column].apply(yules_k)
    features['simpson_d'] = dataframe[text_column].apply(simpson_d)
    features['sichel_s'] = dataframe[text_column].apply(sichel_s)
    #features['honore_r'] = dataframe[text_column].apply(honores_r)
    features['entropy'] =  dataframe[text_column].apply(entropy)
    # Brunet W?
    # word length frequency distribution

    
    # Syntactic features
    punctuations = [r"’", r",", r"\.", r":", r";", r"\?", r"\?{2,}", r"!", r"!{2,}", r"\.{3}"]
    for punctuation in punctuations:
        features[punctuation+"_frequency"] = dataframe[text_column].apply(punctuations_ratio, args=(punctuation,))

    # Structural features
    features['number_lines'] = dataframe[text_column].apply(lines)
    features['number_sentences'] = dataframe[text_column].apply(sentences)
    features['number_paragraphs'] = dataframe[text_column].apply(paragraphs)
    features['sentences_per_paragraph'] = dataframe[text_column].apply(sentence_paragraph)
    features['word_per_paragraph'] = dataframe[text_column].apply(words_paragraph)
    features['character_per_paragraph'] = dataframe[text_column].apply(chars_paragraph)
    features['word_per_sentence'] = dataframe[text_column].apply(words_sentences)
    features['ratio_sentencestart_uppercase'] = dataframe[text_column].apply(uppercase_start)
    #features['gender'] = dataframe['gender']
        
    return features


## Data transformation

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [7]:
combined_pan15_train_features = extract_features(combined_pan15_train, 'text')
combined_pan15_test_features = extract_features(combined_pan15_test, 'text')

In [8]:
scaler = StandardScaler()
combined_pan15_train_features_scaled = pd.DataFrame(scaler.fit_transform(combined_pan15_train_features), columns=combined_pan15_train_features.columns)
combined_pan15_test_features_scaled = pd.DataFrame(scaler.transform(combined_pan15_test_features), columns=combined_pan15_test_features.columns)

In [10]:
X_train = combined_pan15_train_features_scaled
X_test = combined_pan15_test_features_scaled

In [9]:
y_train = combined_pan15_train['gender']
y_test = combined_pan15_test['gender']

## Training model

In [22]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [129]:
model = LinearSVC(max_iter=10000, C= 0.01, class_weight= None, loss= 'squared_hinge', tol= 0.0001)
model.fit(X_train, y_train)

In [130]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7535211267605634

## Training improved model

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from scipy.sparse import csr_matrix, hstack

In [12]:
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), sublinear_tf=True, min_df=2)
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), sublinear_tf=True, min_df=2)

combined_features = FeatureUnion([
    ('char', char_vectorizer),
    ('word', word_vectorizer)
])

In [13]:
tfidf_features_train = combined_features.fit_transform(combined_pan15_train['text'])
tfidf_features_test =  combined_features.transform(combined_pan15_test['text'])

In [14]:
X_train = hstack([tfidf_features_train, csr_matrix(combined_pan15_train_features_scaled)])
X_test = hstack([tfidf_features_test, csr_matrix(combined_pan15_test_features_scaled)])

In [34]:
model = LinearSVC(max_iter=10000, C= 0.01, class_weight= None, loss= 'squared_hinge', tol= 0.0001)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Precision:", round(precision_score(y_test, y_pred, pos_label='F'), 3))
print("Recall:", round(recall_score(y_test, y_pred, pos_label='F'), 3))

Accuracy: 0.754
Precision: 0.743
Recall: 0.775


In [30]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

# Make predictions
y_pred_lr = model_lr.predict(X_test)

# Print evaluation metrics
print("Logistic Regression:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_lr), 3))
print("Precision:", round(precision_score(y_test, y_pred_lr, pos_label='F'), 3))
print("Recall:", round(recall_score(y_test, y_pred_lr, pos_label='F'), 3))

Logistic Regression:
Accuracy: 0.754
Precision: 0.725
Recall: 0.817


In [29]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = model_rf.predict(X_test)

# Print evaluation metrics
print("Random Forest:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_rf), 3))
print("Precision:", round(precision_score(y_test, y_pred_rf, pos_label='F'), 3))
print("Recall:", round(recall_score(y_test, y_pred_rf, pos_label='F'), 3))

Random Forest:
Accuracy: 0.725
Precision: 0.716
Recall: 0.746


In [41]:
len(y_pred)

142

In [20]:
X_train

<152x164169 sparse matrix of type '<class 'numpy.float64'>'
	with 1463113 stored elements in Compressed Sparse Row format>

## Predicting LLM data

In [17]:
gpt_features = extract_features(df_gpt, 'text')

In [18]:
gpt_features_scaled = pd.DataFrame(scaler.transform(gpt_features), columns=gpt_features.columns)

In [23]:
gpt_pred = model.predict(gpt_features_scaled)



ValueError: X has 82 features, but LinearSVC is expecting 164169 features as input.

In [33]:
Counter(gpt_pred)

Counter({'M': 109, 'F': 141})

In [135]:
gpt_pred

array(['M', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'M', 'M', 'F', 'M', 'M',
       'M', 'M', 'M', 'F', 'M', 'M', 'M', 'F', 'M', 'M', 'F', 'M', 'F',
       'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'F', 'F',
       'F', 'M', 'M', 'F', 'F', 'F', 'M', 'F', 'M', 'F', 'F', 'F', 'M',
       'F', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'F', 'M', 'M',
       'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'M', 'F',
       'F', 'M', 'M', 'M', 'F', 'F', 'M', 'F', 'M', 'M', 'F', 'M', 'F',
       'M', 'F', 'M', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F',
       'M', 'M', 'F', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'F',
       'M', 'M', 'F', 'M', 'F', 'F', 'M', 'M', 'F', 'M', 'F', 'M', 'M',
       'M', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'M',
       'F', 'F', 'M', 'F', 'M', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'M',
       'F', 'M', 'F', 'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'F',
       'F', 'F', 'F', 'F', 'M', 'F', 'F', 'F', 'F', 'M', 'F', 'F

In [39]:
gemini_features = extract_features(df_gemini, 'text')

In [40]:
gemini_features_scaled = pd.DataFrame(scaler.fit_transform(gemini_features), columns=gemini_features.columns)

In [29]:
gemini_pred = model.predict(gemini_features_scaled)



ValueError: X has 82 features, but LinearSVC is expecting 164169 features as input.

In [35]:
Counter(gemini_pred)

NameError: name 'gemini_pred' is not defined

In [24]:
gemini_features_scaled

Unnamed: 0,total_characters,ratio_alphabetic,ratio_uppercase,ratio_digit,ratio_whitespace,ratio_tabspace,a_frequency,b_frequency,c_frequency,d_frequency,...,"!{2,}_frequency",\.{3}_frequency,number_lines,number_sentences,number_paragraphs,sentences_per_paragraph,word_per_paragraph,character_per_paragraph,word_per_sentence,ratio_sentencestart_uppercase
0,2.537847,-2.870541,1.533162,-0.085749,1.936818,0.0,-0.420483,0.544050,0.029489,-0.591686,...,0.0,0.0,0.0,7.320620,0.0,7.320620,2.519023,2.537847,-2.143724,1.998703
1,2.421016,-2.407078,-0.302430,-0.085749,2.209718,0.0,-0.767916,0.984258,0.214907,0.536856,...,0.0,0.0,0.0,3.186978,0.0,3.186978,2.861480,2.421016,-1.407426,2.044244
2,1.638248,-2.884950,1.248158,-0.085749,3.366712,0.0,0.151419,1.967440,-2.189132,0.373271,...,0.0,0.0,0.0,2.153567,0.0,2.153567,2.359209,1.638248,-1.199004,0.827662
3,1.038516,-2.144681,-0.473977,-0.085749,1.896953,0.0,-0.459126,-0.108360,-1.395644,-1.257025,...,0.0,0.0,0.0,1.636862,0.0,1.636862,1.331836,1.038516,-1.208947,1.657149
4,1.591516,-2.481022,-0.766962,-0.085749,2.152108,0.0,-0.939128,0.266312,-0.209004,0.177763,...,0.0,0.0,0.0,3.186978,0.0,3.186978,1.948260,1.591516,-1.618793,2.044244
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
132,-0.266096,0.577826,0.130744,-0.085749,-0.753723,0.0,1.250395,0.569119,-1.113973,-1.179941,...,0.0,0.0,0.0,-0.429959,0.0,-0.429959,-0.334792,-0.266096,0.275780,-0.665415
133,0.146707,0.821527,-1.351670,-0.085749,-0.720373,0.0,-0.319365,0.509908,0.923525,-0.746799,...,0.0,0.0,0.0,-0.429959,0.0,-0.429959,-0.037995,0.146707,0.670161,-0.665415
134,-0.293357,0.259872,-1.250403,-0.085749,-0.421717,0.0,-0.114009,-1.094035,-0.328616,0.726827,...,0.0,0.0,0.0,-0.429959,0.0,-0.429959,-0.334792,-0.293357,0.322178,-0.665415
135,-0.569856,0.915314,2.856036,-0.085749,-0.957574,0.0,-0.454568,-0.945358,-0.614918,1.426485,...,0.0,0.0,0.0,-0.429959,0.0,-0.429959,-0.631589,-0.569856,-0.049004,-0.665415


In [25]:
gpt_features_scaled

Unnamed: 0,total_characters,ratio_alphabetic,ratio_uppercase,ratio_digit,ratio_whitespace,ratio_tabspace,a_frequency,b_frequency,c_frequency,d_frequency,...,"!{2,}_frequency",\.{3}_frequency,number_lines,number_sentences,number_paragraphs,sentences_per_paragraph,word_per_paragraph,character_per_paragraph,word_per_sentence,ratio_sentencestart_uppercase
0,-2.300641,3.588769,-1.690213,-1.505044,-1.080866,-3.423313,1.009045,-0.777722,-0.776379,4.347600,...,-0.615171,-0.726474,-0.420531,-1.276310,-0.244418,-1.102070,-1.532015,-1.443390,0.189019,0.016981
1,-2.371937,3.867174,-1.586585,-1.557759,-1.086958,-3.423313,1.273053,-2.088889,-0.072512,3.326807,...,-0.615171,-0.726474,-0.420531,-1.383305,-0.244418,-1.218635,-1.605638,-1.500535,1.230736,0.823698
2,-2.372356,3.576506,-1.735803,-1.557759,-0.899938,-3.423313,1.379058,-1.096225,0.705938,3.332185,...,-0.615171,-0.726474,-0.420531,-1.365473,-0.244418,-1.199208,-1.601431,-1.500871,0.894019,-0.655283
3,-2.364388,3.782512,-1.576787,-1.557759,-0.989821,-3.423313,-0.221008,-1.875153,-0.642413,4.738412,...,-0.615171,-0.726474,-0.420531,-1.418970,-0.244418,-1.257490,-1.597224,-1.494484,2.682825,-0.655283
4,-2.386196,3.443498,-1.726003,-1.557759,-0.943442,-3.423313,1.088378,-0.766340,-0.406108,3.685398,...,-0.615171,-0.726474,-0.420531,-1.311975,-0.244418,-1.140925,-1.609845,-1.511964,0.283721,-1.887767
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
245,-2.406746,4.448035,-2.250283,-1.557759,-1.313927,-3.423313,1.723779,-1.416760,-0.242465,2.731020,...,-0.615171,-0.726474,-0.420531,-1.454635,-0.244418,-1.296345,-1.639293,-1.528435,8.806854,3.042169
246,-2.426037,4.265779,-2.214060,-1.557759,-1.222449,-3.423313,1.087720,-0.245548,-0.492761,3.344958,...,-0.615171,-0.726474,-0.420531,-1.454635,-0.244418,-1.296345,-1.645604,-1.543897,8.775287,3.042169
247,-2.417230,4.311095,-2.179401,-1.557759,-1.322574,-3.423313,-0.216587,-0.833784,-0.197470,2.865185,...,-0.615171,-0.726474,-0.420531,-1.454635,-0.244418,-1.296345,-1.641397,-1.536838,8.806854,3.042169
248,-2.404229,4.499228,-2.250343,-1.557759,-1.382755,-3.423313,0.538793,-0.913124,-0.172029,3.758954,...,-0.615171,-0.726474,-0.420531,-1.454635,-0.244418,-1.296345,-1.641397,-1.526418,8.712153,3.042169


In [19]:
gpt_features_tfidf = combined_features.transform(df_gpt['text'])
gpt_features_combined = hstack([gpt_features_tfidf, csr_matrix(gpt_features_scaled)])

In [36]:
gpt_pred = model.predict(gpt_features_combined)
Counter(gpt_pred)

Counter({'M': 109, 'F': 141})

In [41]:
gemini_features_tfidf = combined_features.transform(df_gemini['text'])
gemini_features_combined = hstack([gemini_features_tfidf, csr_matrix(gemini_features_scaled)])

In [42]:
gemini_pred = model.predict(gemini_features_combined)
Counter(gemini_pred)

Counter({'F': 67, 'M': 70})