# Answering research question 1

In [1]:
import pandas as pd
from collections import Counter

## Loading data

In [2]:
data_pan15_train = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\raw_data\PAN_15_training.csv')
data_pan15_test = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\raw_data\PAN_15_test.csv')

In [3]:
combined_pan15_train = data_pan15_train.groupby('author').agg({
    'text': ' '.join,
    'gender': 'first',
    'age': 'first'
}).reset_index()
combined_pan15_test = data_pan15_test.groupby('author').agg({
    'text': ' '.join,
    'gender': 'first',
    'age': 'first'
}).reset_index()

In [4]:
df_gpt = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\gpt1.csv')
df_gemini = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\gemini1.csv')
df_llama = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\llama1.csv')

## Variable creation

In [5]:
import re
from nltk.probability import FreqDist
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import statistics


# Character-based features
def character_count(text):
    return len(text)

def alphabetic_ratio(text):
    alphabetic = sum(c.isalpha() for c in text)
    return alphabetic/len(text)

def uppercase_ratio(text):
    upper = sum(c.isupper() for c in text)
    return upper/len(text)

def digit_ratio(text):
    digit = sum(c.isdigit() for c in text)
    return digit/len(text)

def whitespace_ratio(text):
    whitespace = sum(c.isspace() for c in text)
    return whitespace/len(text)

def tab_ratio(text):
    tabs = text.count('\t')
    return tabs/len(text)

def letter_ratio(text, letter):
    text = text.lower()
    letter_count = text.count(letter)
    return letter_count/len(text)

def specialcharacter_ratio(text, character):
    spec_count = text.count(character)
    return spec_count/len(text)

# Word-based features
def number_words(text):
    words = re.findall(r'\b\w+\b', text)
    return len(words)

def word_length(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0

    total_length = sum(len(word) for word in words)
    return total_length/num_words

def vocabulary_richness(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0
    
    num_uniq_words = len(set(words))
    return num_uniq_words/num_words

def long_words(text):
    words = re.findall(r'\b\w+\b', text)
    long_words_list = [word for word in words if len(word) > 6]
    return len(long_words_list)/len(words)

def short_words(text):
    words = re.findall(r'\b\w+\b', text)
    short_words_list = [word for word in words if 1 <= len(word) <= 3]
    return len(short_words_list)/len(words)

def legomena(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    legomena = [word for word in freq if freq[word] == 1]
    return len(legomena)/len(words)

def dislegomena(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    dislegomena = [word for word in freq if freq[word] == 2]
    return len(dislegomena)/len(words)

def yules_k(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    Vi = FreqDist(freq.values())
    K = 10**4 * ((-N + sum(i**2 * Vi[i] for i in Vi))/N**2)
    return K

def simpson_d(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    if N < 2:
        return 0
    D = sum(fr * (fr - 1) / (N * (N - 1)) for fr in freq.values())
    return D

def sichel_s(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    dislegomena = [word for word in freq if freq[word] == 2]
    S = len(dislegomena)/len(freq.values())
    return S

def honores_r(text):
    words = re.findall(r'\b\w+\b', text.lower())
    if not words:
        return 0
    freq = FreqDist(words)
    N = len(words)
    V = len(freq.values())
    legomena = [word for word in freq if freq[word] == 1]
    unique_count_ratio = len(legomena) / V if V > 0 else 0
    if unique_count_ratio == 1 or N == 0:
        return 0
    R = (100*np.log(N)/(1-(len(legomena)/V)))
    return R

def entropy(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    E = -sum((fr / N) * np.log(fr/N) for fr in freq.values())
    return E

# Syntatic features
def punctuations_ratio(text, punctuation):
    punctuation_list = re.findall(punctuation, text)
    return len(punctuation_list)/len(text)

# Structural features
def lines(text):
    return len(text.split('\n'))

def sentences(text):
    return len(sent_tokenize(text))

def paragraphs(text):
    return len([par for par in text.split('\n\n') if par.strip()])

def sentence_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(sent_tokenize(par)) for par in pars])

def words_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(re.findall(r'\b\w+\b', par)) for par in pars])

def chars_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(par) for par in pars])

def words_sentences(text):
    sents = sent_tokenize(text)
    return statistics.mean([len(word_tokenize(sentence)) for sentence in sents])

def uppercase_start(text):
    sents = sent_tokenize(text)
    return (sum(1 for sentence in sents if sentence[0].isupper()) / len(sents))

def extract_features(dataframe, text_column):
    features = pd.DataFrame()

    # Character-based features
    features['total_characters'] = dataframe[text_column].apply(character_count)
    features['ratio_alphabetic'] = dataframe[text_column].apply(alphabetic_ratio)
    features['ratio_uppercase'] = dataframe[text_column].apply(uppercase_ratio)
    features['ratio_digit'] = dataframe[text_column].apply(digit_ratio)
    features['ratio_whitespace'] = dataframe[text_column].apply(whitespace_ratio)
    features['ratio_tabspace'] = dataframe[text_column].apply(tab_ratio)
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    for letter in letters:
        features[letter+'_frequency'] = dataframe[text_column].apply(letter_ratio, args=(letter,))
    special_characters = ['~', '@', '#', '$', '%', '^', '&', '*', '-', '_', '=', '+', '>', '<', '[', ']', '{', '}', '/', '\\', '|']
    for character in special_characters:
        features[character+'_frequency'] = dataframe[text_column].apply(specialcharacter_ratio, args=(character,))
    
    # Word-based features
    features['total_words'] = dataframe[text_column].apply(number_words)
    features['word_length'] = dataframe[text_column].apply(word_length)
    features['vocabulary_richness'] = dataframe[text_column].apply(vocabulary_richness)
    features['long_words'] = dataframe[text_column].apply(long_words)
    features['short_words'] = dataframe[text_column].apply(short_words)
    features['hapax_legomena'] = dataframe[text_column].apply(legomena)
    features['hapax_dislegomena'] = dataframe[text_column].apply(dislegomena)
    features['yules_k'] = dataframe[text_column].apply(yules_k)
    features['simpson_d'] = dataframe[text_column].apply(simpson_d)
    features['sichel_s'] = dataframe[text_column].apply(sichel_s)
    #features['honore_r'] = dataframe[text_column].apply(honores_r)
    features['entropy'] =  dataframe[text_column].apply(entropy)
    # Brunet W?
    # word length frequency distribution

    
    # Syntactic features
    punctuations = [r"’", r",", r"\.", r":", r";", r"\?", r"\?{2,}", r"!", r"!{2,}", r"\.{3}"]
    for punctuation in punctuations:
        features[punctuation+"_frequency"] = dataframe[text_column].apply(punctuations_ratio, args=(punctuation,))

    # Structural features
    features['number_lines'] = dataframe[text_column].apply(lines)
    features['number_sentences'] = dataframe[text_column].apply(sentences)
    features['number_paragraphs'] = dataframe[text_column].apply(paragraphs)
    features['sentences_per_paragraph'] = dataframe[text_column].apply(sentence_paragraph)
    features['word_per_paragraph'] = dataframe[text_column].apply(words_paragraph)
    features['character_per_paragraph'] = dataframe[text_column].apply(chars_paragraph)
    features['word_per_sentence'] = dataframe[text_column].apply(words_sentences)
    features['ratio_sentencestart_uppercase'] = dataframe[text_column].apply(uppercase_start)
    #features['gender'] = dataframe['gender']
        
    return features


## Data transformation

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

In [7]:
combined_pan15_train_features = extract_features(combined_pan15_train, 'text')
combined_pan15_test_features = extract_features(combined_pan15_test, 'text')

In [8]:
scaler = StandardScaler()
combined_pan15_train_features_scaled = pd.DataFrame(scaler.fit_transform(combined_pan15_train_features), columns=combined_pan15_train_features.columns)
combined_pan15_test_features_scaled = pd.DataFrame(scaler.transform(combined_pan15_test_features), columns=combined_pan15_test_features.columns)

In [67]:
X_train = combined_pan15_train_features_scaled
X_test = combined_pan15_test_features_scaled

In [9]:
y_train = combined_pan15_train['gender']
y_test = combined_pan15_test['gender']

## Training model

In [10]:
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [16]:
model = LinearSVC(max_iter=10000, C= 0.01, class_weight= None, loss= 'squared_hinge', tol= 0.0001)
model.fit(X_train, y_train)

In [17]:
y_pred = model.predict(X_test)
accuracy_score(y_test, y_pred)

0.7535211267605634

## Training improved model

In [11]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from scipy.sparse import csr_matrix, hstack

In [12]:
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), sublinear_tf=True, min_df=2)
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), sublinear_tf=True, min_df=2)

combined_features = FeatureUnion([
    ('char', char_vectorizer),
    ('word', word_vectorizer)
])

In [13]:
tfidf_features_train = combined_features.fit_transform(combined_pan15_train['text'])
tfidf_features_test =  combined_features.transform(combined_pan15_test['text'])

In [14]:
X_train = hstack([tfidf_features_train, csr_matrix(combined_pan15_train_features_scaled)])
X_test = hstack([tfidf_features_test, csr_matrix(combined_pan15_test_features_scaled)])

In [15]:
model = LinearSVC(max_iter=10000, C= 0.01, class_weight= None, loss= 'squared_hinge', tol= 0.0001)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)
print("Accuracy:", round(accuracy_score(y_test, y_pred), 3))
print("Precision:", round(precision_score(y_test, y_pred, pos_label='F'), 3))
print("Recall:", round(recall_score(y_test, y_pred, pos_label='F'), 3))

Accuracy: 0.754
Precision: 0.743
Recall: 0.775


In [16]:
model_lr = LogisticRegression()
model_lr.fit(X_train, y_train)

# Make predictions
y_pred_lr = model_lr.predict(X_test)

# Print evaluation metrics
print("Logistic Regression:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_lr), 3))
print("Precision:", round(precision_score(y_test, y_pred_lr, pos_label='F'), 3))
print("Recall:", round(recall_score(y_test, y_pred_lr, pos_label='F'), 3))

Logistic Regression:
Accuracy: 0.754
Precision: 0.725
Recall: 0.817


In [17]:
model_rf = RandomForestClassifier()
model_rf.fit(X_train, y_train)

# Make predictions
y_pred_rf = model_rf.predict(X_test)

# Print evaluation metrics
print("Random Forest:")
print("Accuracy:", round(accuracy_score(y_test, y_pred_rf), 3))
print("Precision:", round(precision_score(y_test, y_pred_rf, pos_label='F'), 3))
print("Recall:", round(recall_score(y_test, y_pred_rf, pos_label='F'), 3))

Random Forest:
Accuracy: 0.732
Precision: 0.754
Recall: 0.69


### Predict age

In [50]:
y_train_age = combined_pan15_train['age']
y_test_age = combined_pan15_test['age']

In [69]:
model_age = LinearSVC(max_iter=10000, C= 0.01, class_weight= None, loss= 'squared_hinge', tol= 0.0001)
model_age.fit(X_train, y_train_age)
y_pred_age = model_age.predict(X_test)
print("Accuracy:", round(accuracy_score(y_test_age, y_pred_age), 3))
print("Precision:", round(precision_score(y_test_age, y_pred_age, average='weighted'), 3))
print("Recall:", round(recall_score(y_test_age, y_pred_age, average='weighted'), 3))

Accuracy: 0.697
Precision: 0.695
Recall: 0.697


In [62]:
model_lr_age = LogisticRegression()
model_lr_age.fit(X_train, y_train_age)

# Make predictions
y_pred_lr_age = model_lr_age.predict(X_test)

# Print evaluation metrics
print("Logistic Regression:")
print("Accuracy:", round(accuracy_score(y_test_age, y_pred_lr_age), 3))
print("Precision:", round(precision_score(y_test_age, y_pred_lr_age, average='weighted'), 3))
print("Recall:", round(recall_score(y_test_age, y_pred_lr_age, average='weighted'), 3))

Logistic Regression:
Accuracy: 0.655
Precision: 0.677
Recall: 0.655


In [70]:
model_rf_age = RandomForestClassifier()
model_rf_age.fit(X_train, y_train_age)

# Make predictions
y_pred_rf_age = model_rf_age.predict(X_test)

# Print evaluation metrics
print("Random Forest:")
print("Accuracy:", round(accuracy_score(y_test_age, y_pred_rf_age), 3))
print("Precision:", round(precision_score(y_test_age, y_pred_rf_age, average='weighted'), 3))
print("Recall:", round(recall_score(y_test_age, y_pred_rf_age, average='weighted'), 3))

Random Forest:
Accuracy: 0.669
Precision: 0.675
Recall: 0.669


  _warn_prf(average, modifier, msg_start, len(result))


In [54]:
Counter(y_pred)

Counter({'25-34': 66, '18-24': 52, '35-49': 15, '50-XX': 9})

## Predicting LLM data

In [18]:
gpt_features = extract_features(df_gpt, 'text')
gemini_features = extract_features(df_gemini, 'text')
llama_features = extract_features(df_llama, 'text')

In [19]:
gpt_features_scaled = pd.DataFrame(scaler.transform(gpt_features), columns=gpt_features.columns)
gemini_features_scaled = pd.DataFrame(scaler.transform(gemini_features), columns=gemini_features.columns)
llama_features_scaled = pd.DataFrame(scaler.transform(llama_features), columns=llama_features.columns)

In [20]:
gpt_features_tfidf = combined_features.transform(df_gpt['text'])
gpt_features_combined = hstack([gpt_features_tfidf, csr_matrix(gpt_features_scaled)])

In [None]:
gemini_features_tfidf = combined_features.transform(df_gemini['text'])
gemini_features_combined = hstack([gemini_features_tfidf, csr_matrix(gemini_features_scaled)])

In [None]:
llama_features_tfidf = combined_features.transform(df_llama['text'])
llama_features_combined = hstack([llama_features_tfidf, csr_matrix(llama_features_scaled)])

### Gender

In [21]:
gpt_pred = model.predict(gpt_features_combined)
Counter(gpt_pred)

Counter({'M': 109, 'F': 141})

In [23]:
gemini_pred = model.predict(gemini_features_combined)
Counter(gemini_pred)

Counter({'M': 131, 'F': 6})

In [28]:
llama_pred = model.predict(llama_features_combined)
Counter(llama_pred)

Counter({'F': 60, 'M': 129})

## Analysis

In [97]:
import numpy as np
from scipy.stats import chi2_contingency, chisquare

In [33]:
contingency_table = pd.DataFrame({
    "GPT" : pd.Series(gpt_pred).value_counts(),
    "Gemini" : pd.Series(gemini_pred).value_counts(),
    "Llama" : pd.Series(llama_pred).value_counts()
})

In [34]:
contingency_table

Unnamed: 0,GPT,Gemini,Llama
F,141,6,60
M,109,131,129


In [45]:
chi2, p, dof, expected = chi2_contingency(contingency_table.T)

In [46]:
print("\nChi-Square Test Results:")
print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies:")
print(pd.DataFrame(expected, index=contingency_table.columns, columns=contingency_table.index))


Chi-Square Test Results:
Chi-Square Statistic: 106.17356605188847
P-Value: 8.804466996342262e-24
Degrees of Freedom: 2
Expected Frequencies:
                F           M
GPT     89.843750  160.156250
Gemini  49.234375   87.765625
Llama   67.921875  121.078125


In [101]:
expected_frequencies = [len(gpt_pred)/2, len(gpt_pred)/2]
chi2_stat, p_val = chisquare(pd.Series(gpt_pred).value_counts(), expected_frequencies)

print(f"Chi-square statistic: {chi2_stat}")
print(f"P-value: {p_val}")

Chi-square statistic: 4.096
P-value: 0.042984795070858665


In [106]:
expected_frequencies = [len(gemini_pred)/2, len(gemini_pred)/2]
chi2_stat, p_val = chisquare(pd.Series(gemini_pred).value_counts(), expected_frequencies)

print(f"Chi-square statistic: {chi2_stat}")
print(f"P-value: {p_val}")

Chi-square statistic: 114.05109489051095
P-value: 1.2699362592974781e-26


In [107]:
expected_frequencies = [len(llama_pred)/2, len(llama_pred)/2]
chi2_stat, p_val = chisquare(pd.Series(llama_pred).value_counts(), expected_frequencies)

print(f"Chi-square statistic: {chi2_stat}")
print(f"P-value: {p_val}")

Chi-square statistic: 25.19047619047619
P-value: 5.193804760844568e-07


### Age

In [79]:
gpt_pred_age = model_age.predict(gpt_features_combined)
Counter(gpt_pred_age)

Counter({'25-34': 250})

In [77]:
gemini_pred_age = model_age.predict(gemini_features_combined)
Counter(gemini_pred_age)

Counter({'25-34': 115, '35-49': 22})

In [78]:
llama_pred_age = model_age.predict(llama_features_combined)
Counter(llama_pred_age)

Counter({'18-24': 21, '25-34': 147, '35-49': 21})

In [115]:
contingency_table_age = pd.DataFrame({
    "GPT" : pd.Series(gpt_pred_age).value_counts(),
    "Gemini" : pd.Series(gemini_pred_age).value_counts(),
    "Llama" : pd.Series(llama_pred_age).value_counts()
}).fillna(0).astype(int)

In [125]:
contingency_table_age.loc['50+'] = [0,0,0]

In [129]:
contingency_table_age

Unnamed: 0,GPT,Gemini,Llama
18-24,0,0,21
25-34,250,115,147
35-49,0,22,21
50+,0,0,0


In [117]:
chi2, p, dof, expected = chi2_contingency(contingency_table_age)
print("\nChi-Square Test Results:")
print(f"Chi-Square Statistic: {chi2}")
print(f"P-Value: {p}")
print(f"Degrees of Freedom: {dof}")
print("Expected Frequencies:")
print(pd.DataFrame(expected))


Chi-Square Test Results:
Chi-Square Statistic: 85.05398064844678
P-Value: 1.4774793140887388e-17
Degrees of Freedom: 4
Expected Frequencies:
            0           1           2
0    9.114583    4.994792    6.890625
1  222.222222  121.777778  168.000000
2   18.663194   10.227431   14.109375


In [121]:
pd.Series(gpt_pred_age).value_counts()

25-34    250
dtype: int64

In [128]:
expected_frequencies = [len(gpt_pred_age)/4, len(gpt_pred_age)/4, len(gpt_pred_age)/4, len(gpt_pred_age)/4]
chi2_stat, p_val = chisquare(contingency_table_age['GPT'], expected_frequencies)

print(f"Chi-square statistic: {chi2_stat}")
print(f"P-value: {p_val}")

Chi-square statistic: 750.0
P-value: 3.017295778113645e-162


In [130]:
expected_frequencies = [len(gemini_pred_age)/4, len(gemini_pred_age)/4, len(gemini_pred_age)/4, len(gemini_pred_age)/4]
chi2_stat, p_val = chisquare(contingency_table_age['Gemini'], expected_frequencies)

print(f"Chi-square statistic: {chi2_stat}")
print(f"P-value: {p_val}")

Chi-square statistic: 263.26277372262774
P-value: 8.850956215110382e-57


In [131]:
expected_frequencies = [len(llama_pred_age)/4, len(llama_pred_age)/4, len(llama_pred_age)/4, len(llama_pred_age)/4]
chi2_stat, p_val = chisquare(contingency_table_age['Llama'], expected_frequencies)

print(f"Chi-square statistic: {chi2_stat}")
print(f"P-value: {p_val}")

Chi-square statistic: 287.0
P-value: 6.473337456540893e-62
