## Creation of the variables used in the classification

In [1]:
import pandas as pd

In [2]:
# Load the PAN_15 training CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists.
data_pan15_training = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\raw_data\PAN_15_training.csv')

In [3]:
# Load the PAN_15 test CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists.
data_pan15_test = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\raw_data\PAN_15_test.csv')

In [4]:
# Load the PAN_17 training CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists.
data_pan17_train = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\raw_data\PAN_17_training.csv')

In [5]:
# Load the PAN_17 test CSV into a DataFrame.
# NOTE(review): absolute, machine-specific path — this cell only runs where that directory exists.
data_pan17_test = pd.read_csv(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\data\raw_data\PAN_17_test.csv')

In [7]:
def combine_texts_per_author(data):
    """Collapse a frame with one row per text into one row per author.

    Rows are grouped on the 'author' column. Within each group the 'text'
    values are joined with a single space and the first 'gender' value is
    kept. The result has 'author' as a regular column again.
    """
    return data.groupby('author').agg({
        'text': ' '.join,
        'gender': 'first'
    }).reset_index()


# Same aggregation for both splits, so it lives in one helper.
combined_pan17_train = combine_texts_per_author(data_pan17_train)
combined_pan17_test = combine_texts_per_author(data_pan17_test)

In [13]:
# Display the per-author PAN_17 training frame (columns: author, text, gender).
combined_pan17_train

Unnamed: 0,author,text,gender
0,1003de26f870d27f79887272a1eb3612,One to watch … \nAvailable on 10th Feb. https:...,male
1,102cce280df9f6e0e78bfdd266f1abb5,Are we living in a holographic universe? New s...,female
2,10488b3700fa9d2db22961ab064e4d38,"Museum focus, but still great pieces of advice...",female
3,1064bd0b78f14bea5b851e2a995dd4e5,Best half time show EVER! @jannarden not the ...,female
4,106aa0abb81873d09028b01658c37611,Does this mean @WaitakereUnited are top of the...,male
...,...,...,...
3595,ffa8289a14683e00a607a2d9bb5d1367,@LauraAnthony7 @UCBerkeley eooks likes lynch m...,female
3596,ffbd53773c792831d5b6322b775faa3a,@groubes such a classy player! Was awesome to ...,male
3597,ffc349a1e4c9a3b37fd8798e82d703a2,Great weekend spent with family! Thoroughly en...,male
3598,fff01fe00dae086650e48f265468e483,@SavageFc602 @TellEmSteveDave he IS the disord...,female


## Creating feature functions

In [12]:
import re
from nltk.probability import FreqDist
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import statistics


# Character-based features
def character_count(text):
    """Return the total number of characters in ``text``."""
    total = len(text)
    return total

def alphabetic_ratio(text):
    """Return the share of characters in ``text`` that are alphabetic.

    Empty text yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not text:
        return 0.0
    alphabetic = sum(c.isalpha() for c in text)
    return alphabetic / len(text)

def uppercase_ratio(text):
    """Return the share of characters in ``text`` that are uppercase.

    Empty text yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not text:
        return 0.0
    upper = sum(c.isupper() for c in text)
    return upper / len(text)

def digit_ratio(text):
    """Return the share of characters in ``text`` that are digits.

    Empty text yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not text:
        return 0.0
    digit = sum(c.isdigit() for c in text)
    return digit / len(text)

def whitespace_ratio(text):
    """Return the share of characters in ``text`` that are whitespace.

    Empty text yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not text:
        return 0.0
    whitespace = sum(c.isspace() for c in text)
    return whitespace / len(text)

def tab_ratio(text):
    """Return the share of characters in ``text`` that are tab characters.

    Empty text yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not text:
        return 0.0
    tabs = text.count('\t')
    return tabs / len(text)

def letter_ratio(text, letter):
    """Return how often ``letter`` occurs in the lower-cased ``text``, per character.

    The text is lower-cased before counting, so ``letter`` should be given in
    lower case. Empty text yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    text = text.lower()
    if not text:
        return 0.0
    letter_count = text.count(letter)
    return letter_count / len(text)

def specialcharacter_ratio(text, character):
    """Return the number of occurrences of ``character`` in ``text``, per character.

    Empty text yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    if not text:
        return 0.0
    spec_count = text.count(character)
    return spec_count / len(text)

# Word-based features
def number_words(text):
    """Count the word tokens (maximal runs of word characters) in ``text``."""
    return sum(1 for _ in re.finditer(r'\b\w+\b', text))

def word_length(text):
    """Return the mean length of the word tokens in ``text`` (0 if there are none)."""
    tokens = re.findall(r'\b\w+\b', text)
    if not tokens:
        return 0
    return sum(map(len, tokens)) / len(tokens)

def vocabulary_richness(text):
    """Return distinct word tokens divided by all word tokens (0 if there are none).

    Tokens are compared case-sensitively.
    """
    tokens = re.findall(r'\b\w+\b', text)
    if not tokens:
        return 0
    return len(set(tokens)) / len(tokens)

def long_words(text):
    """Return the share of word tokens in ``text`` longer than six characters.

    Text without any word token yields 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    words = re.findall(r'\b\w+\b', text)
    if not words:
        return 0.0
    long_count = sum(1 for word in words if len(word) > 6)
    return long_count / len(words)

def short_words(text):
    """Return the share of word tokens in ``text`` with one to three characters.

    Text without any word token yields 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    words = re.findall(r'\b\w+\b', text)
    if not words:
        return 0.0
    short_count = sum(1 for word in words if 1 <= len(word) <= 3)
    return short_count / len(words)

def legomena(text):
    """Return the number of words occurring exactly once, divided by the token count.

    Tokens are taken from the lower-cased text. Text without any word token
    yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    if not words:
        return 0.0
    freq = FreqDist(words)
    hapax_count = sum(1 for count in freq.values() if count == 1)
    return hapax_count / len(words)

def dislegomena(text):
    """Return the number of words occurring exactly twice, divided by the token count.

    Tokens are taken from the lower-cased text. Text without any word token
    yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    if not words:
        return 0.0
    freq = FreqDist(words)
    dis_count = sum(1 for count in freq.values() if count == 2)
    return dis_count / len(words)

def yules_k(text):
    """Return Yule's K for the lower-cased word tokens of ``text``.

    Computed as ``10**4 * (sum(i**2 * V_i) - N) / N**2``, where ``N`` is the
    token count and ``V_i`` the number of distinct words occurring ``i`` times.
    Text without any word token yields 0.0 instead of raising
    ``ZeroDivisionError``.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    N = len(words)
    if N == 0:
        return 0.0
    freq = FreqDist(words)
    # Frequency spectrum: occurrence count -> number of words with that count.
    Vi = FreqDist(freq.values())
    K = 10**4 * ((-N + sum(i**2 * Vi[i] for i in Vi)) / N**2)
    return K

def simpson_d(text):
    """Return Simpson's D for the lower-cased word tokens of ``text``.

    D is the sum over distinct words of ``f * (f - 1) / (N * (N - 1))``, with
    ``f`` the word's count and ``N`` the token count. Fewer than two tokens
    gives 0.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    counts = FreqDist(tokens)
    total = len(tokens)
    if total < 2:
        return 0
    return sum(
        count * (count - 1) / (total * (total - 1))
        for count in counts.values()
    )

def sichel_s(text):
    """Return Sichel's S: words occurring exactly twice divided by the vocabulary size.

    Tokens are taken from the lower-cased text. Text without any word token
    yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    if not words:
        return 0.0
    freq = FreqDist(words)
    dis_count = sum(1 for count in freq.values() if count == 2)
    S = dis_count / len(freq)
    return S

def honores_r(text):
    """Return Honore's R for the lower-cased word tokens of ``text``.

    R is ``100 * log(N) / (1 - V1 / V)``, with ``N`` the token count, ``V``
    the vocabulary size and ``V1`` the number of words occurring once.
    Returns 0 when there are no tokens or when every word occurs exactly once,
    where the formula would divide by zero.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    if len(tokens) == 0:
        return 0
    counts = FreqDist(tokens)
    vocab_size = len(counts.values())
    hapax_count = sum(1 for word in counts if counts[word] == 1)
    hapax_share = hapax_count / vocab_size
    if hapax_share == 1:
        return 0
    return 100 * np.log(len(tokens)) / (1 - hapax_share)

def entropy(text):
    """Return the entropy (natural logarithm) of the lower-cased word distribution.

    Each distinct word contributes ``-(f / N) * log(f / N)``, with ``f`` its
    count and ``N`` the token count.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    counts = FreqDist(tokens)
    total = len(tokens)
    weighted_logs = (
        (count / total) * np.log(count / total)
        for count in counts.values()
    )
    return -sum(weighted_logs)

# Syntactic features
def punctuations_ratio(text, punctuation):
    """Return the number of regex matches of ``punctuation`` in ``text``, per character.

    ``punctuation`` is used as a regular-expression pattern, so literal
    metacharacters must be escaped by the caller. Empty text yields 0.0
    instead of raising ``ZeroDivisionError``.
    """
    if not text:
        return 0.0
    punctuation_list = re.findall(punctuation, text)
    return len(punctuation_list) / len(text)

# Structural features
def lines(text):
    """Return the number of newline-separated lines in ``text`` (at least 1)."""
    newline_count = text.count('\n')
    return newline_count + 1

def sentences(text):
    """Return the number of sentences NLTK's ``sent_tokenize`` finds in ``text``."""
    detected = sent_tokenize(text)
    return len(detected)

def paragraphs(text):
    """Count the non-blank chunks of ``text`` separated by an empty line."""
    return sum(1 for chunk in text.split('\n\n') if chunk.strip())

def sentence_paragraph(text):
    """Return the mean number of sentences per non-blank paragraph.

    Paragraphs are chunks separated by an empty line; sentences come from
    NLTK's ``sent_tokenize``. Text without any non-blank paragraph yields 0
    instead of raising ``statistics.StatisticsError``.
    """
    pars = [par for par in text.split('\n\n') if par.strip()]
    if not pars:
        return 0
    return statistics.mean([len(sent_tokenize(par)) for par in pars])

def words_paragraph(text):
    """Return the mean number of word tokens per non-blank paragraph.

    Paragraphs are chunks separated by an empty line. Text without any
    non-blank paragraph yields 0 instead of raising
    ``statistics.StatisticsError``.
    """
    pars = [par for par in text.split('\n\n') if par.strip()]
    if not pars:
        return 0
    return statistics.mean([len(re.findall(r'\b\w+\b', par)) for par in pars])

def chars_paragraph(text):
    """Return the mean number of characters per non-blank paragraph.

    Paragraphs are chunks separated by an empty line. Text without any
    non-blank paragraph yields 0 instead of raising
    ``statistics.StatisticsError``.
    """
    pars = [par for par in text.split('\n\n') if par.strip()]
    if not pars:
        return 0
    return statistics.mean([len(par) for par in pars])

def words_sentences(text):
    """Return the mean number of ``word_tokenize`` tokens per sentence.

    Sentences come from NLTK's ``sent_tokenize``. Blank text, or text in which
    no sentence is found, yields 0 instead of raising
    ``statistics.StatisticsError``.
    """
    # Nothing to tokenize: skip the tokenizer and avoid the mean of an empty list.
    if not text.strip():
        return 0
    sents = sent_tokenize(text)
    if not sents:
        return 0
    return statistics.mean([len(word_tokenize(sentence)) for sentence in sents])

def uppercase_start(text):
    """Return the share of sentences whose first character is uppercase.

    Sentences come from NLTK's ``sent_tokenize``. Blank text, or text in which
    no sentence is found, yields 0.0 instead of raising ``ZeroDivisionError``.
    """
    # Nothing to tokenize: skip the tokenizer and avoid dividing by zero sentences.
    if not text.strip():
        return 0.0
    sents = sent_tokenize(text)
    if not sents:
        return 0.0
    return sum(1 for sentence in sents if sentence[0].isupper()) / len(sents)


In [14]:
def extract_features(dataframe, text_column):
    """Build the stylometric feature table for the texts in ``dataframe``.

    Every feature function is applied to each value of
    ``dataframe[text_column]``. The result holds, in this order, the
    character-based, word-based, syntactic (punctuation) and structural
    feature columns, followed by a copy of ``dataframe['gender']``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Must contain ``text_column`` and a 'gender' column.
    text_column : str
        Name of the column holding the raw text.

    Returns
    -------
    pandas.DataFrame
        One column per feature plus 'gender'.
    """
    texts = dataframe[text_column]
    features = pd.DataFrame()

    # Character-based features
    character_features = [
        ('total_characters', character_count),
        ('ratio_alphabetic', alphabetic_ratio),
        ('ratio_uppercase', uppercase_ratio),
        ('ratio_digit', digit_ratio),
        ('ratio_whitespace', whitespace_ratio),
        ('ratio_tabspace', tab_ratio),
    ]
    for column, func in character_features:
        features[column] = texts.apply(func)
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features[letter+'_frequency'] = texts.apply(letter_ratio, args=(letter,))
    special_characters = [
        '~', '@', '#', '$', '%', '^', '&', '*', '-', '_', '=',
        '+', '>', '<', '[', ']', '{', '}', '/', '\\', '|',
    ]
    for character in special_characters:
        features[character+'_frequency'] = texts.apply(specialcharacter_ratio, args=(character,))

    # Word-based features.
    # honores_r is deliberately left out, as in the original feature set.
    # TODO: Brunet's W and a word-length frequency distribution are not implemented.
    word_features = [
        ('total_words', number_words),
        ('word_length', word_length),
        ('vocabulary_richness', vocabulary_richness),
        ('long_words', long_words),
        ('short_words', short_words),
        ('hapax_legomena', legomena),
        ('hapax_dislegomena', dislegomena),
        ('yules_k', yules_k),
        ('simpson_d', simpson_d),
        ('sichel_s', sichel_s),
        ('entropy', entropy),
    ]
    for column, func in word_features:
        features[column] = texts.apply(func)

    # Syntactic features: each entry is a regex pattern, and the pattern text
    # itself becomes part of the column name.
    punctuations = [r"’", r",", r"\.", r":", r";", r"\?", r"\?{2,}", r"!", r"!{2,}", r"\.{3}"]
    for punctuation in punctuations:
        features[punctuation+"_frequency"] = texts.apply(punctuations_ratio, args=(punctuation,))

    # Structural features
    structural_features = [
        ('number_lines', lines),
        ('number_sentences', sentences),
        ('number_paragraphs', paragraphs),
        ('sentences_per_paragraph', sentence_paragraph),
        ('word_per_paragraph', words_paragraph),
        ('character_per_paragraph', chars_paragraph),
        ('word_per_sentence', words_sentences),
        ('ratio_sentencestart_uppercase', uppercase_start),
    ]
    for column, func in structural_features:
        features[column] = texts.apply(func)

    # Carry the label along so features and target stay in one frame.
    features['gender'] = dataframe['gender']

    return features

In [48]:
# Stylometric features for every row of the PAN_15 training data.
pan15_train = extract_features(data_pan15_training, 'text')

In [49]:
# Stylometric features for every row of the PAN_15 test data.
pan15_test = extract_features(data_pan15_test, 'text')

In [113]:
# Stylometric features for every row of the (non-aggregated) PAN_17 training data.
pan17_train = extract_features(data_pan17_train, 'text')

In [115]:
# Stylometric features for every row of the (non-aggregated) PAN_17 test data.
pan17_test = extract_features(data_pan17_test, 'text')

In [15]:
# Stylometric features computed on the per-author concatenated PAN_17 training texts.
combined_pan17_train_features = extract_features(combined_pan17_train, 'text')

In [17]:
# Stylometric features computed on the per-author concatenated PAN_17 test texts.
combined_pan17_test_features = extract_features(combined_pan17_test, 'text')

In [18]:
# Display the per-author PAN_17 test feature table.
combined_pan17_test_features

Unnamed: 0,total_characters,ratio_alphabetic,ratio_uppercase,ratio_digit,ratio_whitespace,ratio_tabspace,a_frequency,b_frequency,c_frequency,d_frequency,...,\.{3}_frequency,number_lines,number_sentences,number_paragraphs,sentences_per_paragraph,word_per_paragraph,character_per_paragraph,word_per_sentence,ratio_sentencestart_uppercase,gender
0,9587,0.761135,0.071242,0.013038,0.158652,0.0,0.053927,0.013664,0.024512,0.021487,...,0.000730,4,83,1,83.00,1737.0,9587.0,22.650602,0.674699,female
1,9713,0.755379,0.082879,0.016782,0.155977,0.0,0.060743,0.013075,0.030989,0.021312,...,0.000309,2,145,1,145.00,1686.0,9713.0,13.862069,0.517241,female
2,9408,0.773278,0.068240,0.018282,0.144770,0.0,0.060587,0.012117,0.030293,0.022215,...,0.000744,4,121,2,61.00,753.5,4703.0,14.694215,0.479339,male
3,7483,0.755579,0.080583,0.013631,0.162903,0.0,0.058533,0.018709,0.023386,0.020847,...,0.000000,7,40,2,20.50,672.5,3740.5,36.525000,0.450000,female
4,10457,0.776991,0.058334,0.009754,0.145931,0.0,0.057665,0.013006,0.023047,0.023429,...,0.000096,12,91,1,91.00,1690.0,10457.0,21.483516,0.560440,female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2395,9873,0.771903,0.056518,0.012357,0.154664,0.0,0.054289,0.016915,0.023600,0.022384,...,0.000506,1,81,1,81.00,1724.0,9873.0,23.938272,0.580247,male
2396,6326,0.772210,0.051533,0.012646,0.160765,0.0,0.060070,0.008694,0.018179,0.039203,...,0.000790,2,43,1,43.00,1057.0,6326.0,28.627907,0.441860,female
2397,7751,0.758741,0.050187,0.013160,0.171204,0.0,0.056896,0.013676,0.024255,0.023481,...,0.000645,16,60,1,60.00,1431.0,7751.0,26.283333,0.650000,female
2398,7189,0.744749,0.059814,0.013632,0.177076,0.0,0.055223,0.014188,0.020865,0.025595,...,0.000000,6,34,1,34.00,1359.0,7189.0,43.676471,0.117647,female


In [50]:
# Write the PAN_15 test features to CSV without the row index.
# NOTE(review): the backslash in the relative path is only a directory separator on Windows.
pan15_test.to_csv(r'data\pan15_features_test.csv', index=False)

In [51]:
# Write the PAN_15 training features to CSV without the row index.
# NOTE(review): the backslash in the relative path is only a directory separator on Windows.
pan15_train.to_csv(r'data\pan15_features_training.csv', index=False)

In [119]:
# Write the PAN_17 test features to CSV without the row index.
# NOTE(review): the backslash in the relative path is only a directory separator on Windows.
pan17_test.to_csv(r'data\pan17_features_test.csv', index=False)

In [120]:
# Write the PAN_17 training features to CSV without the row index.
# NOTE(review): the backslash in the relative path is only a directory separator on Windows.
pan17_train.to_csv(r'data\pan17_features_training.csv', index=False)

## Create TF-IDF

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [121]:
# TF-IDF over word unigrams and bigrams, with the vocabulary capped at 1000 entries.
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))

In [94]:
# Fit the vectorizer on the PAN_15 training texts only, then reuse that fit for the test texts.
pan15_train_tfidf_array = tfidf.fit_transform(data_pan15_training['text'])
pan15_test_tfidf_array = tfidf.transform(data_pan15_test['text'])

In [123]:
# Use a separate vectorizer with the same settings for PAN_17. Refitting the
# shared `tfidf` here would replace its vocabulary, and a later cell reads
# `tfidf.get_feature_names_out()` to label the PAN_15 TF-IDF columns.
tfidf_pan17 = TfidfVectorizer(**tfidf.get_params())
# Fit on the training texts only, then reuse that fit for the test texts.
pan17_train_tfidf_array = tfidf_pan17.fit_transform(data_pan17_train['text'])
pan17_test_tfidf_array = tfidf_pan17.transform(data_pan17_test['text'])

In [124]:
# LSA pipeline: truncated SVD down to 100 components (seeded for reproducibility),
# followed by a Normalizer that works in place (copy=False).
svd = TruncatedSVD(n_components=100, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

In [96]:
# Fit the LSA pipeline on the PAN_15 training TF-IDF matrix and project the test matrix with it.
pan15_train_lsa = lsa.fit_transform(pan15_train_tfidf_array)
pan15_test_lsa = lsa.transform(pan15_test_tfidf_array)

In [125]:
# Refit the same `lsa` pipeline object on the PAN_17 training TF-IDF matrix and project the test matrix.
# NOTE(review): after this cell `lsa` is fitted on PAN_17, not PAN_15.
pan17_train_lsa = lsa.fit_transform(pan17_train_tfidf_array)
pan17_test_lsa = lsa.transform(pan17_test_tfidf_array)

In [78]:
# Turn the sparse PAN_15 TF-IDF matrices into dense DataFrames labelled by vocabulary term.
# NOTE(review): `tfidf` must still hold the PAN_15 vocabulary when this runs; if it has
# been refit on other data in between, the column labels will not match the matrices.
feature_names = tfidf.get_feature_names_out()
pan15_train_tfidf = pd.DataFrame(pan15_train_tfidf_array.toarray(), columns=feature_names)
pan15_test_tfidf = pd.DataFrame(pan15_test_tfidf_array.toarray(), columns=feature_names)

In [79]:
# Write the dense PAN_15 TF-IDF tables to CSV without the row index.
# NOTE(review): the backslash in the relative path is only a directory separator on Windows.
pan15_train_tfidf.to_csv(r'data\pan15_tfidf_train.csv', index=False)
pan15_test_tfidf.to_csv(r'data\pan15_tfidf_test.csv', index=False)

## Classifier 1: linear SVM on LSA or stylometric features

In [21]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [98]:
# Number of rows in the PAN_15 training data.
len(data_pan15_training)

14166

In [99]:
# Experiment input: PAN_15 LSA features with 'gender' as the target.
# NOTE(review): X_train/y_train/X_test/y_test are reassigned by the following cells;
# the cell executed last determines what the model is trained on.
X_train = pan15_train_lsa
y_train = data_pan15_training['gender']
X_test = pan15_test_lsa
y_test = data_pan15_test['gender']

In [126]:
# Experiment input: PAN_17 LSA features with 'gender' as the target.
# NOTE(review): these names are shared with the neighbouring selection cells;
# the cell executed last determines what the model is trained on.
X_train = pan17_train_lsa
y_train = data_pan17_train['gender']
X_test = pan17_test_lsa
y_test = data_pan17_test['gender']

In [19]:
# Experiment input: per-author PAN_17 stylometric features. The 'gender' column
# is the target and is dropped from the inputs.
# NOTE(review): these names are shared with the preceding selection cells;
# the cell executed last determines what the model is trained on.
X_train = combined_pan17_train_features.drop(columns=['gender'])
y_train = combined_pan17_train_features['gender']
X_test = combined_pan17_test_features.drop(columns=['gender'])
y_test = combined_pan17_test_features['gender']

In [41]:

# Linear SVM classifier. The seed is fixed (same value as the SVD step) so that
# repeated runs produce the same fitted model.
model = LinearSVC(random_state=42)

In [42]:
# Train on whichever X_train / y_train pair was selected last.
model.fit(X_train, y_train)



In [40]:
# Predict labels for the test inputs and report the fraction predicted correctly.
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.6933333333333334

## Classifier 2: character and word n-gram TF-IDF with a linear SVM

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [48]:
# Two TF-IDF views of the raw text: character 3- to 5-grams and word uni-/bigrams.
# Both use sublinear term-frequency scaling and min_df=2.
# The commented-out lines are the same vectorizers without sublinear_tf.
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), sublinear_tf=True, min_df=2)
#char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), min_df=2)
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), sublinear_tf=True, min_df=2)
#word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2)

# Concatenate the character and word feature matrices side by side.
combined_features = FeatureUnion([
    ('char', char_vectorizer),
    ('word', word_vectorizer)
])

# Vectorize, then classify with a linear SVM. The seed is fixed so that
# repeated runs produce the same fitted model.
pipeline = Pipeline([
    ('features', combined_features),
    ('svm', LinearSVC(random_state=42))
])

In [49]:
# Inputs for the n-gram pipeline: raw per-author PAN_17 texts with 'gender' as the target.
X_train = combined_pan17_train['text']
y_train = combined_pan17_train['gender']
X_test = combined_pan17_test['text']
y_test = combined_pan17_test['gender']

In [51]:
# Fit both vectorizers and the SVM on the training texts.
pipeline.fit(X_train, y_train)

In [52]:
# Predict labels for the test texts and report the fraction predicted correctly.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.8195833333333333

In [2]:
# Per-author aggregation: join each author's texts with a space and keep the
# first 'gender' value. This repeats the aggregation cell near the top of the notebook.
# NOTE(review): the recorded NameError shows this copy was executed while
# `data_pan17_train` was not defined in the kernel — presumably before the
# loading cells were run; confirm whether this cell is still needed.
combined_pan17_train = data_pan17_train.groupby('author').agg({
    'text': ' '.join,
    'gender': 'first'
}).reset_index()
combined_pan17_test = data_pan17_test.groupby('author').agg({
    'text': ' '.join,
    'gender': 'first'
}).reset_index()

NameError: name 'data_pan17_train' is not defined

In [50]:
# Display the per-author PAN_17 test frame (columns: author, text, gender).
combined_pan17_test

Unnamed: 0,author,text,gender
0,100c885443c4d2a32075e10cbca9a27e,Less than 2 weeks until Valentine's Day https:...,female
1,1017647e21e7e73d900e1dfacbf95a0f,"@drakefenton FWIW, you were wrong four hours ...",female
2,1023e1e534622e28d29dfdc0ee45cdac,I'm marching in CBR. Your big city ain't going...,male
3,10283d7f37d33b063e3734b740f5229d,More pics for BTS' comeback Cr. Naver https://...,female
4,1069f66c9d5862f860277d32780ac459,@griffski Eh? Great message to be sending out....,female
...,...,...,...
2395,ffa1dd4f39b1e32dcd1274327af39eac,"@aliceisms fake news By this point, surely my ...",male
2396,ffd6c6cfe9c484d3538fb9d9a628af74,@timkaine now that he is Sec of state how can ...,female
2397,ffd9b325eddf0c9374ad5eabeca6860a,.@acquia_support Is dev desktop supported on M...,female
2398,ffe4df0924e74cdc1d0ec6b980529ae7,anyway i'm super gay and @downtongaby is super...,female
