## Creation of the variables used in the classification

In [1]:
import pandas as pd

In [2]:
data_pan15_training = pd.read_csv('data\PAN_15_training.csv')

In [3]:
data_pan15_training

Unnamed: 0.1,Unnamed: 0,author,text,gender,age
0,0,02ae95de-7ee3-453a-978d-25d28b3f1a88,Things I want for my business cards but are to...,M,25-34
1,1,02ae95de-7ee3-453a-978d-25d28b3f1a88,"""painters produced their most highly valued wo...",M,25-34
2,2,02ae95de-7ee3-453a-978d-25d28b3f1a88,@username your new discussion layout is confus...,M,25-34
3,3,02ae95de-7ee3-453a-978d-25d28b3f1a88,I never really understood why game environment...,M,25-34
4,4,02ae95de-7ee3-453a-978d-25d28b3f1a88,"@username 20k and 2048² on a gun, fine. But th...",M,25-34
...,...,...,...,...,...
14161,14161,fde8eb00-0444-4159-9b65-1ead60c2dc88,Fifty Writing Tools: Quick List | Poynter. htt...,F,25-34
14162,14162,fde8eb00-0444-4159-9b65-1ead60c2dc88,Video: How To Make Vietnamese Coffee (by HighB...,F,25-34
14163,14163,fde8eb00-0444-4159-9b65-1ead60c2dc88,lyx is soooo awesome!!! finally figured out ho...,F,25-34
14164,14164,fde8eb00-0444-4159-9b65-1ead60c2dc88,Impact Algorithms: Strategies Remarkable Peopl...,F,25-34


## Creating feature functions

In [90]:
import re
from nltk.probability import FreqDist
import numpy as np


# Character-based features
def character_count(text):
    return len(text)

def alphabetic_ratio(text):
    alphabetic = sum(c.isalpha() for c in text)
    return alphabetic/len(text)

def uppercase_ratio(text):
    upper = sum(c.isupper() for c in text)
    return upper/len(text)

def digit_ratio(text):
    digit = sum(c.isdigit() for c in text)
    return digit/len(text)

def whitespace_ratio(text):
    whitespace = sum(c.isspace() for c in text)
    return whitespace/len(text)

def tab_ratio(text):
    tabs = text.count('\t')
    return tabs/len(text)

def letter_ratio(text, letter):
    text = text.lower()
    letter_count = text.count(letter)
    return letter_count/len(text)

def specialcharacter_ratio(text, character):
    spec_count = text.count(character)
    return spec_count/len(text)

# Word-based features
def number_words(text):
    words = re.findall(r'\b\w+\b', text)
    return len(words)

def word_length(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0

    total_length = sum(len(word) for word in words)
    return total_length/num_words

def vocabulary_richness(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0
    
    num_uniq_words = len(set(words))
    return num_uniq_words/num_words

def long_words(text):
    words = re.findall(r'\b\w+\b', text)
    long_words_list = [word for word in words if len(word) > 6]
    return len(long_words_list)/len(words)

def short_words(text):
    words = re.findall(r'\b\w+\b', text)
    short_words_list = [word for word in words if 1 <= len(word) <= 3]
    return len(short_words_list)/len(words)

def legomena(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    legomena = [word for word in freq if freq[word] == 1]
    return len(legomena)/len(words)

def dislegomena(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    dislegomena = [word for word in freq if freq[word] == 2]
    return len(dislegomena)/len(words)

def yules_k(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    Vi = FreqDist(freq.values())
    K = 10**4 * ((-N + sum(i**2 * Vi[i] for i in Vi))/N**2)
    return K

def simpson_d(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    if N < 2:
        return 0
    D = sum(fr * (fr - 1) / (N * (N - 1)) for fr in freq.values())
    return D

def sichel_s(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    dislegomena = [word for word in freq if freq[word] == 2]
    S = len(dislegomena)/len(freq.values())
    return S

def honores_r(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    V = len(freq.values())
    legomena = [word for word in freq if freq[word] == 1]
    R = (100*np.log(N)/(1-(len(legomena)/V)))
    return R

def entropy(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    E = -sum((fr / N) * np.log(fr/N) for fr in freq.values())
    return E

# Syntatic features
def punctuations_ratio(text, punctuation):
    punctuation_list = re.findall(punctuation, text)
    return len(punctuation_list)/len(text)

# Structural features


    

In [108]:
def extract_features(dataframe, text_column):
    features = pd.DataFrame()

    # Character-based features
    features['total_characters'] = dataframe[text_column].apply(character_count)
    features['ratio_alphabetic'] = dataframe[text_column].apply(alphabetic_ratio)
    features['ratio_uppercase'] = dataframe[text_column].apply(uppercase_ratio)
    features['ratio_digit'] = dataframe[text_column].apply(digit_ratio)
    features['ratio_whitespace'] = dataframe[text_column].apply(whitespace_ratio)
    features['ratio_tabspace'] = dataframe[text_column].apply(tab_ratio)
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    for letter in letters:
        features[letter+'_frequency'] = dataframe[text_column].apply(letter_ratio, args=(letter,))
    special_characters = ['~', '@', '#', '$', '%', '^', '&', '*', '-', '_', '=', '+', '>', '<', '[', ']', '{', '}', '/', '\\', '|']
    for character in special_characters:
        features[character+'_frequency'] = dataframe[text_column].apply(specialcharacter_ratio, args=(character,))
    
    # Word-based features
    features['total_words'] = dataframe[text_column].apply(number_words)
    features['word_length'] = dataframe[text_column].apply(word_length)
    features['vocabulary_richness'] = dataframe[text_column].apply(vocabulary_richness)
    features['long_words'] = dataframe[text_column].apply(long_words)
    features['short_words'] = dataframe[text_column].apply(short_words)
    features['hapax_legomena'] = dataframe[text_column].apply(legomena)
    features['hapax_dislegomena'] = dataframe[text_column].apply(dislegomena)
    features['yules_k'] = dataframe[text_column].apply(yules_k)
    features['simpson_d'] = dataframe[text_column].apply(simpson_d)
    features['sichel_s'] = dataframe[text_column].apply(sichel_s)
    features['honore_r'] = dataframe[text_column].apply(honores_r)
    features['entropy'] =  dataframe[text_column].apply(entropy)
    # Brunet W?
    # word length frequency distribution

    
    # Syntactic features
    punctuations = [r"’", r",", r"\.", r":", r";", r"\?", r"\?{2,}", r"!", r"!{2,}", r"\.{3}"]
    for punctuation in punctuations:
        features[punctuation+"_frequency"] = dataframe[text_column].apply(punctuations_ratio, args=(punctuation,))

    


    return features

In [112]:
extract_features(data_pan15_training, 'text').describe()

  diff_b_a = subtract(b, a)


Unnamed: 0,total_characters,ratio_alphabetic,ratio_uppercase,ratio_digit,ratio_whitespace,ratio_tabspace,a_frequency,b_frequency,c_frequency,d_frequency,...,’_frequency,",_frequency",\._frequency,:_frequency,;_frequency,\?_frequency,"\?{2,}_frequency",!_frequency,"!{2,}_frequency",\.{3}_frequency
count,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,...,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0,14166.0
mean,79.278413,0.7258,0.078179,0.015046,0.169612,0.034335,0.057319,0.011804,0.023637,0.020879,...,0.000212,0.003146,0.015857,0.009037,0.000135,0.012518,0.001805,0.004123,0.000696,0.001237
std,36.187452,0.062443,0.09735,0.024739,0.040156,0.026361,0.028882,0.014406,0.020108,0.019016,...,0.00173,0.007327,0.021167,0.010958,0.001596,0.044127,0.006643,0.013953,0.004016,0.005494
min,3.0,0.161972,0.0,0.0,0.021739,0.010989,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,49.0,0.7,0.025316,0.0,0.140187,0.018349,0.038961,0.0,0.00813,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,77.0,0.736842,0.054945,0.0,0.165414,0.025974,0.055556,0.008547,0.021739,0.018519,...,0.0,0.0,0.011628,0.007299,0.0,0.0,0.0,0.0,0.0,0.0
75%,109.0,0.765152,0.102941,0.023256,0.196429,0.040816,0.073684,0.019231,0.035398,0.03125,...,0.0,0.0,0.020833,0.015385,0.0,0.0,0.0,0.0,0.0,0.0
max,182.0,0.965517,0.965517,0.352941,0.666667,0.666667,0.380952,0.181818,0.1875,0.227273,...,0.04918,0.090909,0.634921,0.111111,0.052632,0.681818,0.130435,0.323529,0.083333,0.206349


In [100]:
data_pan15_training.loc[9333, 'text']

'Wolf???? vs Cougar????\t\t'

In [6]:
set((1,1,2,3, 2, 6, 9))

{1, 2, 3, 6, 9}

In [105]:
test = "Hello, world!!! What’s going on??? This is an example..... Isn’t it great?!!!!"
re.findall(r"\.{3}", test)

['...']

In [88]:
    [r"test"]

['test']

In [12]:
text = 'test + test'
print(c.isalpha() for c in text)

<generator object <genexpr> at 0x000002249EE0B040>
