## Creation of the variables used in the classification

In [1]:
import pandas as pd

In [2]:
data_pan15_training = pd.read_csv('data\raw_data\PAN_15_training.csv')

In [3]:
data_pan15_training

Unnamed: 0.1,Unnamed: 0,author,text,gender,age
0,0,02ae95de-7ee3-453a-978d-25d28b3f1a88,Things I want for my business cards but are to...,M,25-34
1,1,02ae95de-7ee3-453a-978d-25d28b3f1a88,"""painters produced their most highly valued wo...",M,25-34
2,2,02ae95de-7ee3-453a-978d-25d28b3f1a88,@username your new discussion layout is confus...,M,25-34
3,3,02ae95de-7ee3-453a-978d-25d28b3f1a88,I never really understood why game environment...,M,25-34
4,4,02ae95de-7ee3-453a-978d-25d28b3f1a88,"@username 20k and 2048² on a gun, fine. But th...",M,25-34
...,...,...,...,...,...
14161,14161,fde8eb00-0444-4159-9b65-1ead60c2dc88,Fifty Writing Tools: Quick List | Poynter. htt...,F,25-34
14162,14162,fde8eb00-0444-4159-9b65-1ead60c2dc88,Video: How To Make Vietnamese Coffee (by HighB...,F,25-34
14163,14163,fde8eb00-0444-4159-9b65-1ead60c2dc88,lyx is soooo awesome!!! finally figured out ho...,F,25-34
14164,14164,fde8eb00-0444-4159-9b65-1ead60c2dc88,Impact Algorithms: Strategies Remarkable Peopl...,F,25-34


## Creating feature functions

In [19]:
import re
from nltk.probability import FreqDist
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import statistics


# Character-based features
def character_count(text):
    return len(text)

def alphabetic_ratio(text):
    alphabetic = sum(c.isalpha() for c in text)
    return alphabetic/len(text)

def uppercase_ratio(text):
    upper = sum(c.isupper() for c in text)
    return upper/len(text)

def digit_ratio(text):
    digit = sum(c.isdigit() for c in text)
    return digit/len(text)

def whitespace_ratio(text):
    whitespace = sum(c.isspace() for c in text)
    return whitespace/len(text)

def tab_ratio(text):
    tabs = text.count('\t')
    return tabs/len(text)

def letter_ratio(text, letter):
    text = text.lower()
    letter_count = text.count(letter)
    return letter_count/len(text)

def specialcharacter_ratio(text, character):
    spec_count = text.count(character)
    return spec_count/len(text)

# Word-based features
def number_words(text):
    words = re.findall(r'\b\w+\b', text)
    return len(words)

def word_length(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0

    total_length = sum(len(word) for word in words)
    return total_length/num_words

def vocabulary_richness(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0
    
    num_uniq_words = len(set(words))
    return num_uniq_words/num_words

def long_words(text):
    words = re.findall(r'\b\w+\b', text)
    long_words_list = [word for word in words if len(word) > 6]
    return len(long_words_list)/len(words)

def short_words(text):
    words = re.findall(r'\b\w+\b', text)
    short_words_list = [word for word in words if 1 <= len(word) <= 3]
    return len(short_words_list)/len(words)

def legomena(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    legomena = [word for word in freq if freq[word] == 1]
    return len(legomena)/len(words)

def dislegomena(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    dislegomena = [word for word in freq if freq[word] == 2]
    return len(dislegomena)/len(words)

def yules_k(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    Vi = FreqDist(freq.values())
    K = 10**4 * ((-N + sum(i**2 * Vi[i] for i in Vi))/N**2)
    return K

def simpson_d(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    if N < 2:
        return 0
    D = sum(fr * (fr - 1) / (N * (N - 1)) for fr in freq.values())
    return D

def sichel_s(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    dislegomena = [word for word in freq if freq[word] == 2]
    S = len(dislegomena)/len(freq.values())
    return S

def honores_r(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    V = len(freq.values())
    legomena = [word for word in freq if freq[word] == 1]
    R = (100*np.log(N)/(1-(len(legomena)/V)))
    return R

def entropy(text):
    words = re.findall(r'\b\w+\b', text.lower())
    freq = FreqDist(words)
    N = len(words)
    E = -sum((fr / N) * np.log(fr/N) for fr in freq.values())
    return E

# Syntatic features
def punctuations_ratio(text, punctuation):
    punctuation_list = re.findall(punctuation, text)
    return len(punctuation_list)/len(text)

# Structural features
def lines(text):
    return len(text.split('\n'))

def sentences(text):
    return len(sent_tokenize(text))

def paragraphs(text):
    return len([par for par in text.split('\n\n') if par.strip()])

def sentence_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(sent_tokenize(par)) for par in pars])

def words_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(re.findall(r'\b\w+\b', par)) for par in pars])

def chars_paragraph(text):
    pars = [par for par in text.split('\n\n') if par.strip()]
    return statistics.mean([len(par) for par in pars])

def words_sentences(text):
    sents = sent_tokenize(text)
    return statistics.mean([len(word_tokenize(sentence)) for sentence in sents])

def uppercase_start(text):
    sents = sent_tokenize(text)
    return (sum(1 for sentence in sents if sentence[0].isupper()) / len(sents))


In [20]:
def extract_features(dataframe, text_column):
    features = pd.DataFrame()

    # Character-based features
    features['total_characters'] = dataframe[text_column].apply(character_count)
    features['ratio_alphabetic'] = dataframe[text_column].apply(alphabetic_ratio)
    features['ratio_uppercase'] = dataframe[text_column].apply(uppercase_ratio)
    features['ratio_digit'] = dataframe[text_column].apply(digit_ratio)
    features['ratio_whitespace'] = dataframe[text_column].apply(whitespace_ratio)
    features['ratio_tabspace'] = dataframe[text_column].apply(tab_ratio)
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    for letter in letters:
        features[letter+'_frequency'] = dataframe[text_column].apply(letter_ratio, args=(letter,))
    special_characters = ['~', '@', '#', '$', '%', '^', '&', '*', '-', '_', '=', '+', '>', '<', '[', ']', '{', '}', '/', '\\', '|']
    for character in special_characters:
        features[character+'_frequency'] = dataframe[text_column].apply(specialcharacter_ratio, args=(character,))
    
    # Word-based features
    features['total_words'] = dataframe[text_column].apply(number_words)
    features['word_length'] = dataframe[text_column].apply(word_length)
    features['vocabulary_richness'] = dataframe[text_column].apply(vocabulary_richness)
    features['long_words'] = dataframe[text_column].apply(long_words)
    features['short_words'] = dataframe[text_column].apply(short_words)
    features['hapax_legomena'] = dataframe[text_column].apply(legomena)
    features['hapax_dislegomena'] = dataframe[text_column].apply(dislegomena)
    features['yules_k'] = dataframe[text_column].apply(yules_k)
    features['simpson_d'] = dataframe[text_column].apply(simpson_d)
    features['sichel_s'] = dataframe[text_column].apply(sichel_s)
    features['honore_r'] = dataframe[text_column].apply(honores_r)
    features['entropy'] =  dataframe[text_column].apply(entropy)
    # Brunet W?
    # word length frequency distribution

    
    # Syntactic features
    punctuations = [r"’", r",", r"\.", r":", r";", r"\?", r"\?{2,}", r"!", r"!{2,}", r"\.{3}"]
    for punctuation in punctuations:
        features[punctuation+"_frequency"] = dataframe[text_column].apply(punctuations_ratio, args=(punctuation,))

    # Structural features
    features['number_lines'] = dataframe[text_column].apply(lines)
    features['number_sentences'] = dataframe[text_column].apply(sentences)
    features['number_paragraphs'] = dataframe[text_column].apply(paragraphs)
    features['sentences_per_paragraph'] = dataframe[text_column].apply(sentence_paragraph)
    features['word_per_paragraph'] = dataframe[text_column].apply(words_paragraph)
    features['character_per_paragraph'] = dataframe[text_column].apply(chars_paragraph)
    features['word_per_sentence'] = dataframe[text_column].apply(words_sentences)
    features['ratio_sentencestart_uppercase'] = dataframe[text_column].apply(uppercase_start)
    
    return features

In [21]:
extract_features(data_pan15_training, 'text')

Unnamed: 0,total_characters,ratio_alphabetic,ratio_uppercase,ratio_digit,ratio_whitespace,ratio_tabspace,a_frequency,b_frequency,c_frequency,d_frequency,...,"!{2,}_frequency",\.{3}_frequency,number_lines,number_sentences,number_paragraphs,sentences_per_paragraph,word_per_paragraph,character_per_paragraph,word_per_sentence,ratio_sentencestart_uppercase
0,140,0.750000,0.050000,0.021429,0.178571,0.014286,0.028571,0.014286,0.042857,0.028571,...,0.000000,0.000000,1,2,1,2.0,25.0,140.0,15.0,0.5
1,141,0.680851,0.042553,0.063830,0.163121,0.014184,0.028369,0.000000,0.014184,0.028369,...,0.000000,0.007092,4,2,2,1.5,13.0,69.5,14.0,0.5
2,142,0.781690,0.007042,0.021127,0.133803,0.014085,0.049296,0.007042,0.042254,0.028169,...,0.000000,0.000000,1,2,1,2.0,22.0,142.0,12.0,0.0
3,142,0.774648,0.028169,0.021127,0.147887,0.014085,0.042254,0.000000,0.042254,0.021127,...,0.000000,0.000000,1,2,1,2.0,23.0,142.0,12.5,0.5
4,123,0.699187,0.008130,0.056911,0.211382,0.016260,0.048780,0.016260,0.008130,0.008130,...,0.000000,0.000000,1,2,1,2.0,25.0,123.0,14.5,0.5
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14161,73,0.712329,0.082192,0.054795,0.123288,0.027397,0.000000,0.027397,0.027397,0.000000,...,0.000000,0.000000,1,2,1,2.0,10.0,73.0,6.0,0.5
14162,78,0.730769,0.128205,0.038462,0.128205,0.025641,0.038462,0.025641,0.025641,0.012821,...,0.000000,0.000000,1,1,1,1.0,12.0,78.0,14.0,1.0
14163,68,0.735294,0.000000,0.000000,0.191176,0.029412,0.029412,0.000000,0.014706,0.014706,...,0.014706,0.000000,1,2,1,2.0,11.0,68.0,8.0,0.0
14164,124,0.790323,0.112903,0.024194,0.129032,0.016129,0.080645,0.024194,0.040323,0.008065,...,0.000000,0.000000,1,1,1,1.0,17.0,124.0,18.0,1.0


In [17]:
sent_tokenize(data_pan15_training.loc[0, 'text'])

['Things I want for my business cards but are too expensive: 3 PMS colors.',
 'colored edges, soft touch finish, raised spot UV, cut 45°corners.']

In [114]:
import nltk
from nltk.tokenize import sent_tokenize

In [6]:
set((1,1,2,3, 2, 6, 9))

{1, 2, 3, 6, 9}

In [156]:
test = """This is the first line. This is the first sentence of the second line.
This is the second sentence of the second line.

This is the first line of the second paragraph.

This is the third paragraph. It has one line."""
sent = sent_tokenize(test)
print(sent)
statistics.mean([len(word_tokenize(sentence)) for sentence in sent])

['This is the first line.', 'This is the first sentence of the second line.', 'This is the second sentence of the second line.', 'This is the first line of the second paragraph.', 'This is the third paragraph.', 'It has one line.']


7.833333333333333

In [88]:
    [r"test"]

['test']

In [12]:
text = 'test + test'
print(c.isalpha() for c in text)

<generator object <genexpr> at 0x000002249EE0B040>
