## Creation of the variables used in the classification

In [1]:
import pandas as pd

In [2]:
data_pan15_training = pd.read_csv('data\PAN_15_training.csv')

In [3]:
data_pan15_training

Unnamed: 0.1,Unnamed: 0,author,text,gender,age
0,0,02ae95de-7ee3-453a-978d-25d28b3f1a88,Things I want for my business cards but are to...,M,25-34
1,1,02ae95de-7ee3-453a-978d-25d28b3f1a88,"""painters produced their most highly valued wo...",M,25-34
2,2,02ae95de-7ee3-453a-978d-25d28b3f1a88,@username your new discussion layout is confus...,M,25-34
3,3,02ae95de-7ee3-453a-978d-25d28b3f1a88,I never really understood why game environment...,M,25-34
4,4,02ae95de-7ee3-453a-978d-25d28b3f1a88,"@username 20k and 2048² on a gun, fine. But th...",M,25-34
...,...,...,...,...,...
14161,14161,fde8eb00-0444-4159-9b65-1ead60c2dc88,Fifty Writing Tools: Quick List | Poynter. htt...,F,25-34
14162,14162,fde8eb00-0444-4159-9b65-1ead60c2dc88,Video: How To Make Vietnamese Coffee (by HighB...,F,25-34
14163,14163,fde8eb00-0444-4159-9b65-1ead60c2dc88,lyx is soooo awesome!!! finally figured out ho...,F,25-34
14164,14164,fde8eb00-0444-4159-9b65-1ead60c2dc88,Impact Algorithms: Strategies Remarkable Peopl...,F,25-34


## Creating feature functions

In [4]:
import re
from nltk.probability import FreqDist

# Character-based features
def character_count(text):
    return len(text)

def alphabetic_ratio(text):
    alphabetic = sum(c.isalpha() for c in text)
    return alphabetic/len(text)

def uppercase_ratio(text):
    upper = sum(c.isupper() for c in text)
    return upper/len(text)

def digit_ratio(text):
    digit = sum(c.isdigit() for c in text)
    return digit/len(text)

def whitespace_ratio(text):
    whitespace = sum(c.isspace() for c in text)
    return whitespace/len(text)

def tab_ratio(text):
    tabs = text.count('\t')
    return tabs/len(text)

def letter_ratio(text, letter):
    text = text.lower()
    letter_count = text.count(letter)
    return letter_count/len(text)

def specialcharacter_ratio(text, character):
    spec_count = text.count(character)
    return spec_count/len(text)

# Word-based features
def number_words(text):
    words = re.findall(r'\b\w+\b', text)
    return len(words)

def word_length(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0

    total_length = sum(len(word) for word in words)
    return total_length/num_words

def vocabulary_richness(text):
    words = re.findall(r'\b\w+\b', text)
    num_words = len(words)
    if num_words == 0:
        return 0
    
    num_uniq_words = len(set(words))
    return num_uniq_words/num_words

def long_words(text):
    words = re.findall(r'\b\w+\b', text)
    long_words_list = [word for word in words if len(word) > 6]
    return len(long_words_list)/len(words)

def short_words(text):
    words = re.findall(r'\b\w+\b', text)
    short_words_list = [word for word in words if 1 <= len(word) <= 3]
    return len(short_words_list)/len(words)


In [78]:
def extract_features(dataframe, text_column):
    features = pd.DataFrame()

    # Character-based features
    features['total_characters'] = dataframe[text_column].apply(character_count)
    features['ratio_alphabetic'] = dataframe[text_column].apply(alphabetic_ratio)
    features['ratio_uppercase'] = dataframe[text_column].apply(uppercase_ratio)
    features['ratio_digit'] = dataframe[text_column].apply(digit_ratio)
    features['ratio_whitespace'] = dataframe[text_column].apply(whitespace_ratio)
    features['ratio_tabspace'] = dataframe[text_column].apply(tab_ratio)
    letters = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    for letter in letters:
        features[letter+'_frequency'] = dataframe[text_column].apply(letter_ratio, args=(letter,))
    special_characters = ['~', '@', '#', '$', '%', '^', '&', '*', '-', '_', '=', '+', '>', '<', '[', ']', '{', '}', '/', '\\', '|']
    for character in special_characters:
        features[character+'_frequency'] = dataframe[text_column].apply(specialcharacter_ratio, args=(character,))
    
    # Word-based features
    features['total_words'] = dataframe[text_column].apply(number_words)
    features['word_length'] = dataframe[text_column].apply(word_length)
    features['vocabulary_richness'] = dataframe[text_column].apply(vocabulary_richness)
    features['long_words'] = dataframe[text_column].apply(long_words)
    features['short_words'] = dataframe[text_column].apply(short_words)

    return features

In [79]:
extract_features(data_pan15_training, 'text')

Unnamed: 0,total_characters,ratio_alphabetic,ratio_uppercase,ratio_digit,ratio_whitespace,ratio_tabspace,a_frequency,b_frequency,c_frequency,d_frequency,...,{_frequency,}_frequency,/_frequency,\_frequency,|_frequency,total_words,word_length,vocabulary_richness,long_words,short_words
0,140,0.750000,0.050000,0.021429,0.178571,0.014286,0.028571,0.014286,0.042857,0.028571,...,0.0,0.0,0.000000,0.0,0.000000,25,4.320000,1.000000,0.160000,0.440000
1,141,0.680851,0.042553,0.063830,0.163121,0.014184,0.028369,0.000000,0.014184,0.028369,...,0.0,0.0,0.021277,0.0,0.000000,26,4.038462,0.961538,0.115385,0.384615
2,142,0.781690,0.007042,0.021127,0.133803,0.014085,0.049296,0.007042,0.042254,0.028169,...,0.0,0.0,0.021127,0.0,0.000000,22,5.181818,0.954545,0.363636,0.363636
3,142,0.774648,0.028169,0.021127,0.147887,0.014085,0.042254,0.000000,0.042254,0.021127,...,0.0,0.0,0.021127,0.0,0.000000,23,4.913043,1.000000,0.260870,0.478261
4,123,0.699187,0.008130,0.056911,0.211382,0.016260,0.048780,0.016260,0.008130,0.008130,...,0.0,0.0,0.000000,0.0,0.000000,25,3.720000,0.960000,0.080000,0.560000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14161,73,0.712329,0.082192,0.054795,0.123288,0.027397,0.000000,0.027397,0.027397,0.000000,...,0.0,0.0,0.041096,0.0,0.013699,10,5.600000,1.000000,0.300000,0.100000
14162,78,0.730769,0.128205,0.038462,0.128205,0.025641,0.038462,0.025641,0.025641,0.012821,...,0.0,0.0,0.038462,0.0,0.000000,12,5.000000,1.000000,0.250000,0.416667
14163,68,0.735294,0.000000,0.000000,0.191176,0.029412,0.029412,0.000000,0.014706,0.014706,...,0.0,0.0,0.000000,0.0,0.000000,11,4.545455,1.000000,0.363636,0.545455
14164,124,0.790323,0.112903,0.024194,0.129032,0.016129,0.080645,0.024194,0.040323,0.008065,...,0.0,0.0,0.024194,0.0,0.000000,17,5.941176,0.941176,0.352941,0.294118


In [36]:
data_pan15_training.loc[500, 'text']

"@username @username not sure if there are notes for this particular presentation but anne's website has stuff: http://t.co/TmZwP9nMo1\t\t"

In [61]:
set((1,1,2))

{1, 2}

In [12]:
text = 'test + test'
print(c.isalpha() for c in text)

<generator object <genexpr> at 0x000002249EE0B040>
