## Creation of the variables used in the classification

In [1]:
import pandas as pd

In [2]:
# Load the raw PAN15 training split. Forward slashes keep the relative path
# valid on Windows, Linux and macOS; the backslash form only resolves on Windows.
data_pan15_training = pd.read_csv('data/raw_data/PAN_15_training.csv')

In [3]:
# Load the raw PAN15 test split. Forward slashes keep the relative path
# valid on Windows, Linux and macOS; the backslash form only resolves on Windows.
data_pan15_test = pd.read_csv('data/raw_data/PAN_15_test.csv')

In [17]:
# Load the raw PAN17 training split. Forward slashes keep the relative path
# valid on Windows, Linux and macOS; the backslash form only resolves on Windows.
data_pan17_train = pd.read_csv('data/raw_data/PAN_17_training.csv')

In [18]:
# Load the raw PAN17 test split. Forward slashes keep the relative path
# valid on Windows, Linux and macOS; the backslash form only resolves on Windows.
data_pan17_test = pd.read_csv('data/raw_data/PAN_17_test.csv')

In [4]:
# Show the PAN17 test frame as the cell output (one row per text, with author,
# gender and country columns).
data_pan17_test

Unnamed: 0.1,Unnamed: 0,author,text,gender,country
0,0,100c885443c4d2a32075e10cbca9a27e,Less than 2 weeks until Valentine's Day https:...,female,australia
1,1,100c885443c4d2a32075e10cbca9a27e,"Omg now I remember, that photo was from when I...",female,australia
2,2,100c885443c4d2a32075e10cbca9a27e,when you eat an entire bag of popcorn and fami...,female,australia
3,3,100c885443c4d2a32075e10cbca9a27e,@tartecosmetics my fav shade has gone 😭😭😭😭 htt...,female,australia
4,4,100c885443c4d2a32075e10cbca9a27e,OMFG this is happening to me rn!!!!!!!! https:...,female,australia
...,...,...,...,...,...
239995,239995,fff5a17288a8ab173e493c90bf4b39a4,"Saw a seal in the wild, can you tell I'm happy...",male,australia
239996,239996,fff5a17288a8ab173e493c90bf4b39a4,Note: everyone is up getting ready and he has ...,male,australia
239997,239997,fff5a17288a8ab173e493c90bf4b39a4,@dorkfaceblog Just happy to be here 😂✌🏻#thegir...,male,australia
239998,239998,fff5a17288a8ab173e493c90bf4b39a4,@Step2Adulthood @dorkfaceblog It was but thank...,male,australia


## Creating feature functions

In [4]:
import re
from nltk.probability import FreqDist
import numpy as np
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
import statistics


# Character-based features
def character_count(text):
    """Return the total number of characters in ``text``."""
    total = len(text)
    return total

def alphabetic_ratio(text):
    """Return the share of characters in ``text`` that are alphabetic.

    Args:
        text: String to analyse.

    Returns:
        Count of characters for which ``str.isalpha`` is true divided by
        ``len(text)``; ``0.0`` for an empty string.
    """
    # An empty text has nothing to divide by; report 0 (as word_length does
    # for texts without words) instead of raising ZeroDivisionError.
    if not text:
        return 0.0
    alphabetic = sum(c.isalpha() for c in text)
    return alphabetic / len(text)

def uppercase_ratio(text):
    """Return the share of characters in ``text`` that are uppercase.

    Args:
        text: String to analyse.

    Returns:
        Count of characters for which ``str.isupper`` is true divided by
        ``len(text)``; ``0.0`` for an empty string.
    """
    # Guard the division: an empty text would raise ZeroDivisionError.
    if not text:
        return 0.0
    upper = sum(c.isupper() for c in text)
    return upper / len(text)

def digit_ratio(text):
    """Return the share of characters in ``text`` that are digits.

    Args:
        text: String to analyse.

    Returns:
        Count of characters for which ``str.isdigit`` is true divided by
        ``len(text)``; ``0.0`` for an empty string.
    """
    # Guard the division: an empty text would raise ZeroDivisionError.
    if not text:
        return 0.0
    digit = sum(c.isdigit() for c in text)
    return digit / len(text)

def whitespace_ratio(text):
    """Return the share of characters in ``text`` that are whitespace.

    Args:
        text: String to analyse.

    Returns:
        Count of characters for which ``str.isspace`` is true divided by
        ``len(text)``; ``0.0`` for an empty string.
    """
    # Guard the division: an empty text would raise ZeroDivisionError.
    if not text:
        return 0.0
    whitespace = sum(c.isspace() for c in text)
    return whitespace / len(text)

def tab_ratio(text):
    """Return the share of characters in ``text`` that are tab characters.

    Args:
        text: String to analyse.

    Returns:
        Number of ``'\\t'`` characters divided by ``len(text)``; ``0.0`` for
        an empty string.
    """
    # Guard the division: an empty text would raise ZeroDivisionError.
    if not text:
        return 0.0
    tabs = text.count('\t')
    return tabs / len(text)

def letter_ratio(text, letter):
    """Return how often ``letter`` occurs in ``text``, ignoring case of the text.

    Args:
        text: String to analyse; it is lower-cased before counting.
        letter: Substring to count (callers pass a single lower-case letter).

    Returns:
        Occurrences of ``letter`` in the lower-cased text divided by the
        length of the lower-cased text; ``0.0`` for an empty string.
    """
    # Guard the division: an empty text would raise ZeroDivisionError.
    if not text:
        return 0.0
    text = text.lower()
    letter_count = text.count(letter)
    return letter_count / len(text)

def specialcharacter_ratio(text, character):
    """Return how often ``character`` occurs in ``text`` relative to its length.

    Args:
        text: String to analyse.
        character: Literal substring to count (no regex interpretation).

    Returns:
        ``text.count(character)`` divided by ``len(text)``; ``0.0`` for an
        empty string.
    """
    # Guard the division: an empty text would raise ZeroDivisionError.
    if not text:
        return 0.0
    spec_count = text.count(character)
    return spec_count / len(text)

# Word-based features
def number_words(text):
    """Count the words in ``text``, a word being a maximal run of word characters."""
    return sum(1 for _ in re.finditer(r'\b\w+\b', text))

def word_length(text):
    """Return the mean number of characters per word in ``text`` (0 if no words)."""
    tokens = re.findall(r'\b\w+\b', text)
    if not tokens:
        return 0
    return sum(map(len, tokens)) / len(tokens)

def vocabulary_richness(text):
    """Return distinct words divided by total words (case-sensitive; 0 if no words)."""
    tokens = re.findall(r'\b\w+\b', text)
    if not tokens:
        return 0
    distinct = {*tokens}
    return len(distinct) / len(tokens)

def long_words(text):
    """Return the share of words in ``text`` that are longer than 6 characters.

    Args:
        text: String to analyse.

    Returns:
        Number of words with more than 6 characters divided by the number of
        words; ``0.0`` when the text contains no words.
    """
    words = re.findall(r'\b\w+\b', text)
    # Texts without any word (empty, punctuation only, ...) would otherwise
    # raise ZeroDivisionError.
    if not words:
        return 0.0
    long_count = sum(1 for word in words if len(word) > 6)
    return long_count / len(words)

def short_words(text):
    """Return the share of words in ``text`` that have at most 3 characters.

    Args:
        text: String to analyse.

    Returns:
        Number of words of length 1 to 3 divided by the number of words;
        ``0.0`` when the text contains no words.
    """
    words = re.findall(r'\b\w+\b', text)
    # Texts without any word would otherwise raise ZeroDivisionError.
    if not words:
        return 0.0
    # Every match has at least one character, so only the upper bound matters.
    short_count = sum(1 for word in words if len(word) <= 3)
    return short_count / len(words)

def legomena(text):
    """Return the share of hapax legomena in ``text``.

    Args:
        text: String to analyse; it is lower-cased before tokenising.

    Returns:
        Number of distinct words occurring exactly once divided by the total
        number of words; ``0.0`` when the text contains no words.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    # Texts without any word would otherwise raise ZeroDivisionError.
    if not words:
        return 0.0
    freq = FreqDist(words)
    hapax_count = sum(1 for count in freq.values() if count == 1)
    return hapax_count / len(words)

def dislegomena(text):
    """Return the share of hapax dislegomena in ``text``.

    Args:
        text: String to analyse; it is lower-cased before tokenising.

    Returns:
        Number of distinct words occurring exactly twice divided by the total
        number of words; ``0.0`` when the text contains no words.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    # Texts without any word would otherwise raise ZeroDivisionError.
    if not words:
        return 0.0
    freq = FreqDist(words)
    twice_count = sum(1 for count in freq.values() if count == 2)
    return twice_count / len(words)

def yules_k(text):
    """Return Yule's K for the lower-cased words of ``text``.

    Computed as ``10**4 * (sum(i**2 * V_i) - N) / N**2`` where ``N`` is the
    number of words and ``V_i`` the number of distinct words occurring
    exactly ``i`` times.

    Args:
        text: String to analyse.

    Returns:
        The K value; ``0.0`` when the text contains no words.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    N = len(words)
    # N**2 is the divisor below; a text without words would raise
    # ZeroDivisionError.
    if N == 0:
        return 0.0
    freq = FreqDist(words)
    # Frequency spectrum: maps an occurrence count i to the number of words
    # that occur i times.
    Vi = FreqDist(freq.values())
    K = 10**4 * ((-N + sum(i**2 * Vi[i] for i in Vi)) / N**2)
    return K

def simpson_d(text):
    """Return Simpson's D for the lower-cased words of ``text``.

    This is the sum over distinct words of ``c * (c - 1) / (N * (N - 1))``,
    with ``c`` the word's count and ``N`` the total number of words. Texts
    with fewer than two words give 0.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    counts = FreqDist(tokens)
    total = len(tokens)
    if total < 2:
        return 0
    return sum(c * (c - 1) / (total * (total - 1)) for c in counts.values())

def sichel_s(text):
    """Return Sichel's S for the lower-cased words of ``text``.

    Args:
        text: String to analyse.

    Returns:
        Number of distinct words occurring exactly twice divided by the
        number of distinct words; ``0.0`` when the text contains no words.
    """
    words = re.findall(r'\b\w+\b', text.lower())
    # No words means an empty vocabulary, which would make the division
    # below raise ZeroDivisionError.
    if not words:
        return 0.0
    freq = FreqDist(words)
    twice_count = sum(1 for count in freq.values() if count == 2)
    S = twice_count / len(freq.values())
    return S

def honores_r(text):
    """Return Honore's R for the lower-cased words of ``text``.

    Computed as ``100 * log(N) / (1 - V1 / V)`` with ``N`` the number of
    words, ``V`` the number of distinct words and ``V1`` the number of words
    occurring exactly once. Returns 0 for texts without words and when every
    distinct word occurs once (the denominator would be zero).
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    if len(tokens) == 0:
        return 0
    counts = FreqDist(tokens)
    vocab_size = len(counts.values())
    hapax_count = sum(1 for c in counts.values() if c == 1)
    # vocab_size is positive here because at least one token exists.
    hapax_share = hapax_count / vocab_size
    if hapax_share == 1:
        return 0
    return 100 * np.log(len(tokens)) / (1 - hapax_share)

def entropy(text):
    """Return the entropy (natural log) of the word distribution of ``text``.

    Words are taken from the lower-cased text; the result is
    ``-sum(p * log(p))`` over the relative frequency ``p`` of each distinct
    word. A text without words gives 0.
    """
    tokens = re.findall(r'\b\w+\b', text.lower())
    counts = FreqDist(tokens)
    total = len(tokens)
    weighted_logs = 0
    for count in counts.values():
        weighted_logs += (count / total) * np.log(count / total)
    return -weighted_logs

# Syntactic features
def punctuations_ratio(text, punctuation):
    """Return how often the regex ``punctuation`` matches in ``text`` per character.

    Args:
        text: String to analyse.
        punctuation: Regular-expression pattern (not a literal) describing the
            punctuation to count.

    Returns:
        Number of non-overlapping matches divided by ``len(text)``; ``0.0``
        for an empty string.
    """
    # Guard the division: an empty text would raise ZeroDivisionError.
    if not text:
        return 0.0
    punctuation_list = re.findall(punctuation, text)
    return len(punctuation_list) / len(text)

# Structural features
def lines(text):
    """Return the number of newline-separated lines in ``text`` (at least 1)."""
    newline_count = text.count('\n')
    return newline_count + 1

def sentences(text):
    """Return how many sentences NLTK's ``sent_tokenize`` finds in ``text``."""
    sentence_list = sent_tokenize(text)
    return len(sentence_list)

def paragraphs(text):
    """Count the non-blank paragraphs of ``text``, split on empty lines."""
    return sum(1 for chunk in text.split('\n\n') if chunk.strip())

def sentence_paragraph(text):
    """Return the mean number of sentences per non-blank paragraph.

    Args:
        text: String to analyse; paragraphs are separated by empty lines.

    Returns:
        Mean of the per-paragraph sentence counts from ``sent_tokenize``;
        ``0.0`` when the text has no non-blank paragraph.
    """
    pars = [par for par in text.split('\n\n') if par.strip()]
    # statistics.mean raises StatisticsError on an empty sequence, which is
    # what an empty or whitespace-only text would produce.
    if not pars:
        return 0.0
    return statistics.mean([len(sent_tokenize(par)) for par in pars])

def words_paragraph(text):
    """Return the mean number of words per non-blank paragraph.

    Args:
        text: String to analyse; paragraphs are separated by empty lines.

    Returns:
        Mean of the per-paragraph word counts; ``0.0`` when the text has no
        non-blank paragraph.
    """
    pars = [par for par in text.split('\n\n') if par.strip()]
    # statistics.mean raises StatisticsError on an empty sequence, which is
    # what an empty or whitespace-only text would produce.
    if not pars:
        return 0.0
    return statistics.mean([len(re.findall(r'\b\w+\b', par)) for par in pars])

def chars_paragraph(text):
    """Return the mean number of characters per non-blank paragraph.

    Args:
        text: String to analyse; paragraphs are separated by empty lines.

    Returns:
        Mean of the paragraph lengths; ``0.0`` when the text has no
        non-blank paragraph.
    """
    pars = [par for par in text.split('\n\n') if par.strip()]
    # statistics.mean raises StatisticsError on an empty sequence, which is
    # what an empty or whitespace-only text would produce.
    if not pars:
        return 0.0
    return statistics.mean([len(par) for par in pars])

def words_sentences(text):
    """Return the mean number of ``word_tokenize`` tokens per sentence.

    Args:
        text: String to analyse.

    Returns:
        Mean token count over the sentences found by ``sent_tokenize``;
        ``0.0`` when there is nothing to average.
    """
    # Blank text has no sentences to average over; skip the tokenizer.
    if not text.strip():
        return 0.0
    sents = sent_tokenize(text)
    # statistics.mean raises StatisticsError on an empty sequence, so stay
    # safe if the tokenizer still returns nothing.
    if not sents:
        return 0.0
    return statistics.mean([len(word_tokenize(sentence)) for sentence in sents])

def uppercase_start(text):
    """Return the share of sentences whose first character is uppercase.

    Args:
        text: String to analyse.

    Returns:
        Number of sentences (from ``sent_tokenize``) starting with an
        uppercase character divided by the number of sentences; ``0.0`` when
        there are no sentences.
    """
    # Blank text has no sentences; skip the tokenizer.
    if not text.strip():
        return 0.0
    sents = sent_tokenize(text)
    # Dividing by len(sents) would raise ZeroDivisionError if the tokenizer
    # returned nothing.
    if not sents:
        return 0.0
    # sentence[:1] avoids an IndexError should a sentence ever be empty.
    upper_starts = sum(1 for sentence in sents if sentence[:1].isupper())
    return upper_starts / len(sents)


In [112]:
def extract_features(dataframe, text_column, label_column='gender'):
    """Build a table of per-text feature columns from ``dataframe``.

    Each feature function defined in this notebook is applied to every value
    of ``dataframe[text_column]``; the results become the columns of the
    returned frame, followed by the label column.

    Args:
        dataframe: Input frame holding the texts and (optionally) the labels.
        text_column: Name of the column containing the texts.
        label_column: Name of the column copied unchanged into the result as
            the last column. Defaults to ``'gender'``; pass ``None`` to
            produce features only.

    Returns:
        A new ``pandas.DataFrame`` with one row per input row, sharing the
        input's index.
    """
    # Look the text column up once instead of once per feature.
    texts = dataframe[text_column]
    features = pd.DataFrame()

    # Character-based features
    features['total_characters'] = texts.apply(character_count)
    features['ratio_alphabetic'] = texts.apply(alphabetic_ratio)
    features['ratio_uppercase'] = texts.apply(uppercase_ratio)
    features['ratio_digit'] = texts.apply(digit_ratio)
    features['ratio_whitespace'] = texts.apply(whitespace_ratio)
    features['ratio_tabspace'] = texts.apply(tab_ratio)
    # One relative-frequency column per letter a-z.
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features[letter+'_frequency'] = texts.apply(letter_ratio, args=(letter,))
    special_characters = ['~', '@', '#', '$', '%', '^', '&', '*', '-', '_', '=', '+', '>', '<', '[', ']', '{', '}', '/', '\\', '|']
    for character in special_characters:
        features[character+'_frequency'] = texts.apply(specialcharacter_ratio, args=(character,))

    # Word-based features
    features['total_words'] = texts.apply(number_words)
    features['word_length'] = texts.apply(word_length)
    features['vocabulary_richness'] = texts.apply(vocabulary_richness)
    features['long_words'] = texts.apply(long_words)
    features['short_words'] = texts.apply(short_words)
    features['hapax_legomena'] = texts.apply(legomena)
    features['hapax_dislegomena'] = texts.apply(dislegomena)
    features['yules_k'] = texts.apply(yules_k)
    features['simpson_d'] = texts.apply(simpson_d)
    features['sichel_s'] = texts.apply(sichel_s)
    #features['honore_r'] = dataframe[text_column].apply(honores_r)
    features['entropy'] = texts.apply(entropy)
    # Brunet W?
    # word length frequency distribution

    # Syntactic features
    # The entries are regex patterns; the pattern text itself (escapes
    # included) is used as the prefix of the column name.
    punctuations = [r"’", r",", r"\.", r":", r";", r"\?", r"\?{2,}", r"!", r"!{2,}", r"\.{3}"]
    for punctuation in punctuations:
        features[punctuation+"_frequency"] = texts.apply(punctuations_ratio, args=(punctuation,))

    # Structural features
    features['number_lines'] = texts.apply(lines)
    features['number_sentences'] = texts.apply(sentences)
    features['number_paragraphs'] = texts.apply(paragraphs)
    features['sentences_per_paragraph'] = texts.apply(sentence_paragraph)
    features['word_per_paragraph'] = texts.apply(words_paragraph)
    features['character_per_paragraph'] = texts.apply(chars_paragraph)
    features['word_per_sentence'] = texts.apply(words_sentences)
    features['ratio_sentencestart_uppercase'] = texts.apply(uppercase_start)

    # Carry the target label along so the saved table is self-contained.
    if label_column is not None:
        features[label_column] = dataframe[label_column]

    return features

In [48]:
# Compute the feature table for the PAN15 training split ('text' column).
pan15_train = extract_features(data_pan15_training, 'text')

In [49]:
# Compute the feature table for the PAN15 test split ('text' column).
pan15_test = extract_features(data_pan15_test, 'text')

In [113]:
# Compute the feature table for the PAN17 training split ('text' column).
pan17_train = extract_features(data_pan17_train, 'text')

In [115]:
# Compute the feature table for the PAN17 test split ('text' column).
pan17_test = extract_features(data_pan17_test, 'text')

In [50]:
# Persist the PAN15 test features without the index column. Forward slashes
# keep the path portable; the backslash form only resolves on Windows.
pan15_test.to_csv('data/pan15_features_test.csv', index=False)

In [51]:
# Persist the PAN15 training features without the index column. Forward slashes
# keep the path portable; the backslash form only resolves on Windows.
pan15_train.to_csv('data/pan15_features_training.csv', index=False)

In [119]:
# Persist the PAN17 test features without the index column. Forward slashes
# keep the path portable; the backslash form only resolves on Windows.
pan17_test.to_csv('data/pan17_features_test.csv', index=False)

In [120]:
# Persist the PAN17 training features without the index column. Forward slashes
# keep the path portable; the backslash form only resolves on Windows.
pan17_train.to_csv('data/pan17_features_training.csv', index=False)

## Create TF-IDF

In [80]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

In [121]:
# TF-IDF over unigrams and bigrams, keeping at most 1000 features.
tfidf = TfidfVectorizer(max_features=1000, ngram_range=(1, 2))

In [94]:
# Fit the vectorizer on the PAN15 training texts only, then apply that same
# fit to the PAN15 test texts.
pan15_train_tfidf_array = tfidf.fit_transform(data_pan15_training['text'])
pan15_test_tfidf_array = tfidf.transform(data_pan15_test['text'])

In [123]:
# NOTE: fit_transform refits the same `tfidf` object, so from this cell on it
# holds the PAN17 fit instead of the PAN15 one.
pan17_train_tfidf_array = tfidf.fit_transform(data_pan17_train['text'])
pan17_test_tfidf_array = tfidf.transform(data_pan17_test['text'])

In [124]:
# LSA pipeline: reduce the TF-IDF matrix to 100 components with a seeded
# truncated SVD, then pass the result through Normalizer (copy=False).
svd = TruncatedSVD(n_components=100, random_state=42)
normalizer = Normalizer(copy=False)
lsa = make_pipeline(svd, normalizer)

In [96]:
# Fit the LSA pipeline on the PAN15 training TF-IDF matrix and project the
# PAN15 test matrix with that fit.
pan15_train_lsa = lsa.fit_transform(pan15_train_tfidf_array)
pan15_test_lsa = lsa.transform(pan15_test_tfidf_array)

In [125]:
# NOTE: this refits the same `lsa` pipeline, now on the PAN17 training matrix;
# the PAN17 test matrix is projected with the new fit.
pan17_train_lsa = lsa.fit_transform(pan17_train_tfidf_array)
pan17_test_lsa = lsa.transform(pan17_test_tfidf_array)

In [78]:
# By this point in the notebook `tfidf` has been refit on PAN17, so its
# vocabulary no longer describes the PAN15 matrices. Fit a separate vectorizer
# with the same settings on PAN15 so the column labels match the columns.
tfidf_pan15 = TfidfVectorizer(**tfidf.get_params())
pan15_train_tfidf_matrix = tfidf_pan15.fit_transform(data_pan15_training['text'])
pan15_test_tfidf_matrix = tfidf_pan15.transform(data_pan15_test['text'])

feature_names = tfidf_pan15.get_feature_names_out()
pan15_train_tfidf = pd.DataFrame(pan15_train_tfidf_matrix.toarray(), columns=feature_names)
pan15_test_tfidf = pd.DataFrame(pan15_test_tfidf_matrix.toarray(), columns=feature_names)

In [79]:
# Persist the PAN15 TF-IDF tables without the index column. Forward slashes
# keep the paths portable; the backslash form only resolves on Windows.
pan15_train_tfidf.to_csv('data/pan15_tfidf_train.csv', index=False)
pan15_test_tfidf.to_csv('data/pan15_tfidf_test.csv', index=False)

## Classifier

In [89]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [98]:
# Number of rows in the PAN15 training frame.
len(data_pan15_training)

14166

In [99]:
# Use the PAN15 LSA features as inputs and the gender column as target.
# NOTE: the next cell rebinds these four names to the PAN17 data, so when the
# notebook is run top to bottom the model below is trained on PAN17.
X_train = pan15_train_lsa
y_train = data_pan15_training['gender']
X_test = pan15_test_lsa
y_test = data_pan15_test['gender']

In [126]:
# Use the PAN17 LSA features as inputs and the gender column as target
# (rebinds X_train / y_train / X_test / y_test).
X_train = pan17_train_lsa
y_train = data_pan17_train['gender']
X_test = pan17_test_lsa
y_test = data_pan17_test['gender']

In [127]:
# Linear SVM with a fixed seed; max_iter is set to 5000 iterations.
model = LinearSVC(random_state=42, max_iter=5000)

In [128]:
# Train the classifier on whichever split X_train / y_train currently refer to.
model.fit(X_train, y_train)

In [129]:
# Predict on the held-out split and show the accuracy as the cell output.
pred = model.predict(X_test)
accuracy_score(y_test, pred)

0.5699625

## Classifier 2

In [22]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score

In [48]:
# Two TF-IDF views of the same text: character 3- to 5-grams and word
# uni-/bigrams, both with sublinear term frequency and min_df=2.
char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), sublinear_tf=True, min_df=2)
#char_vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(3, 5), min_df=2)
word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), sublinear_tf=True, min_df=2)
#word_vectorizer = TfidfVectorizer(analyzer='word', ngram_range=(1, 2), min_df=2)

# FeatureUnion runs both vectorizers on the input and concatenates their outputs.
combined_features = FeatureUnion([
    ('char', char_vectorizer),
    ('word', word_vectorizer)
])

pipeline = Pipeline([
    ('features', combined_features),
    # Seeded like the LinearSVC of the first classifier so that refitting the
    # pipeline gives reproducible results.
    ('svm', LinearSVC(random_state=42))
])

In [49]:
# Build the per-author frames here so this cell does not depend on a cell that
# only appears further down the notebook (a fresh top-to-bottom run would
# otherwise fail with NameError). All texts of an author are joined with a
# space into one document and the author's first gender label is kept.
author_agg = {'text': ' '.join, 'gender': 'first'}
combined_pan17_train = data_pan17_train.groupby('author').agg(author_agg).reset_index()
combined_pan17_test = data_pan17_test.groupby('author').agg(author_agg).reset_index()

X_train = combined_pan17_train['text']
y_train = combined_pan17_train['gender']
X_test = combined_pan17_test['text']
y_test = combined_pan17_test['gender']

In [51]:
# Fit both vectorizers and the SVM on the raw training texts.
pipeline.fit(X_train, y_train)

In [52]:
# Predict on the held-out texts and show the accuracy as the cell output.
y_pred = pipeline.predict(X_test)
accuracy_score(y_test, y_pred)

0.8195833333333333

In [47]:
# Collapse the PAN17 frames to one row per author: all of the author's texts
# are joined with a single space and the first gender value is kept.
# NOTE: the "Classifier 2" cells above read these frames, so this cell has to
# run before them (its execution count, 47, precedes theirs).
combined_pan17_train = data_pan17_train.groupby('author').agg({
    'text': ' '.join,
    'gender': 'first'
}).reset_index()
combined_pan17_test = data_pan17_test.groupby('author').agg({
    'text': ' '.join,
    'gender': 'first'
}).reset_index()

In [50]:
# Show the per-author PAN17 test frame as the cell output.
combined_pan17_test

Unnamed: 0,author,text,gender
0,100c885443c4d2a32075e10cbca9a27e,Less than 2 weeks until Valentine's Day https:...,female
1,1017647e21e7e73d900e1dfacbf95a0f,"@drakefenton FWIW, you were wrong four hours ...",female
2,1023e1e534622e28d29dfdc0ee45cdac,I'm marching in CBR. Your big city ain't going...,male
3,10283d7f37d33b063e3734b740f5229d,More pics for BTS' comeback Cr. Naver https://...,female
4,1069f66c9d5862f860277d32780ac459,@griffski Eh? Great message to be sending out....,female
...,...,...,...
2395,ffa1dd4f39b1e32dcd1274327af39eac,"@aliceisms fake news By this point, surely my ...",male
2396,ffd6c6cfe9c484d3538fb9d9a628af74,@timkaine now that he is Sec of state how can ...,female
2397,ffd9b325eddf0c9374ad5eabeca6860a,.@acquia_support Is dev desktop supported on M...,female
2398,ffe4df0924e74cdc1d0ec6b980529ae7,anyway i'm super gay and @downtongaby is super...,female
