# Exploring data files

In [1]:
import pandas as pd
import xml.etree.ElementTree as et
import html
import os
import numpy as np
import textstat

## Loading training data

In [2]:
# List the PAN 15 English training directory and keep only the XML files
# (one file per author; the same directory also holds truth.txt, which is
# read separately). os.listdir returns entries in arbitrary order, so the
# listing is sorted to make the row order of the assembled frame reproducible
# across platforms and file systems.
items = sorted(os.listdir(r"C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 15\pan15-author-profiling-training-dataset-2015-04-23\pan15-author-profiling-training-dataset-english-2015-04-23"))
documents = [item for item in items if item.endswith('.xml')]

In [3]:
# Empty frame that fixes the column order ('author', 'text'); the loading loop
# in the next cell appends the parsed documents to it.
data_pan15 = pd.DataFrame(columns=['author', 'text'])

In [4]:
# Parse every author file into a small frame with one row per <document>
# element, tagged with the author id (the file name without its extension).
train_dir = r"C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 15\pan15-author-profiling-training-dataset-2015-04-23\pan15-author-profiling-training-dataset-english-2015-04-23"
author_frames = []
for file in documents:
    xtree = et.parse(os.path.join(train_dir, file))
    xroot = xtree.getroot()
    # An empty <document/> element has text None, which html.unescape cannot
    # handle; treat it as an empty string instead of crashing.
    documents_texts = [html.unescape(doc.text or '') for doc in xroot.findall(".//document")]
    temp_frame = pd.DataFrame(documents_texts, columns=['text'])
    # splitext removes only the trailing extension; str.replace would also
    # drop any '.xml' occurring elsewhere in the file name.
    temp_frame['author'] = os.path.splitext(file)[0]
    author_frames.append(temp_frame)

# Concatenate once: growing the frame inside the loop re-copies every
# previously loaded row on each iteration (quadratic in the number of files).
# data_pan15 stays first in the list so its column order is preserved.
data_pan15 = pd.concat([data_pan15, *author_frames], axis=0).reset_index(drop=True)

In [5]:
# Read the training ground-truth file: no header row, fields separated by the
# literal ':::' (a multi-character separator, hence the python parsing engine).
truth_pan15 = pd.read_csv(
    r'C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 15\pan15-author-profiling-training-dataset-2015-04-23\pan15-author-profiling-training-dataset-english-2015-04-23\truth.txt',
    sep=':::',
    engine='python',
    header=None,
)
# Name the eight positional columns: author id, two demographic labels and
# five personality-trait scores.
truth_pan15.columns = [
    'author',
    'gender',
    'age',
    'extroverted',
    'stable',
    'agreeable',
    'conscientious',
    'open',
]

In [6]:
# Attach each author's gender and age label to every one of their documents.
# The left join keeps all document rows. validate='many_to_one' makes pandas
# raise a MergeError if an author id occurs more than once in the truth table,
# which would otherwise silently duplicate document rows.
data_pan15 = pd.merge(
    data_pan15,
    truth_pan15[['author', 'gender', 'age']],
    on='author',
    how='left',
    validate='many_to_one',
)

In [7]:
# Preview the merged training frame: one row per document, carrying the
# author's gender and age labels.
data_pan15

Unnamed: 0,author,text,gender,age
0,02ae95de-7ee3-453a-978d-25d28b3f1a88,Things I want for my business cards but are to...,M,25-34
1,02ae95de-7ee3-453a-978d-25d28b3f1a88,"""painters produced their most highly valued wo...",M,25-34
2,02ae95de-7ee3-453a-978d-25d28b3f1a88,@username your new discussion layout is confus...,M,25-34
3,02ae95de-7ee3-453a-978d-25d28b3f1a88,I never really understood why game environment...,M,25-34
4,02ae95de-7ee3-453a-978d-25d28b3f1a88,"@username 20k and 2048² on a gun, fine. But th...",M,25-34
...,...,...,...,...
14161,fde8eb00-0444-4159-9b65-1ead60c2dc88,Fifty Writing Tools: Quick List | Poynter. htt...,F,25-34
14162,fde8eb00-0444-4159-9b65-1ead60c2dc88,Video: How To Make Vietnamese Coffee (by HighB...,F,25-34
14163,fde8eb00-0444-4159-9b65-1ead60c2dc88,lyx is soooo awesome!!! finally figured out ho...,F,25-34
14164,fde8eb00-0444-4159-9b65-1ead60c2dc88,Impact Algorithms: Strategies Remarkable Peopl...,F,25-34


## Statistics

In [8]:
# Per-document descriptive statistics. Each text is split on whitespace once
# and the token lists are reused for the three word-based measures; the
# readability score is computed from the raw text by textstat.
tweet_tokens = data_pan15['text'].map(lambda text: text.split())
data_pan15_stats = pd.DataFrame({
    'total_word_count': tweet_tokens.map(len),
    'unique_word_count': tweet_tokens.map(lambda tokens: len(set(tokens))),
    'average_word_length': tweet_tokens.map(
        lambda tokens: np.mean([len(token) for token in tokens])
    ),
    'flesh_readability': data_pan15['text'].map(textstat.flesch_reading_ease),
})

In [9]:
# Render the summary (one row per statistic column, rounded to 3 decimals) as
# a LaTeX table string with the given caption and label.
(
    data_pan15_stats
    .describe()
    .transpose()
    .round(3)
    .to_latex(caption="PAN 15 AP task descriptive statistics", label="tab:your_label")
)

  data_pan15_stats.describe().T.round(3).to_latex( caption="PAN 15 AP task descriptive statistics", label="tab:your_label")


'\\begin{table}\n\\centering\n\\caption{PAN 15 AP task descriptive statistics}\n\\label{tab:your_label}\n\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} &    count &    mean &     std &      min &     25\\% &    50\\% &     75\\% &     max \\\\\n\\midrule\ntotal\\_word\\_count    &  14166.0 &  11.854 &   5.921 &     1.00 &   7.000 &  11.00 &  16.000 &   33.00 \\\\\nunique\\_word\\_count   &  14166.0 &  11.305 &   5.481 &     1.00 &   7.000 &  11.00 &  15.000 &   30.00 \\\\\naverage\\_word\\_length &  14166.0 &   5.925 &   2.133 &     1.00 &   4.529 &   5.60 &   6.889 &   90.00 \\\\\nflesh\\_readability   &  14166.0 &  66.038 &  36.361 & -1147.79 &  49.480 &  72.16 &  89.040 &  121.22 \\\\\n\\bottomrule\n\\end{tabular}\n\\end{table}\n'

In [10]:
# Same summary as a regular table: one row per statistic column, rounded to
# 3 decimals.
data_pan15_stats.describe().transpose().round(3)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_word_count,14166.0,11.854,5.921,1.0,7.0,11.0,16.0,33.0
unique_word_count,14166.0,11.305,5.481,1.0,7.0,11.0,15.0,30.0
average_word_length,14166.0,5.925,2.133,1.0,4.529,5.6,6.889,90.0
flesh_readability,14166.0,66.038,36.361,-1147.79,49.48,72.16,89.04,121.22


In [11]:
# Gender distribution counted over rows of data_pan15, i.e. per document.
data_pan15['gender'].value_counts()

F    7180
M    6986
Name: gender, dtype: int64

In [12]:
# Gender distribution counted over rows of the truth table (presumably one row
# per author — confirm), i.e. not weighted by how many documents each has.
truth_pan15['gender'].value_counts()

M    76
F    76
Name: gender, dtype: int64

In [13]:
# Age-group distribution counted over rows of data_pan15, i.e. per document.
data_pan15['age'].value_counts()

25-34    5624
18-24    5588
35-49    1865
50-XX    1089
Name: age, dtype: int64

In [14]:
# Age-group distribution counted over rows of the truth table (presumably one
# row per author — confirm), i.e. not weighted by documents per author.
truth_pan15['age'].value_counts()

25-34    60
18-24    58
35-49    22
50-XX    12
Name: age, dtype: int64

## Loading test dataset

In [15]:
# List the PAN 15 English test directory and keep only the XML files (the same
# directory also holds truth.txt, which is read separately). os.listdir
# returns entries in arbitrary order, so the listing is sorted to make the row
# order of the assembled frame reproducible across platforms and file systems.
items_test = sorted(os.listdir(r'C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 15\test_data\pan15-author-profiling-test-dataset2-english-2015-04-23'))
documents_test = [item for item in items_test if item.endswith('.xml')]

In [16]:
# Build the test frame: one row per <document> element of every author file,
# tagged with the author id (the file name without its extension).
test_dir = r'C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 15\test_data\pan15-author-profiling-test-dataset2-english-2015-04-23'

# The empty frame fixes the column order ('author', 'text') of the result.
data_pan15_test = pd.DataFrame(columns=['author', 'text'])

author_frames_test = []
for file in documents_test:
    xtree = et.parse(os.path.join(test_dir, file))
    xroot = xtree.getroot()
    # An empty <document/> element has text None, which html.unescape cannot
    # handle; treat it as an empty string instead of crashing.
    documents_texts = [html.unescape(doc.text or '') for doc in xroot.findall(".//document")]
    temp_frame = pd.DataFrame(documents_texts, columns=['text'])
    # splitext removes only the trailing extension; str.replace would also
    # drop any '.xml' occurring elsewhere in the file name.
    temp_frame['author'] = os.path.splitext(file)[0]
    author_frames_test.append(temp_frame)

# Concatenate once: growing the frame inside the loop re-copies every
# previously loaded row on each iteration (quadratic in the number of files).
data_pan15_test = pd.concat([data_pan15_test, *author_frames_test], axis=0).reset_index(drop=True)

In [17]:
# Read the test ground-truth file: no header row, fields separated by the
# literal ':::' (a multi-character separator, hence the python parsing engine).
truth_pan15_test = pd.read_csv(
    r'C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 15\test_data\pan15-author-profiling-test-dataset2-english-2015-04-23\truth.txt',
    sep=':::',
    engine='python',
    header=None,
)
# Name the eight positional columns: author id, two demographic labels and
# five personality-trait scores.
truth_pan15_test.columns = [
    'author',
    'gender',
    'age',
    'extroverted',
    'stable',
    'agreeable',
    'conscientious',
    'open',
]

In [18]:
# Attach each author's gender and age label to every one of their test
# documents. The left join keeps all document rows. validate='many_to_one'
# makes pandas raise a MergeError if an author id occurs more than once in the
# truth table, which would otherwise silently duplicate document rows.
data_pan15_test = pd.merge(
    data_pan15_test,
    truth_pan15_test[['author', 'gender', 'age']],
    on='author',
    how='left',
    validate='many_to_one',
)

## Creating data files

In [19]:
# Final look at the labelled training frame before it is written to disk.
data_pan15

Unnamed: 0,author,text,gender,age
0,02ae95de-7ee3-453a-978d-25d28b3f1a88,Things I want for my business cards but are to...,M,25-34
1,02ae95de-7ee3-453a-978d-25d28b3f1a88,"""painters produced their most highly valued wo...",M,25-34
2,02ae95de-7ee3-453a-978d-25d28b3f1a88,@username your new discussion layout is confus...,M,25-34
3,02ae95de-7ee3-453a-978d-25d28b3f1a88,I never really understood why game environment...,M,25-34
4,02ae95de-7ee3-453a-978d-25d28b3f1a88,"@username 20k and 2048² on a gun, fine. But th...",M,25-34
...,...,...,...,...
14161,fde8eb00-0444-4159-9b65-1ead60c2dc88,Fifty Writing Tools: Quick List | Poynter. htt...,F,25-34
14162,fde8eb00-0444-4159-9b65-1ead60c2dc88,Video: How To Make Vietnamese Coffee (by HighB...,F,25-34
14163,fde8eb00-0444-4159-9b65-1ead60c2dc88,lyx is soooo awesome!!! finally figured out ho...,F,25-34
14164,fde8eb00-0444-4159-9b65-1ead60c2dc88,Impact Algorithms: Strategies Remarkable Peopl...,F,25-34


In [20]:
# Write the labelled training documents to data/PAN_15_training.csv (the row
# index is written as the first, unnamed column). The path is built with
# os.path.join because the literal 'data\PAN_15_training.csv' contains the
# invalid escape sequence '\P' (a SyntaxWarning on recent Python versions) and
# hard-codes the Windows separator.
data_pan15.to_csv(os.path.join('data', 'PAN_15_training.csv'))

In [21]:
# Write the labelled test documents to data/PAN_15_test.csv (the row index is
# written as the first, unnamed column). The path is built with os.path.join
# because the literal 'data\PAN_15_test.csv' contains the invalid escape
# sequence '\P' (a SyntaxWarning on recent Python versions) and hard-codes the
# Windows separator.
data_pan15_test.to_csv(os.path.join('data', 'PAN_15_test.csv'))