# Exploring data files

In [2]:
import pandas as pd
import xml.etree.ElementTree as et
import html
import os
import numpy as np
import textstat

## Loading data

In [3]:
# Directory that holds the PAN 17 training XML files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
PAN17_TRAINING_DIR = r'C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 17\training'

# os.listdir returns entries in arbitrary order, so the XML file names are sorted to keep
# the row order of everything built from `documents` reproducible across runs and machines.
items = os.listdir(PAN17_TRAINING_DIR)
documents = sorted(item for item in items if item.endswith('.xml'))

In [5]:
# Number of '.xml' entries found in the training directory listing.
len(documents)

3600

In [6]:
# Initialise data_pan17 as an empty frame with the columns 'author' and 'text'.
data_pan17 = pd.DataFrame(columns=['author', 'text'])

In [7]:
# Parse every training XML file and collect one (author, text) record per <document> element.
# The frame is built once from the collected records: concatenating inside the loop re-copies
# all previously loaded rows on every iteration, and re-running the cell would append the
# same rows a second time.
PAN17_TRAINING_DIR = r"C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 17\training"

records_pan17 = []
for file in documents:
    xroot = et.parse(os.path.join(PAN17_TRAINING_DIR, file)).getroot()
    author_id = os.path.splitext(file)[0]  # file name without the '.xml' extension
    for doc in xroot.findall(".//document"):
        # An empty <document/> element has text None, which html.unescape cannot handle;
        # such documents are kept as empty strings so the per-author row count is unchanged.
        records_pan17.append((author_id, html.unescape(doc.text or "")))

# An empty record list still yields a frame with both columns.
data_pan17 = pd.DataFrame(records_pan17, columns=['author', 'text'])

In [8]:
# Display the loaded training frame.
data_pan17

Unnamed: 0,author,text
0,1003de26f870d27f79887272a1eb3612,One to watch … \nAvailable on 10th Feb. https:...
1,1003de26f870d27f79887272a1eb3612,@kirkj @deleifd There are 3 complete sets in e...
2,1003de26f870d27f79887272a1eb3612,"@deleifd If you have 1, 3, (maybe 5), 10 or 14..."
3,1003de26f870d27f79887272a1eb3612,@deleifd I've heard that numerous other toywor...
4,1003de26f870d27f79887272a1eb3612,Just heard about a new Web Dev meetup in The H...
...,...,...
359995,fff1359e0dc1eba31a10120bf16834b7,@thehitsnz @MadMaclegend @richmaori. Hey! Is @...
359996,fff1359e0dc1eba31a10120bf16834b7,@MadMaclegend @richmaori - God @nzherald - let...
359997,fff1359e0dc1eba31a10120bf16834b7,@MadMaclegend @PonekeFC Next week it's #Hottie...
359998,fff1359e0dc1eba31a10120bf16834b7,@MadMaclegend @nzherald Story is relevant cos ...


In [9]:
# Read the training ground-truth file: ':::'-separated fields, no header row.
# The multi-character separator is why the Python parsing engine is requested.
truth_pan17 = pd.read_csv(
    r"C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 17\training\truth.txt",
    sep=':::',
    header=None,
    engine='python',
)
# Name the three parsed fields.
truth_pan17.columns = ['author', 'gender', 'country']

In [10]:
# Attach the gender and country labels to every training row via the shared 'author' key.
# Only 'author' and 'text' are taken from the left side so that re-running this cell gives the
# same result instead of producing suffixed duplicates (gender_x / gender_y, ...).
# how='left' keeps rows whose author has no entry in the truth file (labels become NaN).
data_pan17 = pd.merge(
    data_pan17[['author', 'text']],
    truth_pan17[['author', 'gender', 'country']],
    on='author',
    how='left',
)

In [11]:
# Display the training frame after the merge with the truth labels.
data_pan17

Unnamed: 0,author,text,gender,country
0,1003de26f870d27f79887272a1eb3612,One to watch … \nAvailable on 10th Feb. https:...,male,new zealand
1,1003de26f870d27f79887272a1eb3612,@kirkj @deleifd There are 3 complete sets in e...,male,new zealand
2,1003de26f870d27f79887272a1eb3612,"@deleifd If you have 1, 3, (maybe 5), 10 or 14...",male,new zealand
3,1003de26f870d27f79887272a1eb3612,@deleifd I've heard that numerous other toywor...,male,new zealand
4,1003de26f870d27f79887272a1eb3612,Just heard about a new Web Dev meetup in The H...,male,new zealand
...,...,...,...,...
359995,fff1359e0dc1eba31a10120bf16834b7,@thehitsnz @MadMaclegend @richmaori. Hey! Is @...,female,new zealand
359996,fff1359e0dc1eba31a10120bf16834b7,@MadMaclegend @richmaori - God @nzherald - let...,female,new zealand
359997,fff1359e0dc1eba31a10120bf16834b7,@MadMaclegend @PonekeFC Next week it's #Hottie...,female,new zealand
359998,fff1359e0dc1eba31a10120bf16834b7,@MadMaclegend @nzherald Story is relevant cos ...,female,new zealand


## Statistics (PAN 15)

In [8]:
# Descriptive text statistics, one row per entry of data_pan15['text'].
# Words are the whitespace-separated tokens returned by str.split().
# NOTE(review): data_pan15 is not defined in the visible cells — confirm where it is loaded.
data_pan15_stats = pd.DataFrame({
    # Number of tokens in the text.
    'total_word_count': data_pan15['text'].apply(lambda text: len(text.split())),
    # Number of distinct tokens in the text.
    'unique_word_count': data_pan15['text'].apply(lambda text: len(set(text.split()))),
    # Mean token length in characters.
    'average_word_length': data_pan15['text'].apply(
        lambda text: np.mean([len(token) for token in text.split()])
    ),
    # Score returned by textstat.flesch_reading_ease for the text.
    'flesh_readability': data_pan15['text'].apply(textstat.flesch_reading_ease),
})

In [9]:
# Render the transposed describe() summary, rounded to 3 decimals, as a LaTeX table string.
# NOTE(review): the label "tab:your_label" looks like a template placeholder — confirm.
(
    data_pan15_stats
    .describe()
    .T
    .round(3)
    .to_latex(
        caption="PAN 15 AP task descriptive statistics",
        label="tab:your_label",
    )
)

  data_pan15_stats.describe().T.round(3).to_latex( caption="PAN 15 AP task descriptive statistics", label="tab:your_label")


'\\begin{table}\n\\centering\n\\caption{PAN 15 AP task descriptive statistics}\n\\label{tab:your_label}\n\\begin{tabular}{lrrrrrrrr}\n\\toprule\n{} &    count &    mean &     std &      min &     25\\% &    50\\% &     75\\% &     max \\\\\n\\midrule\ntotal\\_word\\_count    &  14166.0 &  11.854 &   5.921 &     1.00 &   7.000 &  11.00 &  16.000 &   33.00 \\\\\nunique\\_word\\_count   &  14166.0 &  11.305 &   5.481 &     1.00 &   7.000 &  11.00 &  15.000 &   30.00 \\\\\naverage\\_word\\_length &  14166.0 &   5.925 &   2.133 &     1.00 &   4.529 &   5.60 &   6.889 &   90.00 \\\\\nflesh\\_readability   &  14166.0 &  66.038 &  36.361 & -1147.79 &  49.480 &  72.16 &  89.040 &  121.22 \\\\\n\\bottomrule\n\\end{tabular}\n\\end{table}\n'

In [10]:
# Transposed describe() summary of the statistics frame, rounded to 3 decimals.
data_pan15_stats.describe().T.round(3)

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
total_word_count,14166.0,11.854,5.921,1.0,7.0,11.0,16.0,33.0
unique_word_count,14166.0,11.305,5.481,1.0,7.0,11.0,15.0,30.0
average_word_length,14166.0,5.925,2.133,1.0,4.529,5.6,6.889,90.0
flesh_readability,14166.0,66.038,36.361,-1147.79,49.48,72.16,89.04,121.22


In [11]:
# Number of rows per gender value in data_pan15.
data_pan15['gender'].value_counts()

F    7180
M    6986
Name: gender, dtype: int64

In [12]:
# Number of rows per gender value in truth_pan15.
# NOTE(review): truth_pan15 is not defined in the visible cells — confirm where it is loaded.
truth_pan15['gender'].value_counts()

M    76
F    76
Name: gender, dtype: int64

In [13]:
# Number of rows per age value in data_pan15.
data_pan15['age'].value_counts()

25-34    5624
18-24    5588
35-49    1865
50-XX    1089
Name: age, dtype: int64

In [14]:
# Number of rows per age value in truth_pan15.
truth_pan15['age'].value_counts()

25-34    60
18-24    58
35-49    22
50-XX    12
Name: age, dtype: int64

## Loading test dataset

In [14]:
# Directory that holds the PAN 17 test XML files.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
PAN17_TEST_DIR = r'C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 17\test'

# os.listdir returns entries in arbitrary order, so the XML file names are sorted to keep
# the row order of everything built from `documents_test` reproducible.
items_test = os.listdir(PAN17_TEST_DIR)
documents_test = sorted(item for item in items_test if item.endswith('.xml'))

In [15]:
# Parse every test XML file and collect one (author, text) record per <document> element.
# The frame is built once from the collected records: concatenating inside the loop re-copies
# all previously loaded rows on every iteration.
PAN17_TEST_DIR = r'C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 17\test'

records_pan17_test = []
for file in documents_test:
    xroot = et.parse(os.path.join(PAN17_TEST_DIR, file)).getroot()
    author_id = os.path.splitext(file)[0]  # file name without the '.xml' extension
    for doc in xroot.findall(".//document"):
        # An empty <document/> element has text None, which html.unescape cannot handle;
        # such documents are kept as empty strings so the per-author row count is unchanged.
        records_pan17_test.append((author_id, html.unescape(doc.text or "")))

# An empty record list still yields a frame with both columns.
data_pan17_test = pd.DataFrame(records_pan17_test, columns=['author', 'text'])

In [16]:
# Read the test ground-truth file: ':::'-separated fields, no header row.
# The multi-character separator is why the Python parsing engine is requested.
truth_pan17_test = pd.read_csv(
    r'C:\Users\Sten\Documents\EUR BIM\thesis\data\PAN 17\test\truth.txt',
    sep=':::',
    header=None,
    engine='python',
)
# Name the three parsed fields.
truth_pan17_test.columns = ['author', 'gender', 'country']

In [17]:
# Attach the gender and country labels to every test row via the shared 'author' key.
# Only 'author' and 'text' are taken from the left side so that re-running this cell gives the
# same result instead of producing suffixed duplicates (gender_x / gender_y, ...).
# how='left' keeps rows whose author has no entry in the truth file (labels become NaN).
data_pan17_test = pd.merge(
    data_pan17_test[['author', 'text']],
    truth_pan17_test[['author', 'gender', 'country']],
    on='author',
    how='left',
)

## Creating data files

In [19]:
# Display the test frame after the merge with the truth labels.
data_pan17_test

Unnamed: 0,author,text,gender,country
0,100c885443c4d2a32075e10cbca9a27e,Less than 2 weeks until Valentine's Day https:...,female,australia
1,100c885443c4d2a32075e10cbca9a27e,"Omg now I remember, that photo was from when I...",female,australia
2,100c885443c4d2a32075e10cbca9a27e,when you eat an entire bag of popcorn and fami...,female,australia
3,100c885443c4d2a32075e10cbca9a27e,@tartecosmetics my fav shade has gone 😭😭😭😭 htt...,female,australia
4,100c885443c4d2a32075e10cbca9a27e,OMFG this is happening to me rn!!!!!!!! https:...,female,australia
...,...,...,...,...
239995,fff5a17288a8ab173e493c90bf4b39a4,"Saw a seal in the wild, can you tell I'm happy...",male,australia
239996,fff5a17288a8ab173e493c90bf4b39a4,Note: everyone is up getting ready and he has ...,male,australia
239997,fff5a17288a8ab173e493c90bf4b39a4,@dorkfaceblog Just happy to be here 😂✌🏻#thegir...,male,australia
239998,fff5a17288a8ab173e493c90bf4b39a4,@Step2Adulthood @dorkfaceblog It was but thank...,male,australia


In [23]:
# Write the training frame to CSV. The target folder is created first because to_csv does
# not create missing directories. The frame's index is written as the first, unnamed column
# (to_csv default).
os.makedirs('data/raw_data', exist_ok=True)
data_pan17.to_csv('data/raw_data/PAN_17_training.csv')

In [24]:
# Write the test frame to CSV. The target folder is created first because to_csv does
# not create missing directories. The frame's index is written as the first, unnamed column
# (to_csv default).
os.makedirs('data/raw_data', exist_ok=True)
data_pan17_test.to_csv('data/raw_data/PAN_17_test.csv')