In [1]:
import pandas as pd
import numpy as np
# NOTE(review): PCA is not used in the visible part of the notebook — confirm it is needed further down.
from sklearn.decomposition import PCA
import nltk
# Download NLTK's 'punkt' tokenizer data (presumably required by the stylometry library — confirm).
nltk.download('punkt')
# Colab only: mount Google Drive at /content/drive so the dataset paths used below resolve.
from google.colab import drive
drive.mount('/content/drive')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Mounted at /content/drive


In [4]:
# The folder that contains the stylometry code must be importable so that the
# stylometric feature extractors can be loaded.
# NOTE(review): the star import hides which names are pulled in; StyloCorpus (used below)
# presumably comes from this module — confirm.
from drive.MyDrive.stylometry.stylometry.extract import *

In [5]:
# Load the original dataset (JSON Lines) and encode the generator labels 'A'..'F'
# as the integers 0..5 (their position in `etiquetas`).
data = pd.read_json('/content/drive/MyDrive/train_dataset/subtask_2/subtask_2.jsonl', lines=True)
etiquetas = ['A', 'B', 'C', 'D', 'E', 'F']
# One dictionary lookup per row instead of one full np.where sweep per class.
# Values that are not listed in `etiquetas` are left untouched, as before.
label_to_index = {clase: i for i, clase in enumerate(etiquetas)}
data['label'] = data['label'].map(lambda etiqueta: label_to_index.get(etiqueta, etiqueta))
data

Unnamed: 0,id,text,label
0,15442,Es un tribunal de suprema instanza que forma p...,1
1,12777,Doug mcadam es un científico estadista estadou...,1
2,63467,"El president de la generalitat, carles puigdem...",3
3,41050,"Traídas de francia, tan estrafalarias que una ...",2
4,91814,A saída trives – baiona será ás 8 30h da mañá ...,2
...,...,...,...
58749,122155,"Marianne, once recovered from her injury, was ...",4
58750,51914,El corazón le latía con fuerza en el pecho. no...,5
58751,76637,"Bere burua egokitzen saiatu zuen, aurreko mome...",5
58752,144741,"The irish president has signed the ""long-await...",1


In [9]:
from pathlib import Path
#función para pasar de los datos de entrenamiento y prueba en formato .csv a .txt (sin subdirectorios)

def csv_to_txt(df_path, dir_path):
    """Write every record of a CSV dataset to its own ``<id>.txt`` file.

    This produces the flat directory of text files (one document per file,
    no sub-directories) that is later loaded with the stylometry library.

    Parameters
    ----------
    df_path : str or Path
        Path of the dataset in CSV format. It must contain the columns
        'id' (used as the file name) and 'text' (written as the file content).
    dir_path : str or Path
        Directory where the generated files are stored. It is created,
        together with any missing parent directories, if it does not exist.

    Raises
    ------
    KeyError
        If the CSV has no 'id' or 'text' column.
    TypeError
        If a 'text' value is not a string (e.g. a missing value).
    """
    # The encoding is explicit on both the read and the write side so the result
    # does not depend on the locale of the machine running the notebook.
    data = pd.read_csv(df_path, encoding='utf-8')

    out_dir = Path(dir_path)
    out_dir.mkdir(exist_ok=True, parents=True)

    # Only two columns are needed, so iterate over them directly instead of
    # materialising a Series per row with iterrows().
    for doc_id, text in zip(data['id'], data['text']):
        # Files go straight into the target directory, without sub-directories.
        (out_dir / f'{doc_id}.txt').write_text(text, encoding='utf-8')


In [14]:
# Start by writing the training split to one .txt file per record under ./dir
csv_to_txt('/content/drive/MyDrive/train_dataset/subtask_2/Train/train_S2.csv','./dir')

In [None]:
# Each corpus record was written to its own .txt file so that StyloCorpus can load them
# through a glob pattern.
drive_path='./'
train_corpus = StyloCorpus.from_glob_pattern('./dir/*.txt')
# Write the corpus output to train_stylometry_S2.csv under drive_path.
train_corpus.output_csv(drive_path + 'train_stylometry_S2.csv')

In [16]:
# Load the raw feature table written by StyloCorpus for the training split.
df1=pd.read_csv('./train_stylometry_S2.csv')
df1

Unnamed: 0,Author,Title,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,Commas,Semicolons,...,Ands,Buts,Howevers,Ifs,Thats,Mores,Musts,Mights,This,Verys
0,dir,./dir/7352.txt,54.4304,5.03876,28.0000,1.69031,196,1077,105.4850,0.0,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
1,dir,./dir/60802.txt,62.6087,4.95833,17.4000,9.35094,87,520,60.8696,0.0,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
2,dir,./dir/104087.txt,43.4343,5.32558,16.3000,5.15849,163,957,65.6566,0.0,...,30.3030,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
3,dir,./dir/26789.txt,65.5172,4.57895,12.0000,6.16441,96,537,60.3448,0.0,...,0.0000,8.62069,0.0,0.0,8.62069,0.00000,0.0,0.0,0.00000,0.0
4,dir,./dir/119809.txt,49.2212,6.62025,26.8182,10.07980,295,1848,37.3832,0.0,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
41122,dir,./dir/96350.txt,50.6775,6.56150,22.0667,7.46964,331,2163,37.9404,0.0,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
41123,dir,./dir/148720.txt,47.0000,6.01418,18.0000,7.13676,270,1673,46.6667,0.0,...,40.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
41124,dir,./dir/63560.txt,45.8753,6.25877,24.8333,8.36162,447,2819,42.2535,0.0,...,40.2414,2.01207,0.0,0.0,6.03622,2.01207,0.0,0.0,4.02414,0.0
41125,dir,./dir/54915.txt,52.7697,5.85635,19.8667,7.53540,298,1831,67.0554,0.0,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0


In [17]:
# Recover the record id from each file path: 'Title' holds values such as './dir/7352.txt'.
# regex=False makes the match literal; both patterns contain '.', which would act as a
# wildcard if they were interpreted as regular expressions (the default depends on the
# pandas version).
df1['Title'] = df1['Title'].str.replace('./dir/', '', regex=False)
df1['Title'] = df1['Title'].str.replace('.txt', '', regex=False)
# NOTE(review): 'Author' only holds the directory name ('dir'), not a class label; the
# rename is kept for compatibility with the rest of the notebook.
df1 = df1.rename(columns={'Title': 'id', 'Author': 'label'})
df1['id'] = df1['id'].astype(int)

In [18]:
# Inner join on 'id': keeps the feature rows whose id appears in `data`, in the order of `data`.
# data[['id']] is passed directly so the builtin `id` is not shadowed by a Series.
df1 = pd.merge(data[['id']], df1, on='id')

In [20]:
from sklearn.preprocessing import MinMaxScaler
# Normalise the training features with MinMaxScaler.
# The block of features is selected by label (same 13 columns as the former positional
# slice 2:15, from 'LexicalDiversity' to 'Mdashes') so the selection is self-documenting.
# .copy() makes df1 an independent frame before values are assigned into it.
df1 = df1.loc[:, 'LexicalDiversity':'Mdashes'].copy()
columns = list(df1.columns)
# `scaler` and `columns` are reused below to transform the test split.
scaler = MinMaxScaler()
df1[columns] = scaler.fit_transform(df1[columns])
df1

Unnamed: 0,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,Commas,Semicolons,Quotes,Exclamations,Colons,Dashes,Mdashes
0,0.460716,0.096794,0.038736,0.053599,0.275820,0.330396,0.152331,0.000000,0.0,0.006917,0.000000,0.000000,0.010522
1,0.523395,0.100348,0.036697,0.051304,0.200486,0.255275,0.070633,0.000000,0.0,0.000000,0.014989,0.000000,0.000000
2,0.826037,0.047147,0.039755,0.008753,0.053463,0.052168,0.034621,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
3,0.504308,0.096918,0.025059,0.025690,0.368165,0.413633,0.081721,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
4,0.447175,0.102170,0.032805,0.018413,0.287971,0.361929,0.078156,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
41122,0.383776,0.066289,0.027523,0.041320,0.149453,0.159518,0.053906,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
41123,0.527134,0.103624,0.031892,0.022306,0.359660,0.418270,0.083916,0.000000,0.0,0.000000,0.000000,0.000000,0.000000
41124,0.448452,0.073234,0.016412,0.023430,0.415553,0.450730,0.152451,0.000000,0.0,0.004351,0.000000,0.006074,0.000000
41125,0.518183,0.093934,0.021625,0.026476,0.374241,0.415488,0.095888,0.015169,0.0,0.000000,0.000000,0.000000,0.000000


In [25]:
# Save the normalised training features.
# NOTE(review): this overwrites the raw StyloCorpus output that was read from the same path
# above; re-running the loading/cleaning cells without regenerating that file will fail
# because the saved frame no longer has a 'Title' column — consider a separate file name.
df1.to_csv('./train_stylometry_S2.csv',index=False)

In [26]:
# The same procedure is repeated for the test split.
# Start by writing the test split to one .txt file per record under ./dir_test
csv_to_txt('/content/drive/MyDrive/train_dataset/subtask_2/Test/test_S2.csv','./dir_test')


In [None]:
# Each corpus record was written to its own .txt file so that StyloCorpus can load them
# through a glob pattern.
# The generated .csv is written under drive_path, which is the relative path './' here
# (not an explicit Google Drive location).
drive_path='./'
# NOTE(review): despite its name, train_corpus holds the *test* corpus from this point on.
train_corpus = StyloCorpus.from_glob_pattern('./dir_test/*.txt')
train_corpus.output_csv(drive_path + 'test_stylometry_S2.csv')

In [30]:
# Load the raw feature table written by StyloCorpus for the test split.
df2=pd.read_csv('./test_stylometry_S2.csv') # "ciencias" (sciences) — NOTE(review): leftover note, meaning unclear here
df2

Unnamed: 0,Author,Title,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,Commas,Semicolons,...,Ands,Buts,Howevers,Ifs,Thats,Mores,Musts,Mights,This,Verys
0,dir_test,./dir_test/146367.txt,47.0790,5.35036,18.2143,7.66419,255,1466,44.6735,0.0,...,0.0000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.00000,0.0
1,dir_test,./dir_test/104876.txt,57.6923,5.95333,17.7692,8.58580,231,1362,61.5385,0.0,...,0.0000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.00000,0.0
2,dir_test,./dir_test/354.txt,41.3712,4.94286,12.7586,5.91698,370,1930,37.8251,0.0,...,30.7329,4.72813,0.0,4.72813,23.64070,4.72813,0.00000,0.0,4.72813,0.0
3,dir_test,./dir_test/72823.txt,72.2222,5.87179,14.0000,0.00000,42,326,129.6300,0.0,...,0.0000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.00000,0.0
4,dir_test,./dir_test/150815.txt,45.5696,5.92593,27.3750,3.70599,219,1381,42.1941,0.0,...,0.0000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17622,dir_test,./dir_test/64185.txt,55.5921,6.57396,40.1429,11.29380,281,1812,36.1842,0.0,...,0.0000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.00000,0.0
17623,dir_test,./dir_test/74344.txt,80.9524,4.76471,41.0000,0.00000,41,219,23.8095,0.0,...,0.0000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.00000,0.0
17624,dir_test,./dir_test/46382.txt,51.9417,4.67290,25.1667,11.90820,151,888,24.2718,0.0,...,0.0000,0.00000,0.0,0.00000,0.00000,0.00000,0.00000,0.0,0.00000,0.0
17625,dir_test,./dir_test/79109.txt,44.0909,4.77835,11.9643,7.11378,335,1790,56.8182,0.0,...,25.0000,11.36360,0.0,0.00000,9.09091,0.00000,0.00000,0.0,2.27273,0.0


In [31]:
# Recover the record id from each file path: 'Title' holds values such as './dir_test/354.txt'.
# regex=False makes the match literal; both patterns contain '.', which would act as a
# wildcard if they were interpreted as regular expressions (the default depends on the
# pandas version).
df2['Title'] = df2['Title'].str.replace('./dir_test/', '', regex=False)
df2['Title'] = df2['Title'].str.replace('.txt', '', regex=False)
# NOTE(review): 'Author' only holds the directory name ('dir_test'), not a class label; the
# rename is kept for compatibility with the rest of the notebook.
df2 = df2.rename(columns={'Title': 'id', 'Author': 'label'})
df2['id'] = df2['id'].astype(int)

In [32]:
# Inner join on 'id': keeps the feature rows whose id appears in `data`, in the order of `data`.
# data[['id']] is passed directly so the builtin `id` is not shadowed by a Series.
df2 = pd.merge(data[['id']], df2, on='id')

In [35]:
# Finally, normalise the test features with the scaler that was fitted on the training
# set (it is not re-fitted, so both splits share the same scaling).
# Selecting by `columns` keeps exactly the features, in the same order, that the scaler
# was fitted on, instead of relying on the positional slice 2:15; .copy() makes df2 an
# independent frame before values are assigned into it.
df2 = df2[columns].copy()
df2[columns] = scaler.transform(df2[columns])

In [36]:
# Display the normalised test features.
df2

Unnamed: 0,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,Commas,Semicolons,Quotes,Exclamations,Colons,Dashes,Mdashes
0,0.488957,0.101978,0.035168,0.021682,0.308627,0.370971,0.143429,0.000000,0.0,0.0,0.009684,0.0,0.0
1,0.579590,0.074305,0.043323,0.032423,0.094775,0.106422,0.125500,0.000000,0.0,0.0,0.000000,0.0,0.0
2,0.707678,0.073642,0.036315,0.036795,0.108141,0.122421,0.056299,0.000000,0.0,0.0,0.000000,0.0,0.0
3,0.516582,0.106862,0.035372,0.022250,0.427704,0.537213,0.083872,0.000000,0.0,0.0,0.000000,0.0,0.0
4,0.407002,0.098370,0.030340,0.025608,0.469016,0.562486,0.096877,0.000000,0.0,0.0,0.039247,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
17622,0.683752,0.061892,0.019878,0.008187,0.055893,0.062601,0.059940,0.000000,0.0,0.0,0.000000,0.0,0.0
17623,0.636403,0.073344,0.011954,0.008095,0.105711,0.111755,0.072360,0.000000,0.0,0.0,0.000000,0.0,0.0
17624,0.442292,0.097106,0.031498,0.032350,0.250304,0.281243,0.101671,0.000000,0.0,0.0,0.000000,0.0,0.0
17625,0.557536,0.105506,0.016460,0.014671,0.230863,0.314862,0.213966,0.022131,0.0,0.0,0.000000,0.0,0.0


In [37]:
# Save the normalised *test* features. The frame written must be df2: writing df1 here
# would store the training features under the test file name.
df2.to_csv('./test_stylometry_S2.csv',index=False)