In [1]:
# Data-handling libraries used throughout the notebook.
import pandas as pd
import numpy as np
# NOTE(review): PCA is not referenced in the cells visible here — confirm it is still needed.
from sklearn.decomposition import PCA
# Download NLTK's 'punkt' resource at runtime.
# NOTE(review): presumably required by the stylometry package for tokenization — confirm.
import nltk
nltk.download('punkt')
# Mount Google Drive at /content/drive (Colab only); the dataset paths used
# later in the notebook live under this mount point.
from google.colab import drive
drive.mount('/content/drive')


In [2]:
# The folder that contains the stylometry code must be importable so that the
# stylometric feature extractors can be loaded.
# NOTE(review): this wildcard import presumably provides StyloCorpus, which
# later cells use — confirm, and prefer importing the needed names explicitly.
from drive.MyDrive.stylometry.stylometry.extract import *

In [5]:
# Load the original dataset (JSON Lines, one record per line) and encode the
# label as a binary target: 1 for 'generated', 0 for every other value.
data = pd.read_json(
    '/content/drive/MyDrive/train_dataset/subtask_1/subtask_1.jsonl',
    lines=True,
)
data['label'] = (data['label'] == 'generated').astype(int)


In [10]:
# Se define la función para pasar los datos de entrenamiento y prueba en formato .csv a .txt
from pathlib import Path

def csv_to_txt(df_path, dir_path):
  """Split a CSV dataset into one .txt file per row, grouped by label.

  Every row is written to ``<dir_path>/<label>/<id>.txt`` so that the
  resulting tree can be consumed with a glob pattern such as
  ``<dir_path>/*/*.txt``.

  Parameters
  ----------
  df_path: str or Path
    Path of the dataset in CSV format. It must contain the columns 'id'
    (used as the file name), 'label' (used as the sub-directory name) and
    'text' (the file content).
  dir_path: str or Path
    Root directory where the generated files are stored. It is created,
    together with any missing parents, when it does not exist.

  Raises
  ------
  KeyError
    If one of the columns 'id', 'label' or 'text' is missing.
  """
  # Read and write with an explicit encoding so the result does not depend on
  # the locale of the machine running the notebook.
  data = pd.read_csv(df_path, encoding='utf-8')

  root = Path(dir_path)
  root.mkdir(exist_ok=True, parents=True)

  # Remember which label directories already exist to avoid one filesystem
  # check per row.
  created_dirs = set()
  for doc_id, label, text in zip(data['id'], data['label'], data['text']):
    # str() lets numeric labels (e.g. 0/1) be used as directory names too.
    label_dir = root / str(label)
    if label_dir not in created_dirs:
      label_dir.mkdir(exist_ok=True)
      created_dirs.add(label_dir)
    with open(label_dir / (str(doc_id) + '.txt'), 'w', encoding='utf-8') as out:
      out.write(text)


In [14]:
# Start by exporting the training split: one .txt file per document under ./dir.
# NOTE(review): the folder name 'Train_finales ' ends with a space — confirm it
# matches the actual folder name on Drive.
csv_to_txt(
    df_path='/content/drive/MyDrive/train_dataset/subtask_1/Train_finales /train_S1.csv',
    dir_path='./dir',
)

In [4]:
# The corpus records were split into individual .txt files so that StyloCorpus
# can load them: every file matching ./dir/*/*.txt becomes part of the corpus,
# and the corpus is then written to train_stylometry_S1.csv in the current directory.
drive_path='./'
train_corpus = StyloCorpus.from_glob_pattern('./dir/*/*.txt')
train_corpus.output_csv(drive_path + 'train_stylometry_S1.csv')

In [66]:
# Load the stylometric features produced for the training split and preview
# the first two rows.
df1=pd.read_csv('./train_stylometry_S1.csv') 
df1.head(2)

Unnamed: 0,Author,Title,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,Commas,Semicolons,...,Ands,Buts,Howevers,Ifs,Thats,Mores,Musts,Mights,This,Verys
0,human,./dir/human/114333.txt,71.0526,4.30864,11.2222,8.81637,101,498,8.77193,0.0,...,8.77193,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.77193,0.0
1,human,./dir/human/65765.txt,62.6741,6.17778,16.9412,8.01815,288,2016,139.276,13.9276,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [67]:
# Recover the document ids and labels: 'Title' holds the path of each exported
# file ('./dir/<label>/<id>.txt') and 'Author' holds its label directory.
# The id is taken from the file name with an anchored pattern instead of chained
# str.replace calls, whose default `regex` behaviour differs between pandas
# versions (in pandas < 2.0 the '.' in './dir/...' and '.txt' is a regex wildcard).
df1['Title'] = df1['Title'].str.extract(r'([^/\\]+)\.txt$', expand=False)
df1 = df1.rename(columns={'Title': 'id', 'Author': 'label'})
df1['id'] = df1['id'].astype(int)

In [68]:
# Align the stylometry rows with the original dataset: the (default) inner
# merge on 'id' keeps only the ids present in both frames, in the order in
# which they appear in `data`.
# Selecting data[['id']] directly avoids binding a global named `id`, which
# would shadow Python's built-in id() for the rest of the session.
df1 = pd.merge(data[['id']], df1, on='id')

In [69]:
from sklearn.preprocessing import MinMaxScaler

# Normalize the training set with MinMaxScaler.
# Keep only the columns at positions 2-14 (the recorded output shows these are
# the stylometric features LexicalDiversity ... Mdashes). The explicit .copy()
# makes the result an independent frame, so the assignment below cannot raise
# SettingWithCopyWarning.
# NOTE: this cell rebinds df1; re-run the previous cells before running it again.
df1 = df1.iloc[:, 2:15].copy()
columns = list(df1.columns)

# `scaler` and `columns` are reused later to transform the test set with the
# statistics learned from the training data only.
scaler = MinMaxScaler()
df1[columns] = scaler.fit_transform(df1[columns])
df1

Unnamed: 0,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,Commas,Semicolons,Quotes,Exclamations,Colons,Dashes,Mdashes
0,0.459051,0.096794,0.038736,0.053599,0.298292,0.289281,0.152331,0.000000,0.0,0.004537,0.000000,0.0,0.007216
1,1.000000,0.053534,0.018349,0.000000,0.003942,0.007714,0.133867,0.000000,0.0,0.000000,0.000000,0.0,0.000000
2,0.521924,0.100348,0.036697,0.051304,0.216820,0.223508,0.070633,0.000000,0.0,0.000000,0.010183,0.0,0.000000
3,0.578293,0.074305,0.043323,0.032423,0.102497,0.093179,0.125500,0.000000,0.0,0.000000,0.000000,0.0,0.000000
4,0.825500,0.047147,0.039755,0.008753,0.057819,0.045676,0.034621,0.000000,0.0,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...
76759,0.556170,0.105506,0.016460,0.014671,0.249671,0.275680,0.213966,0.022131,0.0,0.000000,0.000000,0.0,0.000000
76760,0.812575,0.066365,0.023955,0.033962,0.052562,0.048112,0.074370,0.000000,0.0,0.000000,0.000000,0.0,0.000000
76761,0.391419,0.086032,0.034719,0.038527,0.516426,0.471376,0.083120,0.000000,0.0,0.000000,0.008829,0.0,0.000000
76762,0.420763,0.101449,0.156728,0.263319,0.530880,0.484369,0.224033,0.000000,0.0,0.000000,0.000000,0.0,0.000000


In [30]:
# Persist the normalized training features without the index column.
# NOTE(review): this overwrites './train_stylometry_S1.csv', the same file the
# raw StyloCorpus output was read from earlier — re-running the loading cell
# afterwards will read the normalized data instead of the raw features.
df1.to_csv(
    path_or_buf='./train_stylometry_S1.csv',
    index=False,
)

In [89]:
# The same procedure is applied to the test split.
# Start by exporting it: one .txt file per document under ./dir_test.
csv_to_txt(
    df_path='/content/drive/MyDrive/train_dataset/subtask_1/Test_finales/test_S1.csv',
    dir_path='./dir_test',
)

In [6]:
# The corpus records were split into individual .txt files so that StyloCorpus
# can load them: every file matching ./dir_test/*/*.txt becomes part of the
# corpus, which is then written to test_stylometry_S1.csv in the current directory.
# NOTE(review): the test corpus is bound to the name `train_corpus`, replacing
# the training corpus object created earlier — consider a dedicated name.
drive_path='./'
train_corpus = StyloCorpus.from_glob_pattern('./dir_test/*/*.txt')
train_corpus.output_csv(drive_path + 'test_stylometry_S1.csv')

In [91]:
# Load the stylometric features produced for the test split and display them.
df2=pd.read_csv('./test_stylometry_S1.csv') 
df2

Unnamed: 0,Author,Title,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,Commas,Semicolons,...,Ands,Buts,Howevers,Ifs,Thats,Mores,Musts,Mights,This,Verys
0,human,./dir_test/human/42590.txt,51.4412,5.00431,12.1667,11.75330,365,1993,53.2151,2.21729,...,17.7384,4.43459,0.0,0.0,8.86918,2.21729,0.0,0.0,6.65188,0.0
1,human,./dir_test/human/99604.txt,71.8750,6.19022,14.4667,7.07923,217,1520,101.5620,0.00000,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
2,human,./dir_test/human/3562.txt,54.8961,5.32973,17.8125,14.77840,285,1542,83.0861,14.83680,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
3,human,./dir_test/human/67574.txt,100.0000,3.94444,8.0000,2.00000,16,85,55.5556,0.00000,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
4,human,./dir_test/human/126798.txt,77.5000,5.48387,8.2500,3.89711,33,226,100.0000,0.00000,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32894,generated,./dir_test/generated/121528.txt,42.8058,5.63025,13.3889,4.47386,241,1303,64.7482,0.00000,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
32895,generated,./dir_test/generated/68418.txt,68.1529,5.73832,23.5000,5.18813,141,852,44.5860,0.00000,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
32896,generated,./dir_test/generated/25090.txt,58.2915,5.76724,25.1429,7.88178,176,1081,50.2513,0.00000,...,45.2261,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0
32897,generated,./dir_test/generated/52667.txt,85.7143,3.75000,25.0000,0.00000,25,119,71.4286,0.00000,...,0.0000,0.00000,0.0,0.0,0.00000,0.00000,0.0,0.0,0.00000,0.0


In [92]:
# Recover the document ids and labels: 'Title' holds the path of each exported
# file ('./dir_test/<label>/<id>.txt') and 'Author' holds its label directory.
# The id is taken from the file name with an anchored pattern instead of chained
# str.replace calls, whose default `regex` behaviour differs between pandas
# versions (in pandas < 2.0 the '.' in './dir_test/...' and '.txt' is a regex wildcard).
df2['Title'] = df2['Title'].str.extract(r'([^/\\]+)\.txt$', expand=False)
df2 = df2.rename(columns={'Title': 'id', 'Author': 'label'})
df2['id'] = df2['id'].astype(int)

In [93]:
# Align the stylometry rows with the original dataset: the (default) inner
# merge on 'id' keeps only the ids present in both frames, in the order in
# which they appear in `data`.
# Selecting data[['id']] directly avoids binding a global named `id`, which
# would shadow Python's built-in id() for the rest of the session.
df2 = pd.merge(data[['id']], df2, on='id')

In [94]:
# Finally, normalize the test set with the scaler fitted on the training set
# (`scaler` and `columns` come from the training normalization cell), so no
# statistics are learned from the test data.
# The explicit .copy() makes the slice an independent frame, so the assignment
# below cannot raise SettingWithCopyWarning.
df2 = df2.iloc[:, 2:15].copy()
df2[columns] = scaler.transform(df2[columns])

In [95]:
# Display the normalized test features.
df2

Unnamed: 0,LexicalDiversity,MeanWordLen,MeanSentenceLen,StdevSentenceLen,MeanParagraphLen,DocumentLen,Commas,Semicolons,Quotes,Exclamations,Colons,Dashes,Mdashes
0,0.487379,0.101978,0.035168,0.021682,0.333771,0.324807,0.143429,0.000000,0.0,0.000000,0.006579,0.000000,0.0
1,0.502778,0.096918,0.025059,0.025690,0.398160,0.362160,0.081721,0.000000,0.0,0.000000,0.000000,0.000000,0.0
2,0.838065,0.059096,0.011468,0.010941,0.009198,0.015428,0.080320,0.000000,0.0,0.000000,0.000000,0.000000,0.0
3,0.793449,0.037436,0.007900,0.012987,0.035480,0.029842,0.040980,0.000000,0.0,0.000000,0.000000,0.000000,0.0
4,0.700120,0.064036,0.022936,0.000000,0.007884,0.015022,0.371851,0.000000,0.0,0.000000,0.375244,0.000000,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
32894,0.635281,0.073344,0.011954,0.008095,0.114323,0.097848,0.072360,0.000000,0.0,0.000000,0.000000,0.000000,0.0
32895,0.492282,0.081865,0.020280,0.047196,0.342970,0.289890,0.092779,0.000000,0.0,0.000000,0.000000,0.000000,0.0
32896,0.440571,0.097106,0.031498,0.032350,0.270696,0.246244,0.101671,0.000000,0.0,0.000000,0.000000,0.000000,0.0
32897,0.566245,0.086648,0.021611,0.051754,0.285151,0.253553,0.108541,0.041699,0.0,0.035562,0.007824,0.018954,0.0


In [None]:
# Persist the normalized test features without the index column.
# NOTE(review): this overwrites './test_stylometry_S1.csv', the same file the
# raw StyloCorpus output was read from earlier — re-running the loading cell
# afterwards will read the normalized data instead of the raw features.
df2.to_csv(
    path_or_buf='./test_stylometry_S1.csv',
    index=False,
)