In [7]:
# Celdas para leer directamente un archivo .arff en Python

# 1. Importar librerías necesarias
import pandas as pd
from scipy.io import arff
# Removed incorrect import of load_arff
from pathlib import Path
from sklearn.preprocessing import LabelEncoder

In [8]:
# 2. Location of the .arff file
darff_path = Path('Sapfile1.arff')  # <-- adjust if the file lives in another folder

# 3. Parse the .arff with scipy; loadarff hands back a (data, meta) pair
data_scipy, meta_scipy = arff.loadarff(str(darff_path))

# Wrap the loaded records in a DataFrame
df_scipy = pd.DataFrame(data_scipy)

# Object-dtype columns are decoded from bytes to UTF-8 text
for col in df_scipy.select_dtypes(include=[object]).columns:
    df_scipy[col] = df_scipy[col].str.decode('utf-8')

# Preview the first rows
print("Primeras filas con scipy:", df_scipy.head(), sep="\n")

# Removed scikit-learn related code as load_arff is not available

# Opcional: mostrar información general
def resumen(df):
    """Print a general overview of a DataFrame.

    Emits the structural report produced by ``df.info()`` followed by the
    table returned by ``df.describe()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to summarise.

    Returns
    -------
    None
    """
    # DataFrame.info() writes its report to stdout on its own and returns
    # None, so wrapping it in print() would append a stray "None" line.
    df.info()
    print(df.describe())

# resumen(df_scipy)


Primeras filas con scipy:
  ge   cst   tnp   twp iap   esp arr         ms ls    as  ...  fq  mq  \
0  F     G  Good  Good  Vg  Good   Y  Unmarried  V  Paid  ...  Um  10   
1  M   OBC    Vg    Vg  Vg    Vg   N  Unmarried  V  Paid  ...  Um  Il   
2  F   OBC  Good  Good  Vg  Good   N  Unmarried  V  Paid  ...  12  10   
3  M  MOBC  Pass  Good  Vg  Good   N  Unmarried  V  Paid  ...  12  Um   
4  M     G  Good  Good  Vg    Vg   N  Unmarried  V  Paid  ...  10  12   

         fo         mo       nf       sh       ss   me       tt      atd  
0    Farmer  Housewife    Large     Poor     Govt  Asm    Small     Good  
1   Service    Service    Small     Poor     Govt  Asm  Average  Average  
2   Service  Housewife  Average  Average     Govt  Asm    Large     Good  
3  Business   Business    Large     Poor     Govt  Asm  Average  Average  
4   Service  Housewife    Large     Poor  Private  Asm    Small     Good  

[5 rows x 22 columns]


In [9]:
# Row count = number of instances
print("Número de instancias:", len(df_scipy))

# Genuine missing values (NaN) per column
print("\nValores nulos por columna:", df_scipy.isna().sum(), sep="\n")

# '?' may stand in for missing data, so tally it in every object-dtype column
print("\nValores '?' por columna:")
for col in df_scipy.columns:
    if df_scipy[col].dtype != object:
        continue
    print(f"{col}: {df_scipy[col].eq('?').sum()}")


Número de instancias: 131

Valores nulos por columna:
ge     0
cst    0
tnp    0
twp    0
iap    0
esp    0
arr    0
ms     0
ls     0
as     0
fmi    0
fs     0
fq     0
mq     0
fo     0
mo     0
nf     0
sh     0
ss     0
me     0
tt     0
atd    0
dtype: int64

Valores '?' por columna:
ge: 0
cst: 0
tnp: 0
twp: 0
iap: 0
esp: 0
arr: 0
ms: 0
ls: 0
as: 0
fmi: 0
fs: 0
fq: 0
mq: 0
fo: 0
mo: 0
nf: 0
sh: 0
ss: 0
me: 0
tt: 0
atd: 0


In [10]:
# Report the dtype of every column in the loaded frame
print("Tipos de datos del DataFrame:", df_scipy.dtypes, sep="\n")


Tipos de datos del DataFrame:
ge     object
cst    object
tnp    object
twp    object
iap    object
esp    object
arr    object
ms     object
ls     object
as     object
fmi    object
fs     object
fq     object
mq     object
fo     object
mo     object
nf     object
sh     object
ss     object
me     object
tt     object
atd    object
dtype: object


In [11]:
# Target variable analysis: how many rows fall into each 'atd' value
print("Valores únicos en 'atd':", df_scipy['atd'].value_counts(), sep="\n")


Valores únicos en 'atd':
atd
Good       56
Average    47
Poor       28
Name: count, dtype: int64


In [12]:
# Reorder the columns so that the target 'atd' comes last
target = 'atd'
df_scipy = df_scipy[[*df_scipy.columns.drop(target), target]]

# Encode a copy so the decoded frame itself stays untouched
df_encoded = df_scipy.copy()

# One LabelEncoder per column (target included), kept in a dict so the
# integer codes can be mapped back to the original labels later on.
# NOTE(review): LabelEncoder is documented for target labels and numbers the
# classes in sorted order, so the codes do not reflect any ranking among the
# categories -- confirm this is acceptable for the feature columns.
label_encoders = {col: LabelEncoder() for col in df_encoded.columns}
for col, le in label_encoders.items():
    df_encoded[col] = le.fit_transform(df_encoded[col])

# Show the first encoded rows
df_encoded.head()

Unnamed: 0,ge,cst,tnp,twp,iap,esp,arr,ms,ls,as,...,fq,mq,fo,mo,nf,sh,ss,me,tt,atd
0,0,0,1,1,3,1,1,0,1,1,...,5,0,1,1,1,2,0,0,2,1
1,1,2,3,3,3,3,0,0,1,1,...,5,3,4,4,2,2,0,0,0,0
2,0,2,1,1,3,1,0,0,1,1,...,1,0,4,1,0,0,0,0,1,1
3,1,1,2,1,3,1,0,0,1,1,...,1,5,0,0,1,2,0,0,0,0
4,1,0,1,1,3,3,0,0,1,1,...,0,1,4,1,1,2,1,0,2,1
