In [15]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
from sklearn.preprocessing import LabelEncoder

data = pd.read_csv('/content/dbcompleta.csv')

print (f"****Columnas antes de la limpieza y trasformacion de datos****")
print(data.head())

# Elimina las columnas especificadas
columns_to_drop = ['animal_id', 'animal_id','age_upon_intake', 'age_upon_outcome', 'animal_type.1','found_location','animal_id.1','color.1', 'breed.1', 'datetime_obj', 'datetime_obj.1' ]
data.drop(columns=columns_to_drop, inplace=True)

# Separa la columna sex_upon_intake en sex y reprod.
data[['sex_intake', 'reprod_intake']] = data['sex_upon_intake'].str.split(' ', expand=True)

# Separa la columna sex_upon_outcome en sex y reprod.
data[['sex_outcome', 'reprod_outcome']] = data['sex_upon_outcome'].str.split(' ', expand=True)


# Elimina las columnas originales
data.drop(columns=['sex_upon_intake', 'sex_upon_outcome'], inplace=True)


# Selección de las columnas categóricas a codificar
categorical_cols = ['outcome_type', 'animal_type', 'intake_condition', 'intake_type','sex_outcome', 'reprod_outcome', 'sex_intake', 'reprod_intake']

label_encoder = LabelEncoder()
# Iterar columnas categóricas y aplicar el Label Encoding
for col in categorical_cols:
    data[col] = label_encoder.fit_transform(data[col])


# Verifica los valores nulos en las columnas sex_intake y sex_outcomes
null_sex_intake = data[data['sex_intake'].isnull()]
null_sex_outcome = data[data['sex_outcome'].isnull()]

# Imprime la cantidad de valores nulos en las columnas sex_intake y sex_outcomes
print("Cantidad de valores nulos en la columna 'sex_intake':", null_sex_intake.shape[0])
print("Cantidad de valores nulos en la columna 'sex_outcome':", null_sex_outcome.shape[0])

# Calcula la mediana de las columnas sex_intake y sex_outcome
median_sex_intake = data['sex_intake'].median()
median_sex_outcome = data['sex_outcome'].median()

# Rellena los valores nulos con la mediana
data['sex_intake'].fillna(median_sex_intake, inplace=True)
data['sex_outcome'].fillna(median_sex_outcome, inplace=True)




# map outcome_types  0 or 1
def map_adopted(outcome_type):
    if outcome_type in ['Adoption', 'Return to Owner', 'Rto-Adopt']:
        return 1
    else:
        return 0

# Create the 'adopted' column based on the conditions
data['adopted'] = data['outcome_type'].apply(lambda x: map_adopted(x))

print (f"****Columnas despues de la limpieza y trasformacion de datos****")
print(data.head())

# Guarda el DataFrame resultante en un nuevo archivo CSV
data.to_csv('/content/drive/MyDrive/dbcompleta_limpio', index=False)



Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
****Columnas antes de la limpieza y trasformacion de datos****
  animal_id         datetime_obj  \
0   A786884  2019-01-03 16:19:00   
1   A706918  2015-07-05 12:59:00   
2   A682524  2014-06-29 10:38:00   
3   A823407  2020-09-23 10:49:00   
4   A823407  2020-09-23 10:49:00   

                                      found_location intake_type  \
0                2501 Magin Meadow Dr in Austin (TX)       Stray   
1                   9409 Bluegrass Dr in Austin (TX)       Stray   
2                      800 Grove Blvd in Austin (TX)       Stray   
3  Rosewood Avenue And Poquito Street in Austin (TX)       Stray   
4  Rosewood Avenue And Poquito Street in Austin (TX)       Stray   

  intake_condition animal_type sex_upon_intake age_upon_intake  \
0           Normal         Dog   Neutered Male         2 years   
1           Normal         Dog   Spayed Female    

In [16]:

# valores nulos en las columnas color y breed y Rellena con 'other'
null_color = data[data['color'].isnull()]
data['color'].fillna('other', inplace=True)

null_breed = data[data['breed'].isnull()]
data['breed'].fillna('other', inplace=True)# Reemplaza '/' con un espacio en blanco en las columnas color y breed
data['color'] = data['color'].str.replace('/', ' ')
data['breed'] = data['breed'].str.replace('/', ' ')

# Divide las columnas color y breed en el primer espacio
split_color = data['color'].str.split(n=1, expand=True)
split_breed = data['breed'].str.split(n=1, expand=True)

# Asigna las partes divididas a las columnas correspondientes
data['color'] = split_color[0]
data['color1'] = split_color[1] if len(split_color.columns) > 1 else None
data['breed'] = split_breed[0]
data['breed1'] = split_breed[1] if len(split_breed.columns) > 1 else None

print(data.head)


<bound method NDFrame.head of         intake_type  intake_condition  animal_type     breed     color  \
0                 4                11            2    Beagle  Tricolor   
1                 4                11            2   English     White   
2                 4                11            2  Doberman       Tan   
3                 4                11            1  Domestic     Brown   
4                 4                11            1  Domestic     Brown   
...             ...               ...          ...       ...       ...   
205632            6                20            5     other     other   
205633            6                20            5     other     other   
205634            6                20            5     other     other   
205635            6                20            5     other     other   
205636            6                20            5     other     other   

        outcome_type  sex_intake  reprod_intake  sex_outcome  reprod_outcome  \
0