A continuacionn varios utilities que se usaran durante todo el proceso. Cada funcion tiene una breve descripcion de lo que hace y como lo hace.

In [44]:
import pandas as pd
import numpy as np
import math

# Cargar archivo
df = pd.read_csv('./data/wur-2023.csv')

# Utiles de columnas
cols={
    'univ_rank':'University Rank',
    'univ_name':'Name of University',
    'loc':'Location',
    'students':'No of student',
    'students_staff':'No of student per staff',
    'inter_students':'International Student',
    'male_fem_ratio':'Female:Male Ratio',
    'overall_score':'OverAll Score',
    'teaching_score':'Teaching Score',
    'research_score':'Research Score',
    'citation_score':'Citations Score',
    'industry_score':'Industry Income Score',
    'inter_score':'International Outlook Score'
}

# Convierte una columna con valores de tipo enum a valores numericos (codificacion)
def column_as_enum(df, col_name:str):
    elems = {}
    def append_return(loc:str):
        size = len(elems)
        if loc in elems:
            return elems[loc]
        else:
            elems[loc] = size + 1
            return size + 1

    df[col_name] = df[col_name].map(append_return)
    
# Convierte cualquier valor en la columna que no sea numero a NaN
def non_numbers_as_nan(df, col_name):
    def to_nan_if_non_number(val):
        if val == 'nan':
            return np.nan
        try:
            return float(val)
        except ValueError:
            return np.nan
        except TypeError:
            return np.nan

    df[col_name] = df[col_name].apply(to_nan_if_non_number)

# Llena los valores de NaN con el promedio
def fill_with_mean(df,col_name):
    df[col_name].fillna(df[col_name].mean(), inplace=True)

# Llena los valores de NaN con la mediana
def fill_with_median(df,col_name):
    df[col_name].fillna(df[col_name].median(), inplace=True)

# Estandariza los valores numericos usando la media y la desviacion estandar
def standardize(df, col_name):
    mean = df[col_name].mean()
    std = df[col_name].std()
    standardized_column = (df[col_name] - mean) / std
    df[col_name] =  standardized_column

# convierte una columna de valores string a valores numericos. 
def col_to_num(df, col_name):
    def str_to_num(s):
        if isinstance(s,float) or isinstance(s,int):
            return s

        s = s.replace(',', '')
        s = s.replace(' ', '')
        s = s.replace('%', '')
        if s == '':
            return float('nan')
        return float(s) 
    
    df[col_name] = df[col_name].map(str_to_num)

# limpia la columna de la relacion 'mujeres':'hombres'
def clean_fem_male_col(df):
    def to_num(val):
        if isinstance(val,float) or isinstance(val,int):
            return val

        [fem,male] = val.replace(' ','').split(':')
        return float(fem) / 1 if float(male) == 0 else float(male)
    df[cols['male_fem_ratio']] = df[cols['male_fem_ratio']].map(to_num)

# limpia la columna de 'overall score'
def clean_overall_score(df):
    def to_num(val):
        if isinstance(val,float) or isinstance(val,int):
            return val
        res = val.replace(' ','').split('–')
        if len(res) == 1:
            return float(res[0])
        # take middle grounnd between ranges x-y
        return (float(res[0]) + float(res[1]))/2

    df[cols['overall_score']] = df[cols['overall_score']].map(to_num)

# limpia la columna del ranking de universidad
def clean_univ_rank(df):
    def to_num(val):
        # hay universidades con el valor 'Reporter' y '-'. Estas no recibieron ranking
        if val == 'Reporter' or val == '-':
            return np.nan
        # elimina el signo + del valor numerico en algunas observaciones
        val = val.replace('+','')
        # para los ranking entre dos valores asignamos el punto medio entre estos
        res = val.replace(' ','').split('–')
        
        if len(res) == 1:
            return float(res[0])
        return (float(res[0]) + float(res[1]))/2

    df[cols['univ_rank']] = df[cols['univ_rank']].map(to_num)

# llena la columna con NaN y estadariza los valores faltantes usando la media o mediana
def fill_and_standardize(df,col_name, use_mean=True):
    non_numbers_as_nan(df,col_name)

    if use_mean:
        fill_with_mean(df,col_name)
    else:
        fill_with_median(df,col_name)

    standardize(df, col_name)


A continuacion empezaremos con las transformaciones necesarias para nuestro dataset.
No encontramos razones para realizar transformaciones estructurales

In [45]:
#df.columns

# quitando las columnas que no usaremos
df = df.drop(columns = cols['univ_name'])

# columnas enum como valores numericos
column_as_enum(df,cols['loc'])

# limpiando columnas con valores numericos
col_to_num(df, cols['students'])
col_to_num(df, cols['inter_students'])
clean_univ_rank(df)
clean_fem_male_col(df)
clean_overall_score(df)

Queremos saber en que columnas es mejor reemplazar los valores NaN con el promedio o la mediana. El promedio se ve bastante afectado por los valores 'outliers' muy grandes. Asi que analizaremos los datos para ver que reemplazo aplicar a que columna.

Como saber si tenemos valores sobresalientes? si el promedio es pequeno comparado con el valor maximo entonces tenemos outliers.

Entonces las variables con valores sobresalientes son: students y students_staff

In [46]:
df.drop(columns=[cols['loc']]).describe()

Unnamed: 0,University Rank,No of student,No of student per staff,International Student,Female:Male Ratio,OverAll Score,Teaching Score,Research Score,Citations Score,Industry Income Score,International Outlook Score
count,1697.0,2209.0,2208.0,2206.0,2128.0,1799.0,1799.0,1799.0,1799.0,1799.0,1799.0
mean,858.422216,19617.416478,19.000408,10.126473,49.861842,34.16587,27.01801,23.016898,48.495887,47.104558,46.880378
std,481.473274,25191.725143,12.132224,13.414442,13.679716,16.162911,13.282243,16.763819,27.967185,15.093682,22.582401
min,1.0,115.0,0.4,0.0,1.0,14.35,11.6,7.4,0.8,36.9,14.1
25%,450.5,6880.0,12.6,1.0,41.0,21.35,18.0,11.3,23.1,37.8,27.9
50%,900.5,14292.0,16.6,5.0,47.0,31.85,22.7,17.0,47.2,40.5,42.1
75%,1350.5,25884.0,22.2,14.0,57.0,43.5,31.85,28.9,72.35,48.3,62.1
max,1501.0,460632.0,232.2,100.0,100.0,96.4,94.8,99.7,100.0,100.0,99.7


In [48]:

# estandarizando las columnas con valores numericos

# aqui en estas columnas llenaremos los valores con la media
fill_and_standardize(df,cols['students'], use_mean=False)
fill_and_standardize(df, cols['students_staff'], use_mean=False)

# para el resto donde no existen valores sobresalientes podemos usar el promedio
fill_and_standardize(df,cols['inter_students'])
fill_and_standardize(df,cols['male_fem_ratio'])
fill_and_standardize(df, cols['teaching_score'])
fill_and_standardize(df, cols['research_score'])
fill_and_standardize(df, cols['citation_score'])
fill_and_standardize(df, cols['industry_score'])
fill_and_standardize(df, cols['inter_score'])
fill_and_standardize(df, cols['overall_score'])

# para la columna 'University Rank' nos quedamos con valores NaN.
# Estos vienen de las universidades donde el valor de esta variable era 'Reporter'
# Dependiendo de la variable objetivo que queremos predecir deberiamos o no eliminar estos valores. 
# En este caso se asume que se quiere predecir la variable 'University Rank'. 
# Por lo tanto los datos sin esta variable no son muy utiles, los removemos
df = df.dropna()
# estandarizamos la variable para que 
standardize(df, cols['univ_rank'])

df

Unnamed: 0,University Rank,Location,No of student,No of student per staff,International Student,Female:Male Ratio,OverAll Score,Teaching Score,Research Score,Citations Score,Industry Income Score,International Outlook Score
0,-1.780830,1,-0.073435,-0.722641,2.544723,0.219737,3.790142,4.860342,4.487709,1.781368,1.849901,2.153997
1,-1.778753,2,-0.039862,-0.811125,1.164808,0.054794,3.716455,5.046879,4.446537,1.792201,0.165071,1.462859
2,-1.776676,1,-0.101838,-0.660703,2.301209,0.302208,3.691892,4.755881,4.475945,1.709147,0.476831,2.136388
3,-1.776676,2,-0.248257,-1.032334,1.083636,0.384679,3.691892,5.002110,4.311261,1.810256,1.193215,1.432044
4,-1.772523,2,-0.421185,-0.935002,1.814180,0.879506,3.655049,4.740959,4.128931,1.810256,2.911211,1.850248
...,...,...,...,...,...,...,...,...,...,...,...,...
1692,1.334607,39,0.024590,0.135651,-0.377451,1.291861,-1.248199,-0.690993,-0.435190,-1.255490,-0.100256,-0.879087
1693,1.334607,8,-0.446346,-0.678399,-0.296280,1.044448,-1.248199,-0.437303,-0.652809,-1.291600,-0.213020,-0.742620
1694,1.334607,11,0.570501,-0.492584,-0.783309,0.961977,-1.248199,-0.743223,-0.735151,-1.132716,-0.007391,-1.332509
1695,1.334607,41,-0.216431,0.834672,-0.458623,-0.357561,-1.248199,-0.623840,-0.658691,-1.414375,1.239648,-0.777837
