# **01. Estimados de locación y variabilidad**

In [1]:
import pandas as pd
from scipy import stats

## Limpieza de datos

In [11]:
survey = pd.read_csv('/content/drive/MyDrive/BEDU/COVID-19 Survey Student Responses.csv')
df = survey.copy()

In [12]:
df.isna().sum()

ID                                                                                     0
Region of residence                                                                    0
Age of Subject                                                                         0
Time spent on Online Class                                                             0
Rating of Online Class experience                                                     24
Medium for online class                                                               51
Time spent on self study                                                               0
Time spent on fitness                                                                  0
Time spent on sleep                                                                    0
Time spent on social media                                                             0
Prefered social media platform                                                         0
Time spent on TV     

In [13]:
df = df.drop('ID', axis=1)
#df.index.name = 'ID'

In [14]:
df['Medium for online class'] = df['Medium for online class'].fillna('No medium')

In [15]:
df['Rating of Online Class experience'] = df['Rating of Online Class experience'].fillna('No rating')

In [30]:
df[~df['Time spent on TV'].map(lambda x:x.isnumeric())]

Unnamed: 0,Region of residence,Age of Subject,Time spent on Online Class,Rating of Online Class experience,Medium for online class,Time spent on self study,Time spent on fitness,Time spent on sleep,Time spent on social media,Prefered social media platform,Time spent on TV,Number of meals per day,Change in your weight,Health issue during lockdown,Stress busters,Time utilized,"Do you find yourself more connected with your family, close friends , relatives ?",What you miss the most
13,Outside Delhi-NCR,22,1.0,Good,Laptop/Desktop,2.0,0.0,7.0,0.0,,0.5,2,Increased,NO,Reading,YES,YES,Travelling
18,Delhi-NCR,21,0.0,Very poor,Laptop/Desktop,4.0,0.0,7.0,7.0,Youtube,n,2,Remain Constant,NO,Reading books,NO,NO,"Friends , relatives"
28,Delhi-NCR,31,8.0,Average,Laptop/Desktop,4.0,1.0,6.0,2.0,Linkedin,0.5,3,Remain Constant,NO,Listening to music,YES,YES,Job
33,Delhi-NCR,22,5.0,Very poor,Laptop/Desktop,1.0,0.0,6.0,2.0,Linkedin,0.5,3,Decreased,YES,Watching web series,YES,YES,"Friends , relatives"
53,Delhi-NCR,19,0.0,Average,Smartphone,2.0,2.0,10.0,1.0,Instagram,0.5,4,Remain Constant,NO,Reading books,YES,YES,Roaming around freely
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1138,Delhi-NCR,15,8.0,Good,Laptop/Desktop,1.5,0.5,8.0,0.5,Whatsapp,0.3,3,Remain Constant,NO,Online gaming,NO,YES,Colleagues
1139,Delhi-NCR,14,5.0,Average,Smartphone or Laptop/Desktop,2.0,0.0,9.0,0.5,Whatsapp,0.5,4,Increased,YES,Listening to music,YES,YES,Travelling
1145,Outside Delhi-NCR,13,6.0,Good,Smartphone,2.0,0.0,11.0,1.0,Whatsapp,0.5,3,Remain Constant,NO,Online gaming,YES,YES,Colleagues
1146,Outside Delhi-NCR,11,5.0,Average,Smartphone,2.0,1.0,8.0,1.0,Whatsapp,0.5,1,Decreased,NO,Watching web series,YES,NO,School/college


In [7]:
df['Time spent on TV'] = pd.to_numeric(df['Time spent on TV'], downcast='float', errors='coerce')
df.isna().sum()

Region of residence                                                                    0
Age of Subject                                                                         0
Time spent on Online Class                                                             0
Rating of Online Class experience                                                      0
Medium for online class                                                                0
Time spent on self study                                                               0
Time spent on fitness                                                                  0
Time spent on sleep                                                                    0
Time spent on social media                                                             0
Prefered social media platform                                                         0
Time spent on TV                                                                      13
Number of meals per d

## Identificación de las columnas del dataset con datos numéricos.

In [None]:
df.to_csv('/content/drive/MyDrive/BEDU/analisis_datos_python_2021/COVID-19 Survey Student Responses-clean.csv', index=0)
df.iloc[18]

Region of residence                                                                             Delhi-NCR
Age of Subject                                                                                         21
Time spent on Online Class                                                                              0
Rating of Online Class experience                                                               Very poor
Medium for online class                                                                    Laptop/Desktop
Time spent on self study                                                                                4
Time spent on fitness                                                                                   0
Time spent on sleep                                                                                     7
Time spent on social media                                                                              7
Prefered social media platform                

La primera columna de interés para el análisis es `Age of subject`, es decir, las edades de los estudiantes encuestados. A partir de ella, nuestros datos numéricos son aquellos que no son de tipo objeto.

In [None]:
df.dtypes

Region of residence                                                                    object
Age of Subject                                                                          int64
Time spent on Online Class                                                            float64
Rating of Online Class experience                                                      object
Medium for online class                                                                object
Time spent on self study                                                              float64
Time spent on fitness                                                                 float64
Time spent on sleep                                                                   float64
Time spent on social media                                                            float64
Prefered social media platform                                                         object
Time spent on TV                                            

Ahora visualizamos sólo las columnas numéricas de nuestro Dataframe original.

In [None]:
df_num = df.select_dtypes('number')
df_num.head()

Unnamed: 0,Age of Subject,Time spent on Online Class,Time spent on self study,Time spent on fitness,Time spent on sleep,Time spent on social media,Number of meals per day
0,21,2.0,4.0,0.0,7.0,3.0,4
1,21,0.0,0.0,2.0,10.0,3.0,3
2,20,7.0,3.0,0.0,6.0,2.0,3
3,20,3.0,2.0,1.0,6.0,5.0,3
4,21,3.0,3.0,1.0,8.0,3.0,4



## Identificación de la relevancia de las columnas.


---


*   La columna `Age of subject` es importante para obtener conclusiones en relación a la edad, que está directamente relacionada con la madurez, pensamientos y comportamiento.
*   Las columnas `Time spent on Online Class` ... `Time spent on social media` nos permite estudiar los hábitos de estudio de los encuestados.
*   La columna `Number of meals per day` nos permite relacionar los hábitos alimenticios con la productividad, la salud y las relaciones personales.





## Obtención de estimados para las columnas numéricas

---

### *Promedio.*



In [None]:
df_num.mean()

Age of Subject                20.165821
Time spent on Online Class     3.208841
Time spent on self study       2.911591
Time spent on fitness          0.765821
Time spent on sleep            7.871235
Time spent on social media     2.365694
Number of meals per day        2.917936
dtype: float64

### *Mediana.*

In [None]:
df_num.median()

Age of Subject                20.0
Time spent on Online Class     3.0
Time spent on self study       2.0
Time spent on fitness          1.0
Time spent on sleep            8.0
Time spent on social media     2.0
Number of meals per day        3.0
dtype: float64

### *Media truncada.*

---

Al aumentar el proncentaje de datos extremos eliminados, la variación de la media para el 10% es casi imperceptible y para el 20% tampoco se encuentran diferencias significativas.


In [None]:
for i in range(7):
  print(df_num.columns[i], stats.trim_mean(df_num, 0.1)[i])

Age of Subject 19.54016913319239
Time spent on Online Class 3.1282769556025367
Time spent on self study 2.6421775898520083
Time spent on fitness 0.6751585623678648
Time spent on sleep 7.753488372093024
Time spent on social media 2.1028541226215642
Number of meals per day 2.9112050739957716


In [None]:
for i in range(7):
  print(df_num.columns[i], round(stats.trim_mean(df_num, 0.2)[i],2))

Age of Subject 19.71
Time spent on Online Class 3.14
Time spent on self study 2.57
Time spent on fitness 0.69
Time spent on sleep 7.75
Time spent on social media 1.96
Number of meals per day 2.88


### *Desviación estándar.*

In [None]:
df_num.std()

Age of Subject                5.516467
Time spent on Online Class    2.101756
Time spent on self study      2.140590
Time spent on fitness         0.724451
Time spent on sleep           1.615762
Time spent on social media    1.767336
Number of meals per day       0.828698
dtype: float64

### *Rango.*

Máximo

In [None]:
df_num.max(axis=0)

Age of Subject                59.0
Time spent on Online Class    10.0
Time spent on self study      18.0
Time spent on fitness          5.0
Time spent on sleep           15.0
Time spent on social media    10.0
Number of meals per day        8.0
dtype: float64

Mínimo

In [None]:
df_num.min(axis=0)

Age of Subject                7.0
Time spent on Online Class    0.0
Time spent on self study      0.0
Time spent on fitness         0.0
Time spent on sleep           4.0
Time spent on social media    0.0
Number of meals per day       1.0
dtype: float64

Rango

In [None]:
df_num.max(axis=0)-df_num.min(axis=0)

Age of Subject                52.0
Time spent on Online Class    10.0
Time spent on self study      18.0
Time spent on fitness          5.0
Time spent on sleep           11.0
Time spent on social media    10.0
Number of meals per day        7.0
dtype: float64

### *Percentiles.*

In [None]:
def per(df, value):
  return [df[i].quantile(value) for i in df_num]

datos = {
    'Percentil 10': per(df_num, 0.1),
    'Percentil 25': per(df_num, 0.25),
    'Percentil 50': per(df_num, 0.5),
    'Percentil 75': per(df_num, 0.75),
    'Percentil 90': per(df_num, 0.9)
}

df_per = pd.DataFrame(datos)
df_per.index = df_num.columns

df_per

Unnamed: 0,Percentil 10,Percentil 25,Percentil 50,Percentil 75,Percentil 90
Age of Subject,14.0,17.0,20.0,21.0,24.0
Time spent on Online Class,0.0,2.0,3.0,5.0,6.0
Time spent on self study,1.0,2.0,2.0,4.0,6.0
Time spent on fitness,0.0,0.0,1.0,1.0,2.0
Time spent on sleep,6.0,7.0,8.0,9.0,10.0
Time spent on social media,1.0,1.0,2.0,3.0,5.0
Number of meals per day,2.0,2.0,3.0,3.0,4.0


### *Rango intercuartil.*

In [None]:
df_num.quantile(0.75) - df_num.quantile(0.25)

Age of Subject                4.0
Time spent on Online Class    3.0
Time spent on self study      2.0
Time spent on fitness         1.0
Time spent on sleep           2.0
Time spent on social media    2.0
Number of meals per day       1.0
dtype: float64