In [3]:
import pandas as pd
import numpy as np

# Display options: show up to 50 columns and allow 120 characters per line.
pd.set_option("display.max_columns", 50)
pd.set_option("display.width", 120)

# An f-string is the modern, direct way to build strings in Python by
# embedding variables and expressions inside the text.
#     f"S{i:02d}"
#         "S" -> literal text
#         02d means
#             d -> decimal integer
#             2 -> minimum width of 2 digits
#             0 -> pad with leading zeros
#
#     ["control"]*6  => list repetition in Python:
#         ["control", "control", "control", "control", "control", "control"]

# Toy dataset: 12 samples (S01..S12), 6 control followed by 6 treated,
# a single gene, one expression value per sample, and alternating sex labels.
df = pd.DataFrame({
    "sample_id": [f"S{i:02d}" for i in range(1, 13)],
    "group": ["control"]*6 + ["treated"]*6,
    "gene": ["NGAL"]*12,
    "expression": [10.2, 9.7, 11.1, 10.5, 9.9, 10.0, 14.2, 15.1, 13.8, 14.7, 16.0, 15.4],
    "sex": ["F","M","F","M","F","M","F","M","F","M","F","M"]
})

df



Unnamed: 0,sample_id,group,gene,expression,sex
0,S01,control,NGAL,10.2,F
1,S02,control,NGAL,9.7,M
2,S03,control,NGAL,11.1,F
3,S04,control,NGAL,10.5,M
4,S05,control,NGAL,9.9,F
5,S06,control,NGAL,10.0,M
6,S07,treated,NGAL,14.2,F
7,S08,treated,NGAL,15.1,M
8,S09,treated,NGAL,13.8,F
9,S10,treated,NGAL,14.7,M


In [4]:
# Dimensions of the frame as a (rows, columns) tuple.
df.shape

(12, 5)

In [5]:
# Preview the first 3 rows.
df.head(3)

Unnamed: 0,sample_id,group,gene,expression,sex
0,S01,control,NGAL,10.2,F
1,S02,control,NGAL,9.7,M
2,S03,control,NGAL,11.1,F


In [6]:
# Preview the last 3 rows.
df.tail(3)

Unnamed: 0,sample_id,group,gene,expression,sex
9,S10,treated,NGAL,14.7,M
10,S11,treated,NGAL,16.0,F
11,S12,treated,NGAL,15.4,M


In [7]:
# Column labels of the frame.
df.columns

Index(['sample_id', 'group', 'gene', 'expression', 'sex'], dtype='object')

In [8]:
# Data type of each column.
df.dtypes

sample_id      object
group          object
gene           object
expression    float64
sex            object
dtype: object

In [9]:
# Concise summary: index range, non-null count and dtype per column, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12 entries, 0 to 11
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   sample_id   12 non-null     object 
 1   group       12 non-null     object 
 2   gene        12 non-null     object 
 3   expression  12 non-null     float64
 4   sex         12 non-null     object 
dtypes: float64(1), object(4)
memory usage: 612.0+ bytes


In [10]:
# Selecting a single column by label returns a Series; preview its first rows.
df["expression"].head()

0    10.2
1     9.7
2    11.1
3    10.5
4     9.9
Name: expression, dtype: float64

In [11]:
# Column selection: a list of labels returns a DataFrame with only those columns.
df[["sample_id", "group", "expression"]].head()

Unnamed: 0,sample_id,group,expression
0,S01,control,10.2
1,S02,control,9.7
2,S03,control,11.1
3,S04,control,10.5
4,S05,control,9.9


In [12]:
# Row filter: the boolean mask keeps only the rows whose group is "control".
df[df["group"] == "control"]

Unnamed: 0,sample_id,group,gene,expression,sex
0,S01,control,NGAL,10.2,F
1,S02,control,NGAL,9.7,M
2,S03,control,NGAL,11.1,F
3,S04,control,NGAL,10.5,M
4,S05,control,NGAL,9.9,F
5,S06,control,NGAL,10.0,M


In [None]:
# Row filter on a numeric condition: keep the samples with expression above 12.
df[df["expression"] > 12]

Unnamed: 0,sample_id,group,gene,expression,sex
6,S07,treated,NGAL,14.2,F
7,S08,treated,NGAL,15.1,M
8,S09,treated,NGAL,13.8,F
9,S10,treated,NGAL,14.7,M
10,S11,treated,NGAL,16.0,F
11,S12,treated,NGAL,15.4,M


In [None]:
# Combined filters: each condition is parenthesized and joined with the
# element-wise & operator (group is "treated" AND sex is "F").
df[(df["group"] == "treated") & (df["sex"] == "F")]

Unnamed: 0,sample_id,group,gene,expression,sex
6,S07,treated,NGAL,14.2,F
8,S09,treated,NGAL,13.8,F
10,S11,treated,NGAL,16.0,F


In [15]:
# Label-based selection: with .loc the row slice 0:2 includes both endpoints (rows 0, 1 and 2).
df.loc[0:2, ["sample_id", "expression"]]

Unnamed: 0,sample_id,expression
0,S01,10.2
1,S02,9.7
2,S03,11.1


In [16]:
# Position-based selection: .iloc slices exclude the stop position (rows 0-2, columns 0-3).
df.iloc[0:3, 0:4]

Unnamed: 0,sample_id,group,gene,expression
0,S01,control,NGAL,10.2
1,S02,control,NGAL,9.7
2,S03,control,NGAL,11.1


In [None]:
# Typical transformations
# Both assignments add a column to df in place, so later cells see the extra columns.
# np.log1p computes log(1 + x) element-wise.
df["log_expression"] = np.log1p(df["expression"])
# 0/1 integer indicator: 1 where group is "treated", 0 otherwise.
df["is_treated"] = (df["group"] == "treated").astype(int)
df

Unnamed: 0,sample_id,group,gene,expression,sex,log_expression,is_treated
0,S01,control,NGAL,10.2,F,2.415914,0
1,S02,control,NGAL,9.7,M,2.370244,0
2,S03,control,NGAL,11.1,F,2.493205,0
3,S04,control,NGAL,10.5,M,2.442347,0
4,S05,control,NGAL,9.9,F,2.388763,0
5,S06,control,NGAL,10.0,M,2.397895,0
6,S07,treated,NGAL,14.2,F,2.721295,1
7,S08,treated,NGAL,15.1,M,2.778819,1
8,S09,treated,NGAL,13.8,F,2.694627,1
9,S10,treated,NGAL,14.7,M,2.753661,1


In [None]:
# Sort by expression from highest to lowest and show the top rows.
df.sort_values("expression", ascending=False).head()

Unnamed: 0,sample_id,group,gene,expression,sex,log_expression,is_treated
10,S11,treated,NGAL,16.0,F,2.833213,1
11,S12,treated,NGAL,15.4,M,2.797281,1
7,S08,treated,NGAL,15.1,M,2.778819,1
9,S10,treated,NGAL,14.7,M,2.753661,1
6,S07,treated,NGAL,14.2,F,2.721295,1


In [None]:
# Overall descriptive statistics of expression (count, mean, std, min, quartiles, max).
df["expression"].describe()

count    12.000000
mean     12.550000
std       2.502907
min       9.700000
25%      10.150000
50%      12.450000
75%      14.800000
max      16.000000
Name: expression, dtype: float64

In [None]:
# Descriptive statistics of expression computed separately for each group.
df.groupby("group")["expression"].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
control,6.0,10.233333,0.504645,9.7,9.925,10.1,10.425,11.1
treated,6.0,14.866667,0.804156,13.8,14.325,14.9,15.325,16.0


In [None]:
# Typical aggregation: one row per group and one named output column per
# statistic, all computed over the "expression" column.
summary = df.groupby("group").agg(
    n=pd.NamedAgg(column="expression", aggfunc="count"),
    mean=pd.NamedAgg(column="expression", aggfunc="mean"),
    std=pd.NamedAgg(column="expression", aggfunc="std"),
    median=pd.NamedAgg(column="expression", aggfunc="median"),
    # The quartiles need an explicit quantile level, hence the callables.
    p25=pd.NamedAgg(column="expression", aggfunc=lambda values: values.quantile(0.25)),
    p75=pd.NamedAgg(column="expression", aggfunc=lambda values: values.quantile(0.75)),
)
summary

Unnamed: 0_level_0,n,mean,std,median,p25,p75
group,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
control,6,10.233333,0.504645,10.1,9.925,10.425
treated,6,14.866667,0.804156,14.9,14.325,15.325


In [22]:
# Simulate missing values and learn how to detect them.
# Work on a copy so df itself is not modified.
df2 = df.copy()
df2.loc[[2, 9], "expression"] = np.nan

# Count missing values per column.
# NOTE: log_expression was derived from expression before this copy was made
# and is not recomputed here, so it still holds the old values for rows 2 and 9
# (0 missing) even though expression is now NaN in those rows.
df2.isna().sum()

sample_id         0
group             0
gene              0
expression        2
sex               0
log_expression    0
is_treated        0
dtype: int64

In [23]:
# Show the rows whose expression value is missing.
df2[df2["expression"].isna()]

Unnamed: 0,sample_id,group,gene,expression,sex,log_expression,is_treated
2,S03,control,NGAL,,F,2.493205,0
9,S10,treated,NGAL,,M,2.753661,1


In [24]:
# Typical options for handling missing values
# Option 1 - drop: dropna ("drop NaN") removes the rows where expression is missing.
df2_drop = df2.dropna(subset=["expression"])

# Option 2 - impute: build a new frame whose expression gaps are filled
# with the median of the observed expression values.
df2_fill = df2.assign(
    expression=df2["expression"].fillna(df2["expression"].median())
)


In [25]:
# Long vs wide data
#
# Build a "long" table: one row per (sample, gene) measurement, with the
# sample's group repeated on each of its rows.
df_long = pd.DataFrame(
    [
        ("S01", "G1", 10, "control"),
        ("S01", "G2", 3, "control"),
        ("S02", "G1", 12, "control"),
        ("S02", "G2", 4, "control"),
        ("S03", "G1", 9, "treated"),
        ("S03", "G2", 2, "treated"),
    ],
    columns=["sample_id", "gene", "expression", "group"],
)
df_long


Unnamed: 0,sample_id,gene,expression,group
0,S01,G1,10,control
1,S01,G2,3,control
2,S02,G1,12,control
3,S02,G2,4,control
4,S03,G1,9,treated
5,S03,G2,2,treated


In [26]:
# Pivot to wide format: one row per sample_id, one column per gene, and the
# expression values in the cells. Only the columns named here are used, so
# "group" does not appear in the result.
df_wide = df_long.pivot(index="sample_id", columns="gene", values="expression")
df_wide

gene,G1,G2
sample_id,Unnamed: 1_level_1,Unnamed: 2_level_1
S01,10,3
S02,12,4
S03,9,2


In [27]:
# Back to long format: turn the sample_id index into a regular column, then
# unpivot the gene columns into (gene, expression) pairs.
df_long_again = pd.melt(
    df_wide.reset_index(),
    id_vars="sample_id",
    var_name="gene",
    value_name="expression",
)
df_long_again

Unnamed: 0,sample_id,gene,expression
0,S01,G1,10
1,S02,G1,12
2,S03,G1,9
3,S01,G2,3
4,S02,G2,4
5,S03,G2,2


In [28]:
# Merge/Join: attach per-sample metadata to the measurements.
meta = pd.DataFrame({
    "sample_id": ["S01","S02","S03"],
    "patient_id": ["P1","P2","P3"],
    "age": [45, 51, 39]
})

# Left join: every measurement row is kept, even if no metadata matches it.
# validate="many_to_one" makes pandas raise a MergeError when sample_id is not
# unique in meta, instead of silently duplicating measurement rows.
merged = df_long.merge(meta, on="sample_id", how="left", validate="many_to_one")
merged

Unnamed: 0,sample_id,gene,expression,group,patient_id,age
0,S01,G1,10,control,P1,45
1,S01,G2,3,control,P1,45
2,S02,G1,12,control,P2,51
3,S02,G2,4,control,P2,51
4,S03,G1,9,treated,P3,39
5,S03,G2,2,treated,P3,39


In [29]:
# Sanity check after the left join: number of rows with no age, i.e. rows
# whose sample_id found no match in the metadata.
merged["age"].isna().sum()

np.int64(0)

In [30]:
# Number of distinct samples and distinct patients present in the merged frame.
merged["sample_id"].nunique(), merged["patient_id"].nunique()

(3, 3)

In [None]:
# Save and load
# By default pandas writes the index as one more column; index=False prevents that.
df.to_csv("toy_expression.csv", index=False)
df_loaded = pd.read_csv("toy_expression.csv")
df_loaded.head()

Unnamed: 0,sample_id,group,gene,expression,sex,log_expression,is_treated
0,S01,control,NGAL,10.2,F,2.415914,0
1,S02,control,NGAL,9.7,M,2.370244,0
2,S03,control,NGAL,11.1,F,2.493205,0
3,S04,control,NGAL,10.5,M,2.442347,0
4,S05,control,NGAL,9.9,F,2.388763,0


In [33]:
# Exercise
# 1) Per-group summary of expression: count, mean, std, median and quartiles.
summary = df.groupby("group").agg(
    n=("expression", "count"),
    mean=("expression", "mean"),
    std=("expression", "std"),
    median=("expression", "median"),
    p25=("expression", lambda s: s.quantile(0.25)),
    p75=("expression", lambda s: s.quantile(0.75)),
)
print(summary)

# 2) (Re)compute the log-transformed column; np.log1p is log(1 + x).
df["log_expression"] = np.log1p(df["expression"])

# 3) Treated samples only, sorted by expression from highest to lowest.
treated_sorted = df[df["group"] == "treated"].sort_values("expression", ascending=False)
print(treated_sorted[["sample_id","expression","log_expression"]])

# 4) Round-trip through CSV (index not written) and compare the shapes.
#    This checks dimensions only, not that the values are equal.
df.to_csv("toy_expression.csv", index=False)
df_loaded = pd.read_csv("toy_expression.csv")
print(df.shape, df_loaded.shape, (df_loaded.shape == df.shape))

         n       mean       std  median     p25     p75
group                                                  
control  6  10.233333  0.504645    10.1   9.925  10.425
treated  6  14.866667  0.804156    14.9  14.325  15.325
   sample_id  expression  log_expression
10       S11        16.0        2.833213
11       S12        15.4        2.797281
7        S08        15.1        2.778819
9        S10        14.7        2.753661
6        S07        14.2        2.721295
8        S09        13.8        2.694627
(12, 7) (12, 7) True
