## Distribuciones t
Msc Renzo Claure

In [None]:
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import norm, t
from ipywidgets import interact, IntSlider
%matplotlib inline

def plot_distributions(df):
    """Draw the standard normal pdf and a Student's t pdf on one figure.

    Both densities are evaluated on 1000 evenly spaced points in [-4, 4]
    with ``loc=0`` and ``scale=1``. The figure is displayed with
    ``plt.show()``; nothing is returned.

    Args:
        df: Degrees of freedom forwarded to ``t.pdf``.
    """
    grid = np.linspace(-4, 4, 1000)

    # (legend label, density values) for each curve, in drawing order.
    curves = [
        ('Normal', norm.pdf(grid, loc=0, scale=1)),
        ('t de Student (df={})'.format(df), t.pdf(grid, df=df, loc=0, scale=1)),
    ]

    plt.figure(figsize=(10, 6))
    for legend_label, density in curves:
        plt.plot(grid, density, label=legend_label)

    plt.xlabel('x')
    plt.ylabel('Densidad de probabilidad')
    plt.title('Comparación de distribuciones')
    plt.legend()
    plt.grid(True)
    plt.show()

# Bind plot_distributions to an integer slider for df (1 to 30, step 1, starting at 1).
interact(plot_distributions, df=IntSlider(min=1, max=30, step=1, value=1))


### Prueba t para diferencia de muestras relacionadas

In [None]:
#Prueba t para diferencia de muestras relacionadas
import pandas as pd
import scipy.stats as stats

# Load the data from the CSV file
data = pd.read_csv('sleep.csv')


In [None]:
# Display the loaded DataFrame.
data

In [None]:
# Split the 'extra' column into two groups: rows 0-9 and rows 10-19.
# Each group is re-indexed from 0 so the subtraction below aligns row by row.
# NOTE(review): presumably both groups list the same subjects in the same
# order (paired data) - confirm against sleep.csv.
g1, g2 = (
    data['extra'].iloc[first:first + 10].reset_index(drop=True)
    for first in (0, 10)
)

# Element-wise difference, second group minus first group.
difference = g2.sub(g1)

difference

In [None]:
# Sample size, mean and standard deviation of the differences.
n = len(difference)
mn, s = difference.mean(), difference.std()

# 95% t interval computed directly: n - 1 degrees of freedom, centred on the
# sample mean and scaled by the standard error s / sqrt(n).
interval = stats.t.interval(0.95, n - 1, loc=mn, scale=s / n ** 0.5)
print("Intervalo de confianza directo:", interval)

In [None]:
# Display the mean of the differences.
mn

### Muestras independientes (No pareadas)

In [None]:
# Rebuild the two groups from the 'extra' column: rows 0-9 and rows 10-19,
# each re-indexed from 0.
g1, g2 = (
    data['extra'].iloc[first:first + 10].reset_index(drop=True)
    for first in (0, 10)
)

In [None]:
import scipy.stats as stats

# Lengths of the two series.
n1, n2 = len(g1), len(g2)

# Pooled standard deviation: each group's squared standard deviation is
# weighted by (size - 1), and the sum is divided by n1 + n2 - 2.
sp = np.sqrt(
    sum((size - 1) * grp.std() ** 2 for size, grp in ((n1, g1), (n2, g2)))
    / (n1 + n2 - 2)
)

sp


In [None]:
# Standard error of the difference in means, built from the pooled
# standard deviation sp and the two sample sizes.
semd = sp * np.sqrt(1 / n1 + 1 / n2)

# Difference between the group means (second group minus first group).
md = g2.mean() - g1.mean()

print(md, semd)

In [None]:
# Two-sided 95% critical value with n1 + n2 - 2 degrees of freedom.
t_value = stats.t.ppf(0.975, n1 + n2 - 2)

# The same unpaired (pooled) interval written two ways: vectorised and as an
# explicit (lower, upper) tuple.
interval_1 = md + np.array([-1, 1]) * t_value * semd
interval_2 = tuple(md + sign * t_value * semd for sign in (-1, 1))

# Paired interval taken from scipy's related-samples t-test, for comparison.
interval_3 = stats.ttest_rel(g2, g1).confidence_interval()

# Table with one row per interval.
confidence_intervals = pd.DataFrame(
    [interval_1, interval_2, interval_3],
    index=['No pareadas metodo 1', 'No pareadas metodo 2', 'Pareadas'],
    columns=['Lower', 'Upper'],
)
print(confidence_intervals)

In [None]:
#Muestras independientes
import pandas as pd
from statsmodels import datasets

# Fetch the 'ChickWeight' R dataset through statsmodels and keep its DataFrame.
chickweight = datasets.get_rdataset('ChickWeight').data

# Preview the first rows.
chickweight.head()

In [None]:
# Reshape to wide format: one row per (Diet, Chick) pair and one column per
# distinct Time value holding the weight (pivot_table's default aggregation).
wideCW = chickweight.pivot_table(
    values='weight', index=['Diet', 'Chick'], columns='Time'
).reset_index()

# Prefix every time column with 'time' (e.g. 0 -> 'time0').
wideCW.columns = ['Diet', 'Chick', *('time' + str(col) for col in wideCW.columns[2:])]

wideCW.head()

In [None]:
# Number of rows in the wide table for each Diet value.
wideCW['Diet'].value_counts()

In [None]:
# Weight gain: the 'time21' column minus the 'time0' column, stored as 'gain'.
wideCW = wideCW.assign(gain=lambda frame: frame['time21'] - frame['time0'])

# Show the DataFrame.
wideCW.head()

In [None]:
# Rows of the wide table whose Diet equals 4.
wideCW[wideCW['Diet']==4]

### Ejemplo 2

In [None]:
import pandas as pd
import seaborn as sns
from statsmodels import datasets


# Group the weights by (Diet, Chick) once and reuse the grouping.
weight_by_chick = chickweight['weight'].groupby(
    [chickweight['Diet'], chickweight['Chick']]
)

# Gain per chick = last recorded weight minus first recorded weight within
# its group. transform() broadcasts the value to every row of that chick.
# NOTE(review): because the gain is repeated on each row, the boxplot below
# counts a chick once per measurement - confirm this is intended.
chickweight['weight_gain'] = (
    weight_by_chick.transform('last') - weight_by_chick.transform('first')
)

sns.boxplot(data=chickweight, x='Diet', y='weight_gain')
plt.title('Weight Gain by Diet')
plt.xlabel('Diet')
plt.ylabel('Weight Gain')
plt.show()


In [None]:
#pip install --upgrade scipy

In [None]:
import pandas as pd
from scipy.stats import ttest_ind
import scipy.stats



# Keep only diets 1 and 4.
wideCW14 = wideCW[wideCW['Diet'].isin([1, 4])]

# Gain values of each diet, selected once and reused by both tests.
gain_diet1, gain_diet4 = (
    wideCW14.loc[wideCW14['Diet'] == diet, 'gain'] for diet in (1, 4)
)

# Independent-samples t-test twice: pooled variance (equal_var=True) and
# Welch (equal_var=False). Missing gains are dropped via nan_policy='omit'.
ttest_equal_var = ttest_ind(gain_diet1, gain_diet4, equal_var=True, nan_policy='omit')
ttest_unequal_var = ttest_ind(gain_diet1, gain_diet4, equal_var=False, nan_policy='omit')

# Confidence intervals; the first passes 0.95 explicitly, the second relies
# on the method's default level.
conf1 = ttest_equal_var.confidence_interval(confidence_level=0.95)
conf2 = ttest_unequal_var.confidence_interval()

conf_df = pd.DataFrame({'conf': [conf1, conf2]})

print(conf_df)


## Pruebas de hipótesis

### Prueba de dos colas

In [None]:
#El metodo completo
import pandas as pd
import scipy.stats as stats
import numpy as np

# Data: row-wise difference between the 'height' and 'father' columns.
X = pd.read_csv('galton.csv')
data = X['height'] - X['father']

# Null hypothesis: the mean of the differences equals H0.
H0 = 0

t_statistic, p_value = stats.ttest_1samp(data, H0)

alpha = 0.05  # Significance level
n = len(data)  # Sample size

# Two-sided decision by comparing |t| with the critical value.
# A one-sample t-test on n differences has n - 1 degrees of freedom
# (2n - 2 would belong to a pooled two-sample test, which is not what
# ttest_1samp computes).
if abs(t_statistic) > stats.t.ppf(1 - alpha / 2, n - 1):
    print("Se rechaza la H0. La media muestral es significativamente diferente de:", H0)
else:
    print("Se acepta la H0. La media de la muestra no es significativamente distinta de:", H0)

# Confidence interval for the mean difference.
confidence_level = 0.95  # Confidence level
mean = np.mean(data)  # Sample mean
std_error = stats.sem(data)  # Standard error of the mean

# Same n - 1 degrees of freedom as the test above, so the interval and the
# test decision agree.
margin_of_error = stats.t.ppf((1 + confidence_level) / 2, n - 1) * std_error
confidence_interval = (mean - margin_of_error, mean + margin_of_error)

print("Intervalo de confianza:", confidence_interval)


In [None]:
#metodo abreviado
from scipy import stats
import pandas as pd

# Load the Galton data from the CSV file.
X = pd.read_csv('galton.csv')

In [None]:
# One-sample t-test of the 'height' - 'father' differences against a mean of 0.
# Despite its name, t_statistic holds the whole result object returned by
# ttest_1samp; the name is kept because a later cell calls
# t_statistic.confidence_interval().
t_statistic = stats.ttest_1samp(X['height'] - X['father'], 0)

# Two-sided 5% critical value. A one-sample test on n differences has n - 1
# degrees of freedom (not 2n - 2, which is the pooled two-sample count).
t_statistic_975 = stats.t.ppf(0.975, X.shape[0] - 1)

# Print the results
print("t-test:")
print("T-statistic:", t_statistic)
print('t_statistic_975:', t_statistic_975)

In [None]:
# The mean difference is far from 0 (i.e. the two columns differ on average).
np.mean(X['height'] - X['father'])

In [None]:
# Degrees of freedom of the one-sample t-test on the differences: n - 1
# (2n - 2 would apply to a pooled two-sample test instead).
X.shape[0] - 1

In [None]:
# 95% confidence interval taken from the test result object.
t_statistic.confidence_interval(0.95)

In [None]:
#entonces aceptamos la hipótesis alternativa, la diferencia de medias es distinta de cero

#### Intervalos para dos grupos

In [1]:
#Ejemplo pollitos y pesos
import pandas as pd
from statsmodels import datasets

# Fetch the 'ChickWeight' R dataset through statsmodels.
chickweight = datasets.get_rdataset('ChickWeight').data

# Wide format: one row per (Diet, Chick), one weight column per Time value.
wideCW = chickweight.pivot_table(
    values='weight', index=['Diet', 'Chick'], columns='Time'
).reset_index()

# Prefix the time columns with 'time', then add gain = time21 - time0.
wideCW.columns = ['Diet', 'Chick', *('time' + str(col) for col in wideCW.columns[2:])]
wideCW = wideCW.assign(gain=lambda frame: frame['time21'] - frame['time0'])

wideCW.head()

ModuleNotFoundError: No module named 'statsmodels'

In [None]:
# Number of rows in the wide table for each Diet value.
wideCW['Diet'].value_counts()

In [None]:
#Prueba para varianzas desiguales

from scipy import stats

# Keep only the rows for diets 1 and 4.
wideCW14 = wideCW[wideCW['Diet'].isin([1, 4])]

# Independent-samples t-test without assuming equal variances
# (equal_var=False); missing gains are dropped via nan_policy='omit'.
result = stats.ttest_ind(
    wideCW14.loc[wideCW14['Diet'] == 1, 'gain'],
    wideCW14.loc[wideCW14['Diet'] == 4, 'gain'],
    equal_var=False,
    nan_policy='omit',
)

# Print the results.
print("Two Sample t-test")
print("t-statistic:", result.statistic)
print("p-value:", result.pvalue)


In [None]:
# Degrees of freedom reported by the test result.
result.df

In [None]:
# 95% confidence interval from the test result.
result.confidence_interval(0.95)

In [None]:
# Mean gain for diet 1.
wideCW[wideCW['Diet']==1]['gain'].mean()

In [None]:
# Mean gain for diet 4.
wideCW[wideCW['Diet']==4]['gain'].mean()

In [None]:
# p-value of the test.
result.pvalue

In [None]:
# Number of non-missing gain values for diet 4.
wideCW[wideCW['Diet']==4]['gain'].count()

In [None]:
# Number of non-missing gain values for diet 1.
wideCW[wideCW['Diet']==1]['gain'].count()