In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.impute import SimpleImputer  # Adicionado para imputação de valores nulos
import seaborn as sns
import statsmodels.api as sm
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import r2_score
import scipy.stats as stats

# 1. Carregando a base de dados
df = pd.read_csv('/home/gamba.csv')
df.head(5)

Unnamed: 0,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [None]:
# Removendo linhas onde a idade ('age') está nula
df = df.dropna(subset=['age'])
df.head(5)

Unnamed: 0,sex,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly
0,m,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0
1,f,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0
2,f,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0
3,f,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0
4,f,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0


In [None]:
# 2. Reconstrução de valores nulos
isnulled = df.isnull().sum()
isnulled

Unnamed: 0,0
sex,0
age,0
hdlngth,15
skullw,0
totlngth,14
taill,0
footlgth,14
earconch,0
eye,0
chest,0


In [None]:
# Criando uma cópia do DataFrame para reconstrução
df1 = df.copy()

# Convertendo o gênero em uma variável dummy
df1['dummy_gender'] = df1.apply(lambda row: 1 if row['sex'] == 'm' else 0, axis=1)
del df1['sex']
df1.head(5)

Unnamed: 0,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,dummy_gender
0,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0,1
1,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0,0
2,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0,0
3,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0,0
4,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0,0


In [None]:
def completar_cabeca(row):
  if pd.isna(row['hdlngth']):
    campos = row[['skullw','taill','earconch','eye','chest','belly','dummy_gender']]

    return round(model.predict([campos])[0])
  return row['hdlngth']
df1['nova_cabeca'] = df1.apply(completar_cabeca, axis=1)
df1.head(5)

Unnamed: 0,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,dummy_gender,nova_cabeca
0,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0,1,94.1
1,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0,0,92.5
2,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0,0,94.0
3,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0,0,93.2
4,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0,0,91.5


In [None]:
# Filtrando colunas sem nulos para usar no modelo de reconstrução
filtered_df = df1[df1['hdlngth'].notnull() & df1['totlngth'].notnull() & df1['footlgth'].notnull()]
filtered_df.head(5)

Unnamed: 0,age,hdlngth,skullw,totlngth,taill,footlgth,earconch,eye,chest,belly,dummy_gender,nova_cabeca
0,8.0,94.1,60.4,89.0,36.0,74.5,54.5,15.2,28.0,36.0,1,94.1
1,6.0,92.5,57.6,91.5,36.5,72.5,51.2,16.0,28.5,33.0,0,92.5
2,6.0,94.0,60.0,95.5,39.0,75.4,51.9,15.5,30.0,34.0,0,94.0
3,6.0,93.2,57.1,92.0,38.0,76.1,52.2,15.2,28.0,34.0,0,93.2
4,2.0,91.5,56.3,85.5,36.0,71.0,53.2,15.1,28.5,33.0,0,91.5


In [None]:
# 3. Avaliação da reconstrução do modelo e cálculo do R² antes e depois
x = filtered_df.drop(['age', 'hdlngth', 'totlngth', 'footlgth'], axis=1).to_numpy()
y = filtered_df['hdlngth']
model = LinearRegression()
model.fit(x, y)
score = model.score(x, y).round(2)
print(f"R² do modelo: {score}")

R² do modelo: 1.0


In [None]:
# Avaliando o modelo antes da reconstrução
r2_before = model.score(x, y).round(2)
print(f"R² antes da reconstrução: {r2_before}")

# Função para completar o comprimento da cabeça (hdlngth) onde está nulo
def completar_cabeca(row):
    if pd.isna(row['hdlngth']):
        campos = row[['skullw', 'taill', 'earconch', 'eye', 'chest', 'belly', 'dummy_gender']]
        return round(model.predict([campos])[0])
    return row['hdlngth']

# Aplicando a função para reconstruir os valores nulos de 'hdlngth'
df1['hdlngth'] = df1.apply(completar_cabeca, axis=1)

# Verificando valores nulos após a reconstrução
isnulled_after = df1.isnull().sum()
print("Valores nulos após a reconstrução:")
print(isnulled_after)

# Avaliando o R² após a reconstrução
x_after = df1.drop(['age', 'hdlngth', 'totlngth', 'footlgth'], axis=1).to_numpy()
y_after = df1['hdlngth']
r2_after = model.score(x_after, y_after).round(2)
print(f"R² após a reconstrução: {r2_after}")

R² antes da reconstrução: 1.0
Valores nulos após a reconstrução:
age              0
hdlngth          0
skullw           0
totlngth        14
taill            0
footlgth        14
earconch         0
eye              0
chest            0
belly            0
dummy_gender     0
nova_cabeca      0
dtype: int64
R² após a reconstrução: 1.0


In [None]:
# 4. Regressão linear para prever o comprimento do crânio ('hdlngth') baseado nas demais informações
x_cranio = df1.drop(['hdlngth', 'age', 'totlngth', 'footlgth'], axis=1)
y_cranio = df1['hdlngth']
model_cranio = LinearRegression()
model_cranio.fit(x_cranio, y_cranio)
r2_cranio = model_cranio.score(x_cranio, y_cranio).round(2)
print(f"R² da regressão para comprimento do crânio: {r2_cranio}")

R² da regressão para comprimento do crânio: 1.0


In [None]:
# 5. Divisão da base de dados em treino e teste com imputação de valores nulos
X = df1.drop(['age'], axis=1)
y = df1['age']


# Imputação de valores nulos com a média das colunas
imputer = SimpleImputer(strategy='mean')
X_imputed = imputer.fit_transform(X)

# Divisão em treino e teste
X_train, X_test, y_train, y_test = train_test_split(X_imputed, y, test_size=0.2, random_state=42)

# Modelo para idade (Age)
model_age = LinearRegression()

# Bootstrap
n_iterations = 100
r2_train_scores = []
r2_test_scores = []

for i in range(n_iterations):
    # Bootstrap samples usando resample
    X_boot_train, y_boot_train = resample(X_train, y_train)

    # Treinando o modelo
    model_age.fit(X_boot_train, y_boot_train)

    # Avaliando no conjunto de treino e teste
    y_train_pred = model_age.predict(X_train)
    y_test_pred = model_age.predict(X_test)

    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)

    r2_train_scores.append(r2_train)
    r2_test_scores.append(r2_test)

# Comparando as distribuições do R² de treino e teste
train_mean = np.mean(r2_train_scores)
test_mean = np.mean(r2_test_scores)

print(f"R² médio - Treino: {train_mean:.3f}, Teste: {test_mean:.3f}")

# Teste de hipótese para comparar as médias de R² de treino e teste
t_stat, p_value = stats.ttest_ind(r2_train_scores, r2_test_scores)
print(f"Teste de hipóteses - t_stat: {t_stat:.3f}, p-value: {p_value:.3f}")

# Verificar se há overfitting
if p_value < 0.05:
    print("Há evidências de overfitting.")
else:
    print("Não foi detectado overfitting.")

R² médio - Treino: -0.004, Teste: -0.036
Teste de hipóteses - t_stat: 1.329, p-value: 0.185
Não foi detectado overfitting.
