<a href="https://colab.research.google.com/github/RobsonPalerma/energy_consumption.csv/blob/main/Untitled1.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Colab-specific step: bring the dataset file into the runtime before it is read.
from google.colab import files
# NOTE(review): files.upload() presumably opens the browser file picker and
# blocks until a file is chosen — confirm against the Colab docs. The returned
# value is kept in `uploaded` but is not referenced by the visible cells.
uploaded = files.upload()


🧠 Etapa 1 - Análise Exploratória de Dados (EDA)
Dataset: energy_consumption.csv

In [None]:
import pandas as pd # Import the pandas library under its conventional alias `pd`.

# Read the dataset into `df`; the relative path is resolved against the current working directory.
df = pd.read_csv("energy_consumption.csv")


In [None]:
# A notebook cell renders only its LAST bare expression, so a bare df.head()
# followed by df.tail() computes the head and throws it away. Both previews are
# passed to display() explicitly so each one is actually shown.
display(df.head())  # first rows: how the data starts (columns, first entries)
display(df.tail())  # last rows: how the data ends (trailing nulls or inconsistencies)


In [None]:
# df.shape is a bare expression that is not the last line of the cell, so the
# notebook would never render it; print it explicitly instead.
print(df.shape)  # table dimensions as (rows, columns)
# df.info() writes its own report (column dtypes, non-null counts) to stdout.
df.info()


In [None]:
# Summary statistics from describe(), transposed so that each described column becomes one row.
df.describe().T


In [None]:
# Number of missing (null) values in each column.
df.isnull().sum()


In [None]:
# Share of missing values per column, expressed as a percentage of all rows.
df.isna().sum().div(len(df)).mul(100)


In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Null count per column, ordered from the most to the least incomplete column.
missing = df.isnull().sum().sort_values(ascending=False)

# One bar per column on an explicitly created figure/axes pair; the x labels
# are rotated so long column names remain readable.
fig, ax = plt.subplots(figsize=(10, 5))
sns.barplot(x=missing.index, y=missing.values, ax=ax)
plt.xticks(rotation=90)
ax.set_title("Valores faltantes por coluna")
plt.show()


In [None]:
# Box plot of every int64/float64 column, arranged on a grid three panels wide.
num_cols = df.select_dtypes(include=['int64','float64']).columns

# Derive the number of rows from the number of columns (ceiling division).
# A fixed 5x3 grid makes plt.subplot raise ValueError as soon as a 16th numeric
# column exists, and leaves blank space when there are fewer columns.
n_grid_cols = 3
n_grid_rows = max(1, (len(num_cols) + n_grid_cols - 1) // n_grid_cols)

# 4 inches of height per row keeps each panel the size it had in the former
# 15x20 figure with five rows.
plt.figure(figsize=(15, 4 * n_grid_rows))
for i, col in enumerate(num_cols, 1):
    plt.subplot(n_grid_rows, n_grid_cols, i)
    sns.boxplot(x=df[col])
    plt.title(col)
plt.tight_layout()
plt.show()


In [None]:
# Histogram with a KDE overlay for each int64/float64 column, one figure per column.
num_cols = df.select_dtypes(include=['int64','float64']).columns

for col in num_cols:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.histplot(df[col], kde=True, ax=ax)
    ax.set_title(f"Distribui√ß√£o de {col}")
    plt.show()


In [None]:
# Frequency of each category for every object-typed column, one figure per column.
cat_cols = df.select_dtypes(include='object').columns

for col in cat_cols:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.countplot(x=df[col], ax=ax)
    ax.set_title(f"Distribui√ß√£o de {col}")
    plt.xticks(rotation=45)  # tilt the category labels so they do not overlap
    plt.show()


In [None]:
# Pairwise correlation of the numeric columns, drawn as an annotated heatmap.
corr = df.corr(numeric_only=True)

fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, cmap="coolwarm", ax=ax)
ax.set_title("Matriz de Correla√ß√£o")
plt.show()


In [None]:
# Scatter plot of monthly consumption against each of four hand-picked numeric features.
scatter_features = ["house_area_sqm", "num_residents", "num_appliances", "home_office_hours"]

for col in scatter_features:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.scatterplot(x=df[col], y=df["monthly_consumption_kwh"], ax=ax)
    ax.set_title(f"{col} vs consumo mensal")
    plt.show()


In [None]:
# Distribution of monthly consumption within each category of every column in
# `cat_cols` (which must already have been defined by an earlier cell).
for col in cat_cols:
    fig, ax = plt.subplots(figsize=(7, 4))
    sns.boxplot(x=df[col], y=df["monthly_consumption_kwh"], ax=ax)
    ax.set_title(f"Consumo mensal por categoria de {col}")
    plt.xticks(rotation=45)  # tilt the category labels so they do not overlap
    plt.show()


🧠 Etapa 2 - Pré-processamento de Dados

In [None]:
import pandas as pd
import numpy as np

# Load the dataset from the same relative path the notebook's first load uses,
# rather than a hard-coded absolute '/content/...' path that only exists on a
# Colab runtime.
df = pd.read_csv('energy_consumption.csv')

# Preview the first and the last rows
display(df.head())
display(df.tail())

# Dataset structure
print("\nFormato (linhas, colunas):", df.shape)
print("\nInformações gerais:")
# df.info() prints its report itself and returns None; wrapping the call in
# print() appended a stray "None" line to the output.
df.info()

# Descriptive statistics (rendered because it is the cell's last expression)
df.describe()


In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# Missing values: absolute count per column and the same count as a percentage of rows
faltantes = df.isnull().sum()
faltantes_percent = faltantes / len(df) * 100

# Side-by-side table of both measures
resumo_faltantes = pd.DataFrame({
    "Faltantes": faltantes,
    "Percentual (%)": faltantes_percent.round(2)
})
display(resumo_faltantes)

# Bar chart of the missing-value percentages
ax = faltantes_percent.plot.bar(figsize=(12, 5))
ax.set_title("Percentual de Valores Faltantes por Coluna")
ax.set_ylabel("%")
plt.show()

# Box plots of all int64/float64 columns drawn on one shared axes
num_cols = df.select_dtypes(include=['int64','float64']).columns

fig, ax = plt.subplots(figsize=(15, 6))
df[num_cols].boxplot(ax=ax)
ax.set_title("Boxplot das Vari√°veis Num√©ricas")
plt.xticks(rotation=45)
plt.show()


In [None]:
# Histogram (with KDE overlay) of the target variable
ax = sns.histplot(df['monthly_consumption_kwh'], kde=True)
ax.set_title("Distribui√ß√£o do Consumo Mensal (kWh)")
plt.show()

# Histograms of every column in `num_cols` (defined by an earlier cell)
df[num_cols].hist(bins=20, figsize=(15, 10))
plt.tight_layout()
plt.show()

# Count plots for the object-typed (categorical) columns, one figure each
cat_cols = df.select_dtypes(include=['object']).columns

for col in cat_cols:
    fig, ax = plt.subplots(figsize=(6, 4))
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f"Distribui√ß√£o da categoria: {col}")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Correlation matrix computed over the int64/float64 columns only
target_col = 'monthly_consumption_kwh'
corr = df.select_dtypes(include=['int64','float64']).corr()

plt.figure(figsize=(12,7))
sns.heatmap(corr, annot=False, cmap='viridis')
plt.title("Heatmap de Correlação")
plt.show()

# Rank every column by the STRENGTH of its correlation with the target.
# Sorting on the signed value pushes strong negative correlations to the bottom,
# so they were never reported or plotted; sorting on the absolute value keeps
# them, while the displayed numbers retain their sign.
target_corr = corr[target_col].sort_values(key=abs, ascending=False)
# Six rows: the target itself (correlation 1.0) plus the five strongest features.
display(target_corr.head(6))

# Scatter plots for the three most strongly correlated features. The target is
# removed by name rather than by assuming it sits at position 0 of the ranking.
top_features = target_corr.drop(target_col).index[:3]

for col in top_features:
    sns.scatterplot(data=df, x=col, y=target_col)
    plt.title(f"{col} x monthly_consumption_kwh")
    plt.show()

# Box plot of the target for each column in `cat_cols` (defined by an earlier cell)
for col in cat_cols:
    plt.figure(figsize=(6,4))
    sns.boxplot(data=df, x=col, y=target_col)
    plt.title(f"Consumo x {col}")
    plt.xticks(rotation=45)
    plt.show()


In [None]:
# Imports essenciais
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import joblib

# Plot styling: select the 'seaborn-v0_8-whitegrid' matplotlib style sheet.
# NOTE(review): this style name appears to exist only in matplotlib >= 3.6
# (older releases call it 'seaborn-whitegrid') — confirm the runtime version.
plt.style.use('seaborn-v0_8-whitegrid')
