In [24]:
import pandas as pd
from IPython.display import display
import seaborn as sns
import matplotlib.pyplot as plt
from jinja2 import Template
import os
import numpy as np

In [25]:
# Open the CSV file; the context manager closes the handle as soon as the
# data has been read (the original left the file open for the whole session)
with open("StudentPerformanceFactors.csv") as csv_file:
    # Show the name of the file being loaded
    print(csv_file.name)

    # Load the data into a DataFrame
    data = pd.read_csv(csv_file)

StudentPerformanceFactors.csv


In [26]:
# Display the loaded DataFrame
display(data)

Unnamed: 0,Hours_Studied,Attendance,Parental_Involvement,Access_to_Resources,Extracurricular_Activities,Sleep_Hours,Previous_Scores,Motivation_Level,Internet_Access,Tutoring_Sessions,Family_Income,Teacher_Quality,School_Type,Peer_Influence,Physical_Activity,Learning_Disabilities,Parental_Education_Level,Distance_from_Home,Gender,Exam_Score
0,23,84,Low,High,No,7,73,Low,Yes,0,Low,Medium,Public,Positive,3,No,High School,Near,Male,67
1,19,64,Low,Medium,No,8,59,Low,Yes,2,Medium,Medium,Public,Negative,4,No,College,Moderate,Female,61
2,24,98,Medium,Medium,Yes,7,91,Medium,Yes,2,Medium,Medium,Public,Neutral,4,No,Postgraduate,Near,Male,74
3,29,89,Low,Medium,Yes,8,98,Medium,Yes,1,Medium,Medium,Public,Negative,4,No,High School,Moderate,Male,71
4,19,92,Medium,Medium,Yes,6,65,Medium,Yes,3,Medium,High,Public,Neutral,4,No,College,Near,Female,70
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6602,25,69,High,Medium,No,7,76,Medium,Yes,1,High,Medium,Public,Positive,2,No,High School,Near,Female,68
6603,23,76,High,Medium,No,8,81,Medium,Yes,3,Low,High,Public,Positive,2,No,High School,Near,Female,69
6604,20,90,Medium,Low,Yes,6,65,Low,Yes,3,Low,Medium,Public,Negative,2,No,Postgraduate,Near,Female,68
6605,10,86,High,High,Yes,6,91,High,Yes,2,Low,Medium,Private,Positive,3,No,High School,Far,Female,68


In [27]:
# Cast every column of the DataFrame to the 'category' dtype, one column at a time.
# NOTE(review): this also converts the numeric columns (e.g. Hours_Studied,
# Exam_Score), so later describe()/plot calls see them as categorical —
# confirm this is intended.
for column_name in data.columns.tolist():
    data[column_name] = data[column_name].astype('category')

In [28]:
# Function that renders and saves the univariate distribution plot of one column
def generate_univariate_plot(df, column):
    """Save a histogram (with a KDE overlay) of one column as a PNG file.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that contains ``column``.
    column : hashable
        Label of the column to plot. Missing values are dropped before
        plotting.

    Returns
    -------
    str
        Path of the written image, ``univariate_<column>.png``, relative to
        the current working directory.

    Raises
    ------
    KeyError
        If ``column`` is not present in ``df``.
    """
    plot_path = f'univariate_{column}.png'
    fig, ax = plt.subplots(figsize=(8, 4))
    try:
        sns.histplot(df[column].dropna(), kde=True, ax=ax)
        ax.set_title(f'Distribution of {column}')
        fig.savefig(plot_path, bbox_inches='tight')
    finally:
        # Always release the figure, even when the column lookup, plotting or
        # saving fails; otherwise pyplot keeps the figure registered and
        # figures pile up across calls.
        plt.close(fig)
    return plot_path

In [29]:
# Function that saves the scatter plot of hours studied against exam score
def generate_study_score_correlation(df):
    """Save a scatter plot of ``Hours_Studied`` versus ``Exam_Score``.

    The image is written to ``study_score_correlation.png`` in the current
    working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame that provides the ``Hours_Studied`` and ``Exam_Score`` columns.

    Returns
    -------
    None
    """
    fig, ax = plt.subplots(figsize=(8, 6))
    try:
        sns.scatterplot(data=df, x='Hours_Studied', y='Exam_Score', ax=ax)
        ax.set_title('Correlation between Hours Studied and Exam Score')
        ax.set_xlabel('Hours Studied')
        ax.set_ylabel('Exam Score')
        # Turn the grid on explicitly: a bare grid() call toggles the current
        # state, which would switch the grid OFF under a style that already
        # enables it.
        ax.grid(True)
        fig.savefig('study_score_correlation.png', bbox_inches='tight')
    finally:
        # Release the figure even if plotting or saving raised.
        plt.close(fig)

In [30]:
# Function that builds the HTML report
def generate_html_report(df, title='Data Report'):
    """Build an HTML report for ``df`` and write it to ``data_report.html``.

    The report contains the transposed ``describe(include='all')`` summary,
    the per-column count of missing values, the study-hours/exam-score
    scatter plot and one univariate plot per column. The plot images are
    produced by ``generate_study_score_correlation`` and
    ``generate_univariate_plot`` and are referenced by relative file name, so
    they are written next to the report in the current working directory.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to report on.
    title : str, optional
        Text used for the page ``<title>`` and the top-level heading.

    Returns
    -------
    str
        Path of the written report (``'data_report.html'``).
    """
    summary = df.describe(include='all').T
    missing_report = df.isnull().sum()

    # HTML template with a 3-column Flexbox layout.
    # autoescape=True makes `title` and the column names HTML-safe; the two
    # pre-rendered tables are passed through `| safe` and are emitted as-is.
    template = Template('''
    <html>
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>{{ title }}</title>
        <style>
            body { font-family: Arial, sans-serif; margin: 20px; }
            h1, h2, h3 { color: #333; }
            table { border-collapse: collapse; width: 100%; }
            th, td { padding: 8px; text-align: left; border: 1px solid #ddd; }
            th { background-color: #f2f2f2; }
            .container {
                display: flex;
                flex-wrap: wrap; /* Permite que as colunas se ajustem */
                gap: 20px; /* Espaçamento entre as colunas */
            }
            .col {
                flex: 1 1 30%; /* Flex-grow, flex-shrink, flex-basis */
                padding: 10px;
                border: 1px solid #ccc;
                box-sizing: border-box; /* Para incluir padding e border no tamanho total */
            }
        </style>
    </head>
    <body>
        <h1>{{ title }}</h1>

        <h2>Summary Statistics</h2>
        {{ summary_html | safe }}

        <h2>Missing Values</h2>
        {{ missing_report_html | safe }}

        <h2>Correlation Between Hours Studied and Exam Score</h2>
        <img src="study_score_correlation.png" width="600">

        <h2>Univariate Distributions</h2>
        <div class="container">
            {% for col in df.columns %}
                <div class="col">
                    <h3>{{ col }}</h3>
                    <img src="univariate_{{ col }}.png" width="100%">
                </div>
            {% endfor %}
        </div>
    </body>
    </html>
    ''', autoescape=True)

    # Generate the hours-studied vs. exam-score plot
    generate_study_score_correlation(df)

    # Generate one univariate plot per column
    for col in df.columns:
        generate_univariate_plot(df, col)

    # Render the HTML with the data
    html_content = template.render(
        title=title,
        summary_html=summary.to_html(classes='table table-striped'),
        missing_report_html=missing_report.to_frame().to_html(classes='table table-striped'),
        df=df
    )

    # Save the report. The page declares charset UTF-8 and contains non-ASCII
    # text, so write it as UTF-8 explicitly instead of relying on the
    # platform's default encoding.
    report_path = 'data_report.html'
    with open(report_path, 'w', encoding='utf-8') as f:
        f.write(html_content)
    print(f"Report generated and saved as {report_path}")

    return report_path

In [31]:
# Generate the HTML report and keep the path of the saved file
report_path = generate_html_report(df=data)

Report generated and saved as data_report.html
