# Importing the CSV file with the pandas library

In [None]:
import pandas as pd

# Read the grade dataset from the working directory, then preview its first rows.
df = pd.read_csv("gb_notes_v2.csv")
print(
    "CSV file imported successfully",
    "=====================================",
    sep="\n",
)
print(df.head())

CSV file imported successfully
   index  math  eng group gender  count
0      0   2.0  6.0  lang      F      1
1      1   3.0  6.0  lang      F      1
2      2   3.0  7.0  lang      F      2
3      3   3.0  7.0  lang      M      2
4      4   3.0  8.0  lang      F      2


# Determining the variable type

In [2]:
def determine_variable_type(df):
    """Label every column of ``df`` with a statistical variable type.

    Rules, applied in this order:

    * ``object``-dtype columns and the column named ``"index"`` are labelled
      ``"Qualitative nominale"``;
    * the columns named ``"eng"`` and ``"math"`` are labelled
      ``"Quantitative continue"``;
    * every remaining column is labelled ``"Quantitative discrète"``.

    Args:
        df: DataFrame whose columns are classified.

    Returns:
        dict mapping each column name to its label, in column order.
    """

    def classify(name):
        # The dtype test comes first, exactly as in the rule list above.
        if df[name].dtype == "object" or name == "index":
            return "Qualitative nominale"
        if name in ("eng", "math"):
            return "Quantitative continue"
        return "Quantitative discrète"

    return {name: classify(name) for name in df.columns}


# Classify every column of the loaded dataset and show the resulting mapping.
variable_types = determine_variable_type(df)
print(variable_types)

{'index': 'Qualitative nominale', 'math': 'Quantitative continue', 'eng': 'Quantitative continue', 'group': 'Qualitative nominale', 'gender': 'Qualitative nominale', 'count': 'Quantitative discrète'}


# Calculating the mean, median, mode, and standard deviation for each variable

In [3]:
def average_calculation(df):
    """Compute and print the ``count``-weighted mean of each numeric column.

    ``object``-dtype columns and the columns named ``"index"`` and ``"count"``
    are skipped. For every other column the mean is
    ``sum(value * count) / sum(count)``.

    Args:
        df: DataFrame with a ``"count"`` column used as weights.

    Returns:
        dict mapping each processed column name to its weighted mean.
    """
    averages = {}
    for column in df.columns:
        if df[column].dtype == "object" or column in ("index", "count"):
            continue
        weights = df["count"]
        # Built-in sum() walks the weighted products one element at a time.
        column_avg = sum(df[column] * weights) / weights.sum()
        averages[column] = column_avg
        print(f"{column} - Moyenne pondérée: {column_avg}")
    return averages


# Compute the weighted means (the function prints one line per column as it goes).
averages = average_calculation(df)

# Recap: print the full column -> mean mapping.
print("=====================================")
print("Average of each column")
print(averages)

math - Moyenne pondérée: 6.536
eng - Moyenne pondérée: 6.672
Average of each column
{'math': 6.536, 'eng': 6.672}


In [4]:
def median_calculation(df):
    """Compute and print the ``count``-weighted median of each numeric column.

    ``object``-dtype columns and the columns named ``"index"`` and ``"count"``
    are skipped. For every other column the rows are sorted by that column and
    the median is the first value whose cumulative ``count`` reaches half of
    the total ``count``.

    Args:
        df: DataFrame with a ``"count"`` column used as weights.

    Returns:
        dict mapping each processed column name to its weighted median; the
        value is ``0`` when the threshold is never reached (e.g. no rows).
    """
    medians = {}
    for column in df.columns:
        if df[column].dtype == "object" or column in ("index", "count"):
            continue

        # Order the rows by the column of interest.
        ordered = df.sort_values(by=column)
        half_weight = ordered["count"].sum() / 2

        running_weight = 0
        median = 0  # reported as-is if the loop never reaches the threshold
        for _, record in ordered.iterrows():
            running_weight += record["count"]
            if running_weight >= half_weight:
                median = record[column]
                break

        medians[column] = median
        print(f"{column} - Médiane pondérée: {median}")
    return medians


# Compute the weighted medians (the function prints one line per column as it goes).
medians = median_calculation(df)

# Recap: print the full column -> median mapping.
print("=====================================")
print("Médiane de chaque colonne")
print(medians)

math - Médiane pondérée: 6.0
eng - Médiane pondérée: 7.0
Médiane de chaque colonne
{'math': 6.0, 'eng': 7.0}


In [5]:
def mode_calculation(df):
    """Compute and print the mode of every ``object``-dtype column.

    When ``df`` has a numeric ``"count"`` column, each row is weighted by it:
    the mode is the category with the largest total ``count``. This keeps the
    mode consistent with the weighted mean and median computed from the same
    data. Without such a column, the plain (per-row) mode is used.

    Ties are resolved in favour of the first category in sorted order in both
    cases. Columns without any non-null value are skipped instead of raising.

    Args:
        df: DataFrame to analyse; ``"count"`` is optional.

    Returns:
        dict mapping each processed column name to its mode.
    """
    modes = {}
    weighted = "count" in df.columns and pd.api.types.is_numeric_dtype(df["count"])
    for column in df.columns:
        if df[column].dtype != "object":
            continue

        if weighted:
            # groupby sorts the categories and drops missing keys; idxmax then
            # returns the first category holding the largest total weight.
            totals = df.groupby(column)["count"].sum()
            if totals.empty:
                continue
            mode = totals.idxmax()
        else:
            candidates = df[column].mode()
            if candidates.empty:
                continue
            mode = candidates.values[0]

        modes[column] = mode
        print(f"{column} - Mode: {mode}")
    return modes


# Compute the modes (the function prints one line per column as it goes).
modes = mode_calculation(df)

# Recap: print the full column -> mode mapping.
print("=====================================")
print("Mode of each column")
print(modes)

group - Mode: science
gender - Mode: F
Mode of each column
{'group': 'science', 'gender': 'F'}


# Dispersion methods

In [6]:
def amplitudes_calculation(df):
    """Compute and print the range (max - min) of each numeric column.

    ``object``-dtype columns and the columns named ``"index"`` and ``"count"``
    are skipped.

    Args:
        df: DataFrame to analyse.

    Returns:
        dict mapping each processed column name to ``max - min``.
    """
    amplitudes = {}
    for column in df.columns:
        series = df[column]
        if series.dtype == "object" or column in ("index", "count"):
            continue
        spread = series.max() - series.min()
        amplitudes[column] = spread
        print(f"{column} - Amplitude: {spread}")
    return amplitudes


# Compute the ranges (the function prints one line per column), then recap the mapping.
amplitudes = amplitudes_calculation(df)
print("=====================================")
print("Amplitudes of each column")
print(amplitudes)

math - Amplitude: 8.0
eng - Amplitude: 9.0
Amplitudes of each column
{'math': 8.0, 'eng': 9.0}


In [7]:
def calculate_weighted_mean(df, column):
    """Return the mean of ``df[column]`` weighted by ``df["count"]``.

    Args:
        df: DataFrame holding ``column`` and a ``"count"`` weight column.
        column: name of the column to average.

    Returns:
        ``sum(value * count) / sum(count)``.
    """
    values = df[column]
    weights = df["count"]
    # Built-in sum() walks the weighted products one element at a time.
    return sum(values * weights) / weights.sum()


def calculate_weighted_mad(df, column):
    """Return the ``count``-weighted mean absolute deviation of ``df[column]``.

    Deviations are measured from the weighted mean returned by
    ``calculate_weighted_mean``.

    Args:
        df: DataFrame holding ``column`` and a ``"count"`` weight column.
        column: name of the column to analyse.

    Returns:
        ``sum(|value - weighted_mean| * count) / sum(count)``.
    """
    center = calculate_weighted_mean(df, column)
    weights = df["count"]
    distances = (df[column] - center).abs()
    return (distances * weights).sum() / weights.sum()


# Report the weighted mean absolute deviation of every numeric column other
# than the "index" and "count" bookkeeping columns.
for column in df.columns:
    if df[column].dtype == "object" or column in ("index", "count"):
        continue
    mad = calculate_weighted_mad(df, column)
    print(f"{column} - Déviation absolue moyenne pondérée: {mad}")

print("=====================================")

math - Déviation absolue moyenne pondérée: 1.262224
eng - Déviation absolue moyenne pondérée: 1.307456


In [8]:
def variances(df):
    """Compute and print the ``count``-weighted variance of each numeric column.

    ``object``-dtype columns and the columns named ``"index"`` and ``"count"``
    are skipped. The variance divides by the total ``count``:
    ``sum((value - weighted_mean) ** 2 * count) / sum(count)``.

    Args:
        df: DataFrame with a ``"count"`` column used as weights.

    Returns:
        dict mapping each processed column name to its weighted variance.
    """
    result = {}
    for column in df.columns:
        if df[column].dtype == "object" or column in ("index", "count"):
            continue
        center = calculate_weighted_mean(df, column)
        weights = df["count"]
        variance = ((df[column] - center) ** 2 * weights).sum() / weights.sum()
        result[column] = variance
        print(f"{column} - Variance: {variance}")
    return result


def standard_deviations(variances):
    """Compute and print the square root of each variance.

    Args:
        variances: dict mapping a column name to its variance.

    Returns:
        dict mapping each column name to ``variance ** 0.5``.
    """
    std_devs = {}
    for column in variances:
        std_devs[column] = variances[column] ** 0.5
        print(f"{column} - Écart-type: {std_devs[column]}")
    return std_devs


# Store the result under a name different from the `variances` function:
# rebinding `variances` to the returned dict would make this cell raise
# TypeError ("'dict' object is not callable") as soon as it is run a second time.
variance_by_column = variances(df)
print("=====================================")
print("Variance of each column")
print(variance_by_column, "\n")

# Standard deviations are derived from the variances computed just above.
std_devs = standard_deviations(variance_by_column)
print("=====================================")
print("std of each column")
print(std_devs)

math - Variance: 2.3007039999999996
eng - Variance: 2.480416
Variance of each column
{'math': 2.3007039999999996, 'eng': 2.480416} 

math - Écart-type: 1.51680717297882
eng - Écart-type: 1.5749336493960626
std of each column
{'math': 1.51680717297882, 'eng': 1.5749336493960626}


In [9]:
def calculate_weighted_quantile(df, column, quantile):
    """Return the ``count``-weighted quantile of ``df[column]``.

    Rows are sorted by ``column``; the result is the first value whose
    cumulative ``count`` reaches ``quantile * sum(count)``.

    Args:
        df: DataFrame holding ``column`` and a ``"count"`` weight column.
        column: name of the column to analyse.
        quantile: fraction of the total weight to reach (e.g. ``0.25``).

    Returns:
        The matching value of ``column``, or ``None`` when the cumulative
        weight never reaches the threshold (e.g. ``df`` has no rows).
    """
    ordered = df.sort_values(by=column)
    threshold = ordered["count"].sum() * quantile
    running_weight = 0
    for _, record in ordered.iterrows():
        running_weight += record["count"]
        if running_weight >= threshold:
            return record[column]
    return None


def calculate_weighted_iqr(df, column):
    """Return the weighted first quartile, third quartile and their difference.

    Both quartiles come from ``calculate_weighted_quantile`` (0.25 and 0.75).

    Args:
        df: DataFrame holding ``column`` and a ``"count"`` weight column.
        column: name of the column to analyse.

    Returns:
        Tuple ``(Q1, Q3, Q3 - Q1)``.
    """
    lower, upper = (
        calculate_weighted_quantile(df, column, fraction) for fraction in (0.25, 0.75)
    )
    return lower, upper, upper - lower


# Report the weighted quartiles and interquartile range of every numeric
# column other than the "index" and "count" bookkeeping columns.
for column in df.columns:
    if df[column].dtype == "object" or column in ("index", "count"):
        continue
    Q1, Q3, IQR = calculate_weighted_iqr(df, column)
    print(f"{column} - Q1: {Q1}, Q3: {Q3}, IQR: {IQR}")

math - Q1: 5.0, Q3: 8.0, IQR: 3.0
eng - Q1: 6.0, Q3: 8.0, IQR: 2.0


# Some Graphics 

1. afficher un graphique univarié pour chaque colonne (hors "index") du dataset "notes_v2.csv", avec:
    - un titre
    - les labels des axes
    - la moyenne et la médiane (si applicable) en "red" et "purple"
    - les valeurs/hauteurs des bar-plots en chiffre
 
2. graphiques bivariés :
    - afficher un graphique en nuage de points pour les colonnes "math","eng"
    - afficher un histogramme (multi-couche superposé) des scores en "math" ventilé par "gender"
        - une couleur par "gender"
        - alpha de 0.5
 
3. sauver toutes les images dans 2 pdf
    - 1: univarié
    - 2: bivarié

In [38]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib.backends.backend_pdf import PdfPages

# Load the data.
# NOTE(review): this rebinds `df` to "notes_v2.csv", whereas other cells of this
# notebook read "gb_notes_v2.csv" — confirm which file this section should use.
df = pd.read_csv("notes_v2.csv")


# Step 1: univariate plots
def plot_univariate(df, output_path="notes/univariate_plots.pdf"):
    """Save one histogram per numeric column of ``df`` into a multi-page PDF.

    The ``"index"`` column and ``object``-dtype columns are skipped. Each page
    shows the histogram of one column, a red dashed line at its mean, a purple
    dashed line at its median, a legend naming both lines, and the height of
    every bar written above it.

    Args:
        df: DataFrame to plot.
        output_path: destination PDF; its parent folder is created if missing.
    """
    import os

    # PdfPages cannot create the PDF inside a folder that does not exist.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with PdfPages(output_path) as pdf:
        for column in df.columns:
            if column == "index" or df[column].dtype == "object":
                continue

            fig, ax = plt.subplots(figsize=(10, 6))
            sns.histplot(df, x=column, kde=False, color="blue", ax=ax)

            mean = df[column].mean()
            median = df[column].median()
            # Label the lines themselves: a bare list of labels given to
            # legend() is matched to the first artists of the axes, which are
            # the histogram bars rather than these two lines.
            ax.axvline(
                mean, color="red", linestyle="dashed", linewidth=1, label="Moyenne"
            )
            ax.axvline(
                median,
                color="purple",
                linestyle="dashed",
                linewidth=1,
                label="Médiane",
            )

            ax.set_title(f"Distribution de {column}")
            ax.set_xlabel(column)
            ax.set_ylabel("Fréquence")
            ax.legend()

            # Write each bar's height just above the bar.
            for bar in ax.patches:
                ax.annotate(
                    f"{bar.get_height():.0f}",
                    (bar.get_x() + bar.get_width() / 2.0, bar.get_height()),
                    ha="center",
                    va="center",
                    xytext=(0, 10),
                    textcoords="offset points",
                )

            pdf.savefig(fig)
            plt.close(fig)


# Write the univariate PDF for the dataset loaded above.
plot_univariate(df)


# Step 2: bivariate plots
def plot_bivariate(df, output_path="notes/bivariate_plots.pdf"):
    """Save the two bivariate figures of ``df`` into a two-page PDF.

    Page 1 is a scatter plot of ``"math"`` against ``"eng"`` coloured by
    ``"gender"``. Page 2 is a histogram of ``"math"`` with one semi-transparent
    layer per ``"gender"``, drawn on top of each other.

    Args:
        df: DataFrame with ``"math"``, ``"eng"`` and ``"gender"`` columns.
        output_path: destination PDF; its parent folder is created if missing.
    """
    import os

    # PdfPages cannot create the PDF inside a folder that does not exist.
    output_dir = os.path.dirname(output_path)
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    with PdfPages(output_path) as pdf:
        # Scatter plot of "math" against "eng".
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.scatterplot(data=df, x="math", y="eng", hue="gender", ax=ax)
        ax.set_title("Nuage de points des scores en math et eng")
        ax.set_xlabel("Math")
        ax.set_ylabel("Eng")
        pdf.savefig(fig)
        plt.close(fig)

        # Histogram of "math" broken down by "gender". The layers must overlap
        # ("layer"), not be stacked: alpha=0.5 is what keeps both visible.
        fig, ax = plt.subplots(figsize=(10, 6))
        sns.histplot(
            data=df,
            x="math",
            hue="gender",
            multiple="layer",
            alpha=0.5,
            ax=ax,
        )
        ax.set_title("Histogramme des scores en math ventilé par gender")
        ax.set_xlabel("Math")
        ax.set_ylabel("Fréquence")
        pdf.savefig(fig)
        plt.close(fig)


# Write the bivariate PDF for the dataset loaded above.
plot_bivariate(df)

# Création des différents graphiques dans le dossier `GB/`

In [25]:
import pandas as pd
import matplotlib.pyplot as plt


def diagnostiquer_colonnes(df):
    """Summarise every column of ``df`` except the one named ``"count"``.

    Numeric columns (per ``pd.api.types.is_numeric_dtype``) get:
    ``"Type"`` (``"Quantitative Discrète"`` below 20 distinct values, otherwise
    ``"Quantitative Continue"``), ``"Moyenne"``, ``"Médiane"``, ``"Écart-type"``
    and ``"Étendue"`` (max - min).

    All other columns get: ``"Type"`` (``"Qualitative Nominale"`` below 20
    distinct values, otherwise ``"Qualitative Ordinale"``), ``"Mode"``,
    ``"Effectifs"`` (``value_counts()``) and ``"Fréquences"``
    (``value_counts(normalize=True)``).

    NOTE(review): every statistic here is computed per row; the ``"count"``
    column is skipped but never used as a weight — confirm this is intended.

    Args:
        df: DataFrame to summarise.

    Returns:
        dict mapping each processed column name to its summary dict.
    """

    def _describe_numeric(series):
        few_values = series.nunique() < 20
        return {
            "Type": "Quantitative Discrète" if few_values else "Quantitative Continue",
            "Moyenne": series.mean(),
            "Médiane": series.median(),
            "Écart-type": series.std(),
            "Étendue": series.max() - series.min(),
        }

    def _describe_categorical(series):
        few_values = series.nunique() < 20
        return {
            "Type": "Qualitative Nominale" if few_values else "Qualitative Ordinale",
            "Mode": series.mode()[0],
            "Effectifs": series.value_counts(),
            "Fréquences": series.value_counts(normalize=True),
        }

    diagnostics = {}
    for col in df.columns:
        if col == "count":
            continue
        series = df[col]
        if pd.api.types.is_numeric_dtype(series):
            diagnostics[col] = _describe_numeric(series)
        else:
            diagnostics[col] = _describe_categorical(series)
    return diagnostics


def calculer_correls(df):
    """Return the Pearson correlation matrix of the numeric columns of ``df``.

    Args:
        df: DataFrame; only columns selected by ``include=["number"]`` are used.

    Returns:
        Square DataFrame of pairwise Pearson correlations.
    """
    return df.select_dtypes(include=["number"]).corr(method="pearson")


def calculer_frequences_conjointes(df, col1, col2):
    """Cross-tabulate ``col1`` against ``col2``, summing ``df["count"]``.

    Args:
        df: DataFrame holding ``col1``, ``col2`` and a ``"count"`` column.
        col1: column whose values become the table rows.
        col2: column whose values become the table columns.

    Returns:
        Tuple ``(table, frequencies)``: the table of summed counts (missing
        combinations filled with 0) and the same table divided by its grand
        total.
    """
    summed_counts = pd.crosstab(
        index=df[col1],
        columns=df[col2],
        values=df["count"],
        aggfunc="sum",
    )
    table_conjointe = summed_counts.fillna(0)
    grand_total = table_conjointe.sum().sum()
    return table_conjointe, table_conjointe / grand_total


def calculer_frequences_conditionnelles(df, col1, col2):
    """Return the row-normalised cross-tabulation of ``col1`` against ``col2``.

    The table sums ``df["count"]`` per (``col1``, ``col2``) pair, fills missing
    combinations with 0, then divides every row by its own total, so each row
    holds the distribution of ``col2`` within one value of ``col1``.

    Args:
        df: DataFrame holding ``col1``, ``col2`` and a ``"count"`` column.
        col1: column whose values become the table rows.
        col2: column whose values become the table columns.

    Returns:
        DataFrame of conditional frequencies.
    """
    summed_counts = pd.crosstab(
        index=df[col1],
        columns=df[col2],
        values=df["count"],
        aggfunc="sum",
    )
    table_conjointe = summed_counts.fillna(0)
    row_totals = table_conjointe.sum(axis=1)
    return table_conjointe.div(row_totals, axis=0)


def plot_histogram(df, col):
    """Draw a ``count``-weighted histogram of ``df[col]`` with mean/median markers.

    The bars weight every row by ``df["count"]``, so the markers use the same
    weights: the mean is ``sum(value * count) / sum(count)`` and the median is
    the first value, in ascending order, whose cumulative ``count`` reaches
    half of the total. The markers are omitted when the total weight is not
    positive.

    A new figure is created and left as the current figure; saving and closing
    it is up to the caller.

    Args:
        df: DataFrame holding ``col`` and a ``"count"`` weight column.
        col: name of the numeric column to plot.
    """
    plt.figure(figsize=(10, 6))
    plt.hist(df[col], bins=20, weights=df["count"], alpha=0.7, color="blue")

    values = df[col].to_numpy()
    weights = df["count"].to_numpy()
    total_weight = weights.sum()
    if total_weight > 0:
        mean = (values * weights).sum() / total_weight
        # Weighted median: walk the values in ascending order and stop at the
        # first one whose cumulative weight reaches half of the total.
        order = values.argsort(kind="stable")
        reached_half = weights[order].cumsum() >= total_weight / 2
        median = values[order][reached_half.argmax()]

        plt.axvline(mean, color="red", linestyle="dashed", linewidth=1)
        plt.axvline(median, color="purple", linestyle="dashed", linewidth=1)
        plt.text(mean, plt.ylim()[1] * 0.9, f"Mean: {mean:.2f}", color="red")
        plt.text(median, plt.ylim()[1] * 0.8, f"Median: {median:.2f}", color="purple")

    plt.title(f"Histogramme de {col}")
    plt.xlabel(col)
    plt.ylabel("Fréquence")


def plot_boxplot(df, col):
    """Draw a horizontal box plot of ``df[col]`` on a new figure.

    The figure is left as the current figure; saving and closing it is up to
    the caller.

    Args:
        df: DataFrame holding ``col``.
        col: name of the column to plot.
    """
    _, ax = plt.subplots(figsize=(10, 6))
    ax.boxplot(df[col], vert=False)
    ax.set_title(f"Boxplot de {col}")
    ax.set_xlabel(col)


def plot_correlation_heatmap(correlations):
    """Draw ``correlations`` as a colour-coded grid on a new figure.

    Both axes are ticked with the column labels of ``correlations``. The
    figure is left as the current figure; saving and closing it is up to the
    caller.

    Args:
        correlations: square DataFrame, e.g. the output of ``DataFrame.corr``.
    """
    labels = correlations.columns
    positions = range(len(labels))

    plt.figure(figsize=(10, 8))
    plt.imshow(correlations, cmap="coolwarm", interpolation="none", aspect="auto")
    plt.colorbar()
    plt.xticks(positions, labels, rotation=90)
    plt.yticks(positions, labels)
    plt.title("Heatmap des corrélations de Pearson")


def plot_barplot(df, col):
    """Draw a bar chart of the total ``count`` per value of ``df[col]``.

    Every bar is annotated with its total. A new figure is created and left as
    the current figure; saving and closing it is up to the caller.

    Args:
        df: DataFrame holding ``col`` and a ``"count"`` column.
        col: name of the column to group by.
    """
    plt.figure(figsize=(10, 6))
    counts = df.groupby(col)["count"].sum()
    bars = plt.bar(counts.index, counts.values, alpha=0.7, color="blue")
    # Centre each label on its own bar. The enumeration index only matches the
    # bar position on a categorical axis; with numeric group values the bars
    # sit at those values, so the position is read from the bar itself.
    for bar, total in zip(bars, counts.values):
        plt.text(
            bar.get_x() + bar.get_width() / 2,
            total + 0.5,
            str(total),
            ha="center",
            va="bottom",
        )
    plt.title(f"Effectifs de {col}")
    plt.xlabel(col)
    plt.ylabel("Effectifs")


def plot_frequences_conjointes_heatmap(frequences_conjointes, col1, col2):
    """Draw a joint-frequency table as a colour-coded grid on a new figure.

    The x axis is ticked with the table's column labels and the y axis with
    its index labels. The figure is left as the current figure; saving and
    closing it is up to the caller.

    Args:
        frequences_conjointes: DataFrame of joint frequencies.
        col1: name shown first in the title.
        col2: name shown second in the title.
    """
    column_labels = frequences_conjointes.columns
    row_labels = frequences_conjointes.index

    plt.figure(figsize=(10, 8))
    plt.imshow(
        frequences_conjointes, cmap="coolwarm", interpolation="none", aspect="auto"
    )
    plt.colorbar()
    plt.xticks(range(len(column_labels)), column_labels, rotation=90)
    plt.yticks(range(len(row_labels)), row_labels)
    plt.title(f"Fréquences conjointes entre {col1} et {col2}")


In [26]:
# Load the data from a CSV file
df = pd.read_csv("./gb_notes_v2.csv")

# Diagnose the columns
resultats = diagnostiquer_colonnes(df)

# Display the results: numeric summaries carry a "Moyenne" key, categorical
# summaries carry a "Mode" key.
for col, details in resultats.items():
    print(f"Colonne: {col}, Type: {details['Type']}")
    if "Moyenne" in details:
        print(f"  Moyenne: {details['Moyenne']}")
        print(f"  Médiane: {details['Médiane']}")
        print(f"  Écart-type: {details['Écart-type']}")
        print(f"  Étendue: {details['Étendue']}")
    if "Mode" in details:
        print(f"  Mode: {details['Mode']}")
        print(f"  Effectifs:\n{details['Effectifs']}")
        print(f"  Fréquences:\n{details['Fréquences']}")

# Compute and display the Pearson correlations
correlations = calculer_correls(df)
print("\nCorrélations de Pearson:\n", correlations)

# Compute and display the joint and conditional frequencies for every
# unordered pair of qualitative (object-dtype) columns
colonnes_qualitatives = df.select_dtypes(include=["object"]).columns
for i in range(len(colonnes_qualitatives)):
    for j in range(i + 1, len(colonnes_qualitatives)):
        col1 = colonnes_qualitatives[i]
        col2 = colonnes_qualitatives[j]
        table_conjointe, frequences_conjointes = calculer_frequences_conjointes(
            df, col1, col2
        )
        frequences_conditionnelles = calculer_frequences_conditionnelles(df, col1, col2)

        print(
            f"\nFréquences conjointes entre {col1} et {col2}:\n",
            frequences_conjointes,
        )
        print(
            f"\nFréquences conditionnelles de {col1} par rapport à {col2}:\n",
            frequences_conditionnelles,
        )


Colonne: index, Type: Quantitative Continue
  Moyenne: 65.5
  Médiane: 65.5
  Écart-type: 38.24918299781056
  Étendue: 131
Colonne: math, Type: Quantitative Discrète
  Moyenne: 6.545454545454546
  Médiane: 7.0
  Écart-type: 1.9275568381581714
  Étendue: 8.0
Colonne: eng, Type: Quantitative Discrète
  Moyenne: 6.492424242424242
  Médiane: 7.0
  Écart-type: 2.0657003031196424
  Étendue: 9.0
Colonne: group, Type: Qualitative Nominale
  Mode: science
  Effectifs:
group
science    74
lang       58
Name: count, dtype: int64
  Fréquences:
group
science    0.560606
lang       0.439394
Name: proportion, dtype: float64
Colonne: gender, Type: Qualitative Nominale
  Mode: F
  Effectifs:
gender
F    66
M    66
Name: count, dtype: int64
  Fréquences:
gender
F    0.5
M    0.5
Name: proportion, dtype: float64

Corrélations de Pearson:
           index      math       eng     count
index  1.000000  0.982782  0.047486  0.002194
math   0.982782  1.000000 -0.102480 -0.004115
eng    0.047486 -0.102480  1.0

In [33]:
# savefig cannot write into a folder that does not exist, so create it first.
import os

os.makedirs("GB", exist_ok=True)

# Generate the univariate plots: histogram + box plot for numeric summaries
# (those with a "Moyenne" key), bar plot for categorical ones ("Mode" key).
for col, details in resultats.items():
    if "Moyenne" in details:
        plot_histogram(df, col)
        plt.savefig(f"GB/univariate_{col}_histogram.pdf")
        plt.close()
        plot_boxplot(df, col)
        plt.savefig(f"GB/univariate_{col}_boxplot.pdf")
        plt.close()
    if "Mode" in details:
        plot_barplot(df, col)
        plt.savefig(f"GB/univariate_{col}_barplot.pdf")
        plt.close()

# Generate the bivariate plots
# Scatter plot of "math" against "eng", one colour per gender
plt.figure(figsize=(10, 6))
colors = df["gender"].apply(lambda x: {"M": "blue", "F": "red"}[x])
plt.scatter(df["math"], df["eng"], c=colors, alpha=0.5)
plt.title("Nuage de points des scores en math et eng")
plt.xlabel("Math")
plt.ylabel("Eng")
plt.savefig("GB/bivariate_math_eng_scatter.pdf")
plt.close()

# Histogram of the "math" scores broken down by "gender"
plt.figure(figsize=(10, 6))
genders = df["gender"].unique()
# Give every layer the same value range so all genders share identical bin
# edges; otherwise each hist() call derives its bins from its own subset and
# the overlaid bars do not line up.
math_range = (df["math"].min(), df["math"].max())
for gender in genders:
    subset = df[df["gender"] == gender]
    plt.hist(
        subset["math"],
        weights=subset["count"],
        range=math_range,
        alpha=0.5,
        label=gender,
    )
plt.title("Histogramme des scores en math ventilé par gender")
plt.xlabel("Math")
plt.ylabel("Fréquence")
plt.legend()
plt.savefig("GB/bivariate_math_gender_histogram.pdf")
plt.close()
