<a href="https://colab.research.google.com/github/Sela80/Data-Science-/blob/main/R%C3%A9ussite_des_%C3%A9tudiants_facteurs_et_perspectives.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install opendatasets --upgrade --quiet
!pip install category_encoders
!pip install category_encoders --upgrade --quiet

In [None]:
# --- 1. Importation des bibliothèques ---
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas as pd
import joblib
import plotly.express as px
from category_encoders import CatBoostEncoder
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
import matplotlib.pyplot as plt
import seaborn as sns
# Modèles de régression
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor,RandomForestClassifier
from sklearn.svm import SVR
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Fetch the Kaggle dataset with opendatasets.
import opendatasets as od

DATASET_URL = "https://www.kaggle.com/datasets/anassarfraz13/student-success-factors-and-insights"
od.download(DATASET_URL)

In [None]:

# --- 2. Load and first look at the data ---
# Read the CSV into a pandas DataFrame.
# opendatasets writes the dataset into a folder named after the dataset slug,
# relative to the current working directory, so a relative path is used here
# instead of the Colab-specific absolute '/content/...' location.
df1 = pd.read_csv('student-success-factors-and-insights/StudentPerformanceFactors.csv')
df1.head()

In [None]:
# Print the column names, dtypes and non-null counts of the raw frame.
df1.info()

In [None]:
# Show the column labels of the raw frame.
df1.columns

In [None]:
# Work on a copy so the raw frame df1 is left untouched by the cleaning steps.
df = df1.copy()

In [None]:
# Count the missing values in each column.
df.isna().sum()

In [None]:
# Fill the missing values of the three listed columns with each column's most
# frequent value (mode).
# The result is assigned back to df[col] rather than calling
# fillna(..., inplace=True) on df[col]: that form updates a temporary Series
# (chained assignment), which emits a FutureWarning in pandas 2.x (silenced by
# the global warnings filter in the import cell) and leaves df unchanged once
# Copy-on-Write is active.
for col in ['Teacher_Quality', 'Parental_Education_Level', 'Distance_from_Home']:
    df[col] = df[col].fillna(df[col].mode()[0])

# Re-count missing values to confirm the imputation took effect.
df.isna().sum()

In [None]:
# Count the rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Summary statistics of the numeric columns, one row per column.
df.describe().T

# Prétraitement

In [None]:
# Column groups consumed by the preprocessing step:
#   num_cols - every int64 column except the target 'Exam_Score'
#   cat_cols - every object-dtype column
# Columns of any other dtype (e.g. float) end up in neither group.
int_frame = df.select_dtypes(include=['int64'])
num_cols = int_frame.columns.drop('Exam_Score')
cat_cols = df.select_dtypes(include=['object']).columns

In [None]:
# Separate the target column (Exam_Score) from the feature matrix.
y = df['Exam_Score']
x = df.drop(columns='Exam_Score')

In [None]:
# Column-wise preprocessing shared by every model pipeline below:
# standardise the numeric columns and CatBoost-encode the object columns.
numeric_step = ('num', StandardScaler(), num_cols)
categorical_step = ('cat', CatBoostEncoder(), cat_cols)
preprocessor = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [None]:
# Hold out 20% of the rows as a test set; the seed is fixed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

# Report the shape of each split.
for label, part in (("Taille train:", x_train), ("Taille test:", x_test)):
    print(label, part.shape)

In [None]:
# Baseline pipeline: shared preprocessing followed by ordinary least squares.
linear_regression = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

In [None]:
# Random-forest regressor behind the same shared preprocessing step.
# (Both names are also imported in the import cell at the top.)
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline

forest_regressor = RandomForestRegressor(random_state=42)
random_forest = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', forest_regressor),
])

In [None]:
# Random-forest classifier behind the same shared preprocessing step.
# NOTE(review): this pipeline is later fitted on y = Exam_Score and scored with
# MAE / MSE / R2 next to the regressors. A classifier treats every distinct
# score as its own class - confirm that this comparison is intended.
forest_classifier = RandomForestClassifier(random_state=42)
random_forest_classifier = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', forest_classifier),
])

In [None]:
# Fit every candidate pipeline on the training split, score it on the test
# split, then print the scores model by model.
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

models_to_evaluate = {
    'Linear Regression': linear_regression,
    'Random Forest': random_forest,
    'Random Forest Classifier': random_forest_classifier,
}

# Metric label -> scoring function, in the order the scores are reported.
scorers = {
    'MAE': mean_absolute_error,
    'MSE': mean_squared_error,
    'R2': r2_score,
}

results = {}
for name, model in models_to_evaluate.items():
    model.fit(x_train, y_train)
    y_pred = model.predict(x_test)
    results[name] = {label: score(y_test, y_pred) for label, score in scorers.items()}

for name, metrics in results.items():
    print(f"Metrics for {name}:")
    for label in ('MAE', 'MSE', 'R2'):
        print(f"{label}: {metrics[label]:.2f}")
    print()


# Prédire sur les bases de Train et Test

In [None]:
# Predict exam scores with the fitted linear-regression pipeline on the
# training split and on the test split (regression outputs, not classes).
y_train_pred, y_test_pred = (
    linear_regression.predict(part) for part in (x_train, x_test)
)

In [None]:
# Score the linear-regression predictions on both splits and tabulate them.
# mean_absolute_error, mean_squared_error and r2_score are the names imported
# in the model-comparison cell; the classification metrics imported here are
# not used by this cell.
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    confusion_matrix,
    classification_report,
    RocCurveDisplay,
    recall_score,
)

# (true values, predictions) for each split, training split first.
split_pairs = ((y_train, y_train_pred), (y_test, y_test_pred))

train_MAE, test_MAE = (mean_absolute_error(truth, pred) for truth, pred in split_pairs)
train_MSE, test_MSE = (mean_squared_error(truth, pred) for truth, pred in split_pairs)
train_R2, test_R2 = (r2_score(truth, pred) for truth, pred in split_pairs)

# Build the performance table: one row per split, one column per metric.
performance_Table = pd.DataFrame({
    'DataSet': ['Train', 'Test'],
    'MAE': [train_MAE, test_MAE],
    'MSE': [train_MSE, test_MSE],
    'R2': [train_R2, test_R2],
})

# Print the performance table.
print(performance_Table)

In [None]:
# Convert the table to long format (one row per split/metric pair) so seaborn
# can group the bars by metric and colour them by split.
performance_melted = pd.melt(
    performance_Table,
    id_vars='DataSet',
    var_name='Metric',
    value_name='Score',
)

# Grouped bar chart; note that MAE, MSE and R2 share one y-axis.
plt.figure(figsize=(10, 6))
sns.barplot(data=performance_melted, x='Metric', y='Score', hue='DataSet')
plt.ylabel('Score')
plt.title('Model Performance Comparison')
plt.show()

In [None]:
# Persist the fitted linear-regression pipeline, then load it back from disk.
# The reloaded pipeline is the cell's last expression, so it is shown as output.
MODEL_PATH = 'linear_regression_model.pkl'
joblib.dump(linear_regression, MODEL_PATH)
joblib.load(MODEL_PATH)

In [None]:
# Offer the saved model file as a browser download when running in Google Colab.
# The google.colab package is only importable inside Colab; elsewhere the
# import fails, so print a message instead of aborting a full run.
try:
    from google.colab import files
except ImportError:
    print("google.colab is not available; the model file stays at 'linear_regression_model.pkl'.")
else:
    files.download('linear_regression_model.pkl')