In [2]:
import pandas as pd 
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report


In [3]:
# Importation du dataset et creation du dataset

etudiants = pd.read_csv("TB_ETUDIANT.csv", encoding='latin-1', delimiter=';')

df = pd.DataFrame(etudiants)

df.head()

Unnamed: 0,ID_ETUDIANT,Sexe,MoyenneBac,Orientation,NotesMaths,NotesFrançais,NotesPhysChimie,NotesSVT,NotesAnglais,Interets1,Interets2,choixOrientation
0,ET001,F,15.0,L,16.0,16.0,12.75,8.0,6.0,Biologie,Biosciences,Chimie
1,ET002,F,10.0,S,7.0,15.0,20.0,20.0,13.0,Finance / Comptabilité / Statistiques,Business Administration,Business Administration
2,ET003,F,10.0,S,14.0,17.0,17.0,17.0,9.0,Physiques,Biosciences,Biosciences
3,ET004,F,15.0,S,7.0,16.0,12.75,8.0,19.0,Informatique,Genie Logiciel,Informatique
4,ET005,F,12.0,S,9.0,13.0,19.0,19.0,15.0,Informatique,Genie Logiciel,Genie Logiciel


In [4]:
print('Number of rows : ', len(df))
print('---------------------')
# Count the missing values
print('Missing values count : \n', df.isnull().sum())

Number of rows :  1147
---------------------
Missing values count : 
 ID_ETUDIANT         0
Sexe                0
MoyenneBac          0
Orientation         0
NotesMaths          0
NotesFrançais       0
NotesPhysChimie     0
NotesSVT            0
NotesAnglais        0
Interets1           0
Interets2           0
choixOrientation    0
dtype: int64


In [5]:
le_interets = LabelEncoder()
le_interets.fit(df['Interets1'])
le_interets.fit(df['Interets2'])

df['Interets2'] = le_interets.fit_transform(df['Interets2'])
df['Interets1'] = le_interets.fit_transform(df['Interets1'])


# Séparation des variables indépendantes (X) et de la variable cible (y)
X = df[['MoyenneBac', 'NotesMaths', 'NotesFrançais', 'NotesPhysChimie', 'NotesSVT', 'NotesAnglais',  'Interets1', 'Interets2']]
y = df['choixOrientation']

# Division des données en ensembles d'entraînement et de test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)



In [6]:
categorical_cols = ['Interets1', 'Interets2']


preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_cols)
    ],
    remainder='passthrough'
)


decision_tree_model = DecisionTreeClassifier()


pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', decision_tree_model)
])


pipeline.fit(X_train, y_train)

Entrainement et évaluation

In [7]:
classifiers = {
    'Logistic Regression' : LogisticRegression(),
    'decision tree' : DecisionTreeClassifier(),
    'KNN' : KNeighborsClassifier()
}


def train_and_evaluation(classifier):
    for index, (name, classifier) in enumerate(classifier.items()):
        classifier.fit(X_train, y_train)
        y_pred_train = classifier.predict(X_train)
        y_pred_test = classifier.predict(X_test)

        print(f"{name} \n")
        print(f"\t Accuracy (train) : {accuracy_score(y_train, y_pred_train):.2f}")
        print(f"\tAccuracy (test) : {accuracy_score(y_test, y_pred_test):.2f}")
        print(f"{classification_report(y_test, y_pred_test, zero_division=1)}\n")




In [8]:
train_and_evaluation(classifiers)

Logistic Regression 

	 Accuracy (train) : 0.56
	Accuracy (test) : 0.56
                         precision    recall  f1-score   support

                Anglais       0.95      0.71      0.82        28
               Biologie       1.00      0.00      0.00         1
            Biosciences       0.71      0.76      0.74        51
Business Administration       1.00      0.00      0.00        20
Comptabilité et Finance       0.61      0.88      0.72        75
               Economie       1.00      0.00      0.00         1
         Finance Banque       0.00      1.00      0.00         0
            Genie Civil       1.00      0.00      0.00        16
       Genie Electrique       1.00      0.00      0.00        12
         Genie Logiciel       0.53      0.42      0.47        38
        Genie Mecanique       1.00      0.00      0.00        11
           Informatique       0.45      0.91      0.60        54
          Mathematiques       0.00      0.00      1.00        18
              Phy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [13]:
# Nouvel étudiant pour la prédiction
nouvel_etudiant = {
    'MoyenneBac' : 10,
    'NotesMaths': 15,
    'NotesFrançais': 12,
    'NotesPhysChimie': 15,
    'NotesSVT': 15,
    'NotesAnglais': 12,
    'Interets1': 'Chimie',  
    'Interets2': 'Genie Logiciel' 
}

# Création d'un DataFrame pour le nouvel étudiant
df = pd.DataFrame([nouvel_etudiant])

# Prédiction pour le nouvel étudiant
prediction = pipeline.predict(df)

# Affichage de la prédiction pour le nouvel étudiant
print("Recommandations d'orientation pour le nouvel étudiant :")
print(f"Prédiction : {prediction}")

Recommandations d'orientation pour le nouvel étudiant :
Prédiction : ['Genie Logiciel']
