In [25]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report, accuracy_score
import pickle # For saving encoders and scaler
import joblib # For saving the scikit-learn model
import os


In [26]:
# Load dataset (adjust the path to your CSV file)
df = pd.read_csv("../data/Recommandation_de_formations_expanded (1).csv")
df.head()

Unnamed: 0,Étudiant_ID,Compétence_1,Compétence_2,Compétence_3,Formation_Suivie,Durée_Formation,Note_Formation,Centre_Intérêt,Formation_Recommandée
0,1,C++,Game Development,Unity,Game Development Intermédiaire,18,4.8,Game Dev,Unity 3D
1,2,Rust,Operating System,System Programming,Systèmes d'Exploitation Avancés,24,4.9,OS Dev,Linux Kernel
2,3,Swift,iOS Development,Machine Learning,Développement d'App iOS,22,4.6,Mobile App Dev,Core ML
3,4,PHP,Web Development,MySQL,Développement Web Avancé,16,4.1,Web Dev,Laravel
4,5,Go,Cloud Computing,Kubernetes,Développement d'Application Cloud,20,4.5,Cloud Dev,Google Cloud


In [27]:
# Check for null values and basic info
print(df.info())
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 9 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   Étudiant_ID            1000 non-null   int64  
 1   Compétence_1           1000 non-null   object 
 2   Compétence_2           1000 non-null   object 
 3   Compétence_3           1000 non-null   object 
 4   Formation_Suivie       1000 non-null   object 
 5   Durée_Formation        1000 non-null   int64  
 6   Note_Formation         1000 non-null   float64
 7   Centre_Intérêt         1000 non-null   object 
 8   Formation_Recommandée  1000 non-null   object 
dtypes: float64(1), int64(2), object(6)
memory usage: 70.4+ KB
None
Étudiant_ID              0
Compétence_1             0
Compétence_2             0
Compétence_3             0
Formation_Suivie         0
Durée_Formation          0
Note_Formation           0
Centre_Intérêt           0
Formation_Recommandée    0
dtype: int64

In [28]:
# Columns to encode
categorical_cols = ['Compétence_1', 'Compétence_2', 'Compétence_3',
                    'Formation_Suivie', 'Centre_Intérêt', 'Formation_Recommandée']

# Encode all categorical columns
encoders = {}
for col in categorical_cols:
    encoders[col] = LabelEncoder()
    df[col] = encoders[col].fit_transform(df[col])

df.head()

Unnamed: 0,Étudiant_ID,Compétence_1,Compétence_2,Compétence_3,Formation_Suivie,Durée_Formation,Note_Formation,Centre_Intérêt,Formation_Recommandée
0,1,1,8,22,21,18,4.8,10,26
1,2,14,11,20,23,24,4.9,12,14
2,3,17,17,11,14,22,4.6,11,7
3,4,9,16,14,8,16,4.1,15,13
4,5,2,4,9,16,20,4.5,4,9


In [29]:
# Separate features (X) and target (y)
X = df.drop(['Étudiant_ID', 'Formation_Recommandée'], axis=1)
y = df['Formation_Recommandée']

In [30]:
# Normalize Durée_Formation and Note_Formation
scaler = StandardScaler()
X[['Durée_Formation', 'Note_Formation']] = scaler.fit_transform(X[['Durée_Formation', 'Note_Formation']])
X.head()

Unnamed: 0,Compétence_1,Compétence_2,Compétence_3,Formation_Suivie,Durée_Formation,Note_Formation,Centre_Intérêt
0,1,8,22,21,0.039488,0.982105,10
1,14,11,20,23,0.950753,1.325498,12
2,17,17,11,14,0.646998,0.295318,11
3,9,16,14,8,-0.264267,-1.421648,15
4,2,4,9,16,0.343243,-0.048075,4


In [31]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


In [32]:
# Train the model
model = KNeighborsClassifier(n_neighbors=3)
model.fit(X_train, y_train)

In [33]:
# Predict and evaluate
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

Accuracy: 1.0

Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00         5
           1       1.00      1.00      1.00         6
           2       1.00      1.00      1.00         6
           3       1.00      1.00      1.00         9
           4       1.00      1.00      1.00         5
           5       1.00      1.00      1.00        12
           6       1.00      1.00      1.00        11
           7       1.00      1.00      1.00         9
           8       1.00      1.00      1.00         8
           9       1.00      1.00      1.00        15
          10       1.00      1.00      1.00         5
          11       1.00      1.00      1.00         3
          12       1.00      1.00      1.00        12
          13       1.00      1.00      1.00         1
          14       1.00      1.00      1.00         5
          15       1.00      1.00      1.00         5
          16       1.00      1.00      1.0

In [34]:
# Inverse transform predictions to get course names
recommended_courses = encoders['Formation_Recommandée'].inverse_transform(y_pred)

print("Recommended Courses:")
print(recommended_courses)

Recommended Courses:
['Red Hat' 'Red Hat' 'Xcode' 'Spring Cloud' '.NET Core' 'AWS Lambda'
 'OpenCV' 'Core ML' 'R Studio' 'TensorFlow' '.NET Core' 'OpenCV'
 'TensorFlow' 'Jira' 'Symfony' 'Jira' 'Deep Learning' 'Android Studio'
 '.NET Core' 'R Studio' 'Jira' 'Unreal Engine' 'Red Hat' 'ARKit'
 'Spring Cloud' 'R Studio' 'Linux Kernel' 'TensorFlow' 'Core ML'
 'TensorFlow' 'Red Hat' 'Google Cloud' 'Red Hat' 'Jira' 'TensorFlow'
 'Scikit-Learn' 'Red Hat' 'Apache Spark' 'Linux Kernel' 'R Studio'
 'AWS Lambda' 'Red Hat' 'Ruby on Rails' 'Unreal Engine' 'Jira' 'ARKit'
 'Unreal Engine' 'Linux Kernel' 'Google Cloud' 'Red Hat' 'TensorFlow'
 'Angular 2' '.NET Core' 'AWS Lambda' 'Jira' 'Deep Learning'
 'Android Studio' 'Spring Cloud' 'Red Hat' 'TensorFlow' 'Red Hat'
 'Google Cloud' 'MATLAB' 'R Studio' 'Google Cloud' 'MATLAB'
 'Ruby on Rails' 'Deep Learning' 'TensorFlow' 'Unreal Engine'
 'Android Studio' 'Unreal Engine' 'Xcode' 'OpenSSL' 'MATLAB'
 'Google Cloud' 'Core ML' 'Google Cloud' 'Google Cloud' '

In [35]:
# --- Define Save Directory ---
save_directory = "../data" # Relative path to the 'data' directory
os.makedirs(save_directory, exist_ok=True) # Create 'data' directory if it doesn't exist

# --- 1. Save Trained KNN Model (using joblib) ---
model_filename = "your_knn_model.joblib" # Using .joblib extension
model_save_path = os.path.join(save_directory, model_filename)
joblib.dump(model, model_save_path)
print(f"Trained KNN model saved to: {model_save_path}")

# --- 2. Save Label Encoders ---
encoders_filename = "label_encoders.pkl" # Clearer name
encoders_save_path = os.path.join(save_directory, encoders_filename)
with open(encoders_save_path, "wb") as f:
    pickle.dump(encoders, f) # 'encoders' is the dictionary from Cell 4
print(f"Label encoders saved to: {encoders_save_path}")

# --- 3. Save Standard Scaler ---
scaler_filename = "standard_scaler.pkl"
scaler_save_path = os.path.join(save_directory, scaler_filename)
with open(scaler_save_path, "wb") as f:
    pickle.dump(scaler, f) # 'scaler' is the StandardScaler object from Cell 6
print(f"Standard scaler saved to: {scaler_save_path}")

# --- 4. Save Original Dataset (df_reference.csv) ---
# Ensure df_original is loaded correctly if not already available in the notebook's current state.
# If 'df' from Cell 2 is the original unmodified dataframe, you can use that.
# Or, reload it to be certain:
df_original_for_reference = pd.read_csv("../data/Recommandation_de_formations_expanded (1).csv")

reference_df_filename = "df_reference.csv"
reference_df_save_path = os.path.join(save_directory, reference_df_filename)
df_original_for_reference.to_csv(reference_df_save_path, index=False)
print(f"Reference DataFrame saved to: {reference_df_save_path}")

Trained KNN model saved to: ../data\your_knn_model.joblib
Label encoders saved to: ../data\label_encoders.pkl
Standard scaler saved to: ../data\standard_scaler.pkl
Reference DataFrame saved to: ../data\df_reference.csv
