# Model Experiment Notebook

This notebook compares machine learning models (Random Forest and Logistic Regression) for predicting a student's recommended IT/CS major from the student's inputs.

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
import joblib

# Read the sample student records into a DataFrame.
# The path is relative, so it is resolved against the current working directory.
dataset_path = '../data/sample_students.csv'
data = pd.read_csv(dataset_path)

# Preprocess data (this should be done in preprocess.py)
# For now, let's assume we have a function to preprocess the data
def preprocess_data(data):
    """One-hot encode the categorical columns of the student frame.

    Args:
        data: DataFrame containing at least the five categorical columns
            listed in ``categorical_columns`` below.

    Returns:
        A new DataFrame in which each listed column is replaced by its
        indicator (dummy) columns; all other columns are passed through
        untouched.
    """
    categorical_columns = [
        'strongest_subjects',
        'preferred_task',
        'future_career_goal',
        'preferred_work_type',
        'preferred_thinking_style',
    ]
    return pd.get_dummies(data, columns=categorical_columns)

# Replace the raw frame with its one-hot encoded version.
data = preprocess_data(data)

# Separate the label column from the predictor columns.
y = data['recommended_major']
X = data.drop(columns=['recommended_major'])

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a 100-tree random forest on the training split (seeded for repeatability).
# fit() returns the estimator itself, so construction and training can be chained.
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Score the forest on the held-out rows: overall accuracy, then the per-class report.
y_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)
print("Random Forest Accuracy:", rf_accuracy)
print(classification_report(y_test, y_pred))

# Serialize the fitted forest to ../backend/ml/model.pkl.
joblib.dump(rf_model, '../backend/ml/model.pkl')

# Fit a logistic regression on the same training split, allowing the solver
# up to 200 iterations. fit() returns the estimator itself, so the calls are chained.
lr_model = LogisticRegression(max_iter=200).fit(X_train, y_train)

# Score the regression on the held-out rows: overall accuracy, then the per-class report.
y_pred_lr = lr_model.predict(X_test)
lr_accuracy = accuracy_score(y_test, y_pred_lr)
print("Logistic Regression Accuracy:", lr_accuracy)
print(classification_report(y_test, y_pred_lr))

# Serialize the fitted regression to ../backend/ml/lr_model.pkl.
joblib.dump(lr_model, '../backend/ml/lr_model.pkl')