In [1]:
!pip install pandas numpy matplotlib seaborn scikit-learn imbalanced-learn

# Student Track Prediction - Complete ML Pipeline
# Predicting Track_Name based on student features
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, StratifiedKFold
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.feature_selection import SelectKBest, chi2, mutual_info_classif
from sklearn.decomposition import PCA
from sklearn.utils import resample
from imblearn.over_sampling import SMOTE
from collections import Counter
import warnings
warnings.filterwarnings('ignore')

# Global look-and-feel applied to every figure produced below.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("=== STEP 1: DATA LOADING AND INITIAL EXPLORATION ===")

# Read the CSV export into a DataFrame.
df = pd.read_csv('After some cleaning.csv')

print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

print("\nFirst few rows:")
print(df.head())

print("\nDataset info:")
df.info()

# Per-column null counts are computed once and reused for the printout and the chart.
null_counts = df.isnull().sum()
print("\nMissing values:")
print(null_counts)

# Horizontal bar chart of the columns that actually contain nulls, largest first.
plt.figure(figsize=(12, 8))
missing_data = null_counts.loc[null_counts.gt(0)].sort_values(ascending=False)
plt.barh(missing_data.index, missing_data.values)
plt.title('Missing Values by Column')
plt.xlabel('Number of Missing Values')
plt.tight_layout()
plt.show()

print("\n=== STEP 2: DATA PREPROCESSING AND CLEANING ===")
# All cleaning happens on a copy so the raw frame `df` stays untouched.
df_processed = df.copy()

# Columns that would leak the answer or carry no predictive value.
columns_to_drop = [
    'Student_ID', 'Student_Fname', 'Student_Lname', 'Student_Address', 'Student_Name',
    'Jop_Website', 'Jop_Statue', 'Course_ID', 'Intake_ID', 'Course_Name'
]

# errors='ignore' skips any listed column that is absent from this dataset.
df_processed = df_processed.drop(columns=columns_to_drop, errors='ignore')

# Rows without a target label cannot be used for supervised training.
print(f"Missing values in Track_Name: {df_processed['Track_Name'].isnull().sum()}")
df_processed = df_processed.dropna(subset=['Track_Name'])

# Replace nulls in every text column except the target with an explicit 'Unknown' category.
categorical_columns = df_processed.select_dtypes(include=['object']).columns
for col in categorical_columns.drop('Track_Name', errors='ignore'):
    df_processed[col] = df_processed[col].fillna('Unknown')

print(f"Dataset shape after cleaning: {df_processed.shape}")
print(f"Remaining missing values: {df_processed.isnull().sum().sum()}")

print("\n=== STEP 3: EXPLORATORY DATA ANALYSIS ===")

# Target distribution: bar chart in the left panel, pie chart in the right panel.
track_counts = df_processed['Track_Name'].value_counts()
plt.figure(figsize=(12, 6))

plt.subplot(1, 2, 1)
track_counts.plot(kind='bar')
plt.title('Track_Name Distribution')
plt.xlabel('Track Name')
plt.ylabel('Count')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
plt.pie(track_counts.values, labels=track_counts.index, autopct='%1.1f%%')
plt.title('Track_Name Distribution (Pie Chart)')
plt.tight_layout()
plt.show()

# Top-10 value counts for a handful of key features, one panel per feature.
features_to_analyze = ['Faculty', 'Certification_Field', 'Gender', 'Company_Type', 'Jop_Category']
fig, axes = plt.subplots(2, 3, figsize=(18, 12))
axes = axes.flatten()

# zip keeps each feature tied to its own panel even when a feature is skipped.
for panel, feature in zip(axes, features_to_analyze):
    if feature not in df_processed.columns:
        continue
    top_values = df_processed[feature].value_counts().head(10)
    top_values.plot(kind='bar', ax=panel)
    panel.set_title(f'{feature} Distribution')
    panel.tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# Cross-tabulate selected categorical features against the target, one 2x2 panel each.
plt.figure(figsize=(15, 10))
correlation_features = ['Faculty', 'Certification_Field', 'Gender', 'Company_Type']
for position, feature in enumerate(correlation_features, start=1):
    if feature not in df_processed.columns:
        continue
    plt.subplot(2, 2, position)
    contingency = pd.crosstab(df_processed[feature], df_processed['Track_Name'])
    contingency.plot(kind='bar', ax=plt.gca())
    plt.title(f'{feature} vs Track_Name')
    plt.legend(bbox_to_anchor=(1.05, 1), loc='upper left')
    plt.xticks(rotation=45)

plt.tight_layout()
plt.show()


print("\n=== STEP 4: FEATURE ENGINEERING ===")

# Cert_Faculty_Match: 1 when the certification field matches one of the tech
# keywords AND the faculty matches 'Computer' or 'Engineering'.
# NOTE(review): this is a case-insensitive substring match, so short tokens such
# as 'AI' and 'ML' also hit unrelated words (e.g. "Retail", "HTML") — confirm
# that is intended. predict_track() repeats the same patterns and must stay in sync.
is_tech_cert = df_processed['Certification_Field'].str.contains(
    'Computer|Data|AI|ML|Backend|Frontend|Database', case=False, na=False)
is_tech_faculty = df_processed['Faculty'].str.contains(
    'Computer|Engineering', case=False, na=False)
df_processed['Cert_Faculty_Match'] = (is_tech_cert & is_tech_faculty).astype(int)

# Job_Cert_Alignment: 1 when the job category string equals the certification field.
df_processed['Job_Cert_Alignment'] = (
    df_processed['Jop_Category'] == df_processed['Certification_Field']
).astype(int)

# Is_Egypt: 1 when the company location is exactly the string 'Egypt'.
# (A stray `value_counts()[1]` expression used to follow this line; it had no
# effect and raised KeyError whenever no row was located in Egypt.)
df_processed['Is_Egypt'] = (df_processed['Company_location'] == 'Egypt').astype(int)

# Create experience level based on job title keywords.
experience_mapping = {
    'Junior': ['Junior', 'Assistant', 'Entry'],
    'Mid': ['Developer', 'Analyst', 'Engineer'],
    'Senior': ['Senior', 'Lead', 'Manager', 'Specialist']
}

# Titles that match no keyword keep the 'Junior' default. Levels are applied in
# dict order (Junior -> Mid -> Senior) and later assignments overwrite earlier
# ones, so a title such as "Senior Developer" ends up as 'Senior'.
df_processed['Experience_Level'] = 'Junior'  # Default
for level, keywords in experience_mapping.items():
    for keyword in keywords:
        mask = df_processed['Jop_Title'].str.contains(keyword, case=False, na=False)
        df_processed.loc[mask, 'Experience_Level'] = level

print("New features created:")
print(f"- Cert_Faculty_Match: {df_processed['Cert_Faculty_Match'].sum()} matches")
print(f"- Job_Cert_Alignment: {df_processed['Job_Cert_Alignment'].sum()} aligned")
# The label used to say "Is_Cairo"/"Cairo-based", but the feature tests for 'Egypt'.
print(f"- Is_Egypt: {df_processed['Is_Egypt'].sum()} Egypt-based")
print(f"- Experience_Level distribution: {df_processed['Experience_Level'].value_counts().to_dict()}")

from functools import partial

# Define features for modeling: raw categorical columns plus the engineered ones.
feature_columns = [
    'Gender', 'Faculty', 'Certification_Field', 'Certification_Soutce',
    'Company_Type', 'Company_location', 'Jop_Category', 'Jop_Title',
    'Cert_Faculty_Match', 'Job_Cert_Alignment', 'Is_Egypt', 'Experience_Level']


# Filter features that exist in the dataset
available_features = [col for col in feature_columns if col in df_processed.columns]
print(f"Available features for modeling: {available_features}")

# Prepare data for modeling
X = df_processed[available_features].copy()
y = df_processed['Track_Name'].copy()

# Label encode all categorical features; the fitted encoders are kept so the
# same mapping can be re-applied at prediction time.
# NOTE(review): encoders and the selector below are fitted on the full dataset,
# i.e. before any train/test split — consider fitting on the training fold only.
label_encoders = {}
for col in X.columns:
    if X[col].dtype == 'object':
        le = LabelEncoder()
        X[col] = le.fit_transform(X[col].astype(str))
        label_encoders[col] = le

# Label encode target variable
le_target = LabelEncoder()
y_encoded = le_target.fit_transform(y)

# Feature selection using mutual information.
# - discrete_features=True: every column here is an integer category code or a
#   0/1 flag, so the discrete estimator is the appropriate one (the default
#   treats dense input as continuous).
# - random_state=42: the estimator is randomized; seeding it makes the selected
#   feature set reproducible, matching the fixed seeds used elsewhere.
# functools.partial (rather than a lambda) keeps the fitted selector picklable.
mi_score_func = partial(mutual_info_classif, discrete_features=True, random_state=42)
selector = SelectKBest(score_func=mi_score_func, k=min(10, len(available_features)))
X_selected = selector.fit_transform(X, y_encoded)
selected_features = [available_features[i] for i in selector.get_support(indices=True)]

print(f"Selected features: {selected_features}")

# Visualize the mutual-information score of every candidate feature.
feature_scores = selector.scores_
plt.figure(figsize=(12, 6))
feature_importance_df = pd.DataFrame({
    'Feature': available_features,
    'Importance': feature_scores
}).sort_values('Importance', ascending=False)

plt.barh(feature_importance_df['Feature'], feature_importance_df['Importance'])
plt.title('Feature Importance (Mutual Information)')
plt.xlabel('Importance Score')
plt.tight_layout()
plt.show()

print("\n=== STEP 6: DATA AUGMENTATION ===")

# Hold out the test set BEFORE oversampling. SMOTE builds synthetic rows by
# interpolating between real rows; if it runs on the full dataset first, the
# synthetic training rows are partly derived from rows that later land in the
# test set, and the reported accuracy is optimistically biased.
X_train_raw, X_test, y_train_raw, y_test = train_test_split(
    X_selected, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# SMOTE needs more samples in every class than k_neighbors (default 5), so
# shrink k when the rarest training class is small.
_, train_class_counts = np.unique(y_train_raw, return_counts=True)
k_neighbors = max(1, min(5, int(train_class_counts.min()) - 1))

# Oversample the training portion only; the test set keeps its real distribution.
# NOTE(review): SMOTE interpolates numerically between label-encoded category
# codes; a categorical-aware sampler may be more appropriate — confirm.
smote = SMOTE(random_state=42, k_neighbors=k_neighbors)
X_resampled, y_resampled = smote.fit_resample(X_train_raw, y_train_raw)
X_train, y_train = X_resampled, y_resampled

print(f"Original dataset size: {len(X_selected)}")
print(f"Training set size before augmentation: {len(X_train_raw)}")
print(f"Augmented dataset size: {len(X_resampled)}")
print(f"Held-out test set size: {len(X_test)}")

# Visualize class distribution: full original data (left) vs augmented training data (right).
plt.figure(figsize=(12, 5))
plt.subplot(1, 2, 1)
unique, counts = np.unique(y_encoded, return_counts=True)
plt.bar([le_target.classes_[i] for i in unique], counts)
plt.title('Original Class Distribution')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
unique, counts = np.unique(y_resampled, return_counts=True)
plt.bar([le_target.classes_[i] for i in unique], counts)
plt.title('Augmented Class Distribution')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

print("\n=== STEP 7: MODEL TRAINING AND EVALUATION ===")
# X_train / y_train (augmented) and X_test / y_test (untouched) were produced above.

# Candidate classifiers, all seeded for reproducibility.
models = {
    'Random Forest': RandomForestClassifier(random_state=42, n_estimators=100),
    'Gradient Boosting': GradientBoostingClassifier(random_state=42, n_estimators=100),
    'Logistic Regression': LogisticRegression(random_state=42, max_iter=1000),
    'SVM': SVC(random_state=42, probability=True)
}

# Per-model summary metrics and the raw per-fold CV scores.
model_results = {}
cv_scores = {}

# One seeded splitter shared by all models: the fixed random_state yields the
# same five stratified folds on every use.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

for name, model in models.items():
    print(f"\nTraining {name}...")

    # 5-fold cross-validated accuracy on the training data.
    scores = cross_val_score(model, X_train, y_train, cv=cv, scoring='accuracy')
    cv_scores[name] = scores

    # Fit on the whole training set, then score on the held-out test set.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)

    mean_score, std_score = scores.mean(), scores.std()
    model_results[name] = {
        'model': model,
        'accuracy': accuracy,
        'cv_mean': mean_score,
        'cv_std': std_score
    }

    print(f"{name} - Accuracy: {accuracy:.4f}, CV Score: {scores.mean():.4f} (+/- {scores.std()*2:.4f})")

# Side-by-side comparison of the candidates in three panels.
model_names = list(models)
plt.figure(figsize=(15, 5))

# Panel 1: mean CV accuracy with one standard deviation as the error bar.
plt.subplot(1, 3, 1)
cv_means = [model_results[key]['cv_mean'] for key in model_names]
cv_stds = [model_results[key]['cv_std'] for key in model_names]
plt.bar(model_names, cv_means, yerr=cv_stds, capsize=5)
plt.title('Cross-Validation Scores')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)

# Panel 2: accuracy on the held-out test set.
plt.subplot(1, 3, 2)
test_accuracies = [model_results[key]['accuracy'] for key in model_names]
plt.bar(model_names, test_accuracies)
plt.title('Test Set Accuracy')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)

# Panel 3: spread of the per-fold CV scores.
plt.subplot(1, 3, 3)
cv_data = [cv_scores[key] for key in model_names]
plt.boxplot(cv_data, labels=model_names)
plt.title('CV Score Distribution')
plt.ylabel('Accuracy')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

print("\n=== STEP 8: HYPERPARAMETER TUNING ===")

# Select the best model by mean cross-validated accuracy. Choosing by test-set
# accuracy would turn the test set into a model-selection set, so the "final"
# accuracy reported below would no longer be an unbiased estimate.
best_model_name = max(model_results, key=lambda name: model_results[name]['cv_mean'])
print(f"Best model: {best_model_name}")

# A grid search is only defined for Random Forest; any other winner is used
# exactly as it was fitted in the comparison step.
if best_model_name == 'Random Forest':
    param_grid = {
        'n_estimators': [100, 200, 300],
        'max_depth': [None, 10, 20, 30],
        'min_samples_split': [2, 5, 10],
        'min_samples_leaf': [1, 2, 4]
    }

    # Exhaustive search over the grid with 5-fold CV on the training data only.
    rf = RandomForestClassifier(random_state=42)
    grid_search = GridSearchCV(rf, param_grid, cv=5, scoring='accuracy', n_jobs=-1)
    grid_search.fit(X_train, y_train)

    best_model = grid_search.best_estimator_
    print(f"Best parameters: {grid_search.best_params_}")

else:
    best_model = model_results[best_model_name]['model']

# Final evaluation: the test set is consulted once, for the chosen model.
y_pred_final = best_model.predict(X_test)
final_accuracy = accuracy_score(y_test, y_pred_final)
print(f"Final model accuracy: {final_accuracy:.4f}")

print("\n=== STEP 9: FINAL RESULTS AND VISUALIZATION ===")

# LabelEncoder assigns codes 0..n_classes-1 in the order of le_target.classes_.
# Passing that full code range explicitly keeps rows/columns aligned with the
# class names even when a class is absent from y_test and the predictions
# (otherwise classification_report raises on the target_names length mismatch
# and the confusion-matrix tick labels no longer line up).
class_indices = np.arange(len(le_target.classes_))

# Classification report
print("\nClassification Report:")
print(classification_report(
    y_test, y_pred_final,
    labels=class_indices,
    target_names=le_target.classes_,
))

# Confusion matrix: rows are true classes, columns are predicted classes.
plt.figure(figsize=(12, 8))
cm = confusion_matrix(y_test, y_pred_final, labels=class_indices)
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le_target.classes_,
            yticklabels=le_target.classes_)
plt.title('Confusion Matrix')
plt.ylabel('True Label')
plt.xlabel('Predicted Label')
plt.tight_layout()
plt.show()

# Feature importance — only estimators that expose `feature_importances_`
# get this chart; others are skipped.
if hasattr(best_model, 'feature_importances_'):
    plt.figure(figsize=(10, 6))
    importance_df = pd.DataFrame({
        'Feature': selected_features,
        'Importance': best_model.feature_importances_
    }).sort_values('Importance', ascending=False)

    plt.barh(importance_df['Feature'], importance_df['Importance'])
    plt.title('Feature Importance from Best Model')
    plt.xlabel('Importance')
    plt.tight_layout()
    plt.show()

# Persist everything needed to reproduce a prediction later.
import joblib
import os

model_dir = 'track_prediction_model'
os.makedirs(model_dir, exist_ok=True)

# File name -> object, in write order: the model, one encoder per categorical
# feature, the target encoder, then the fitted feature selector.
artifacts = {'best_model.pkl': best_model}
artifacts.update({f'le_{col}.pkl': encoder for col, encoder in label_encoders.items()})
artifacts['le_target.pkl'] = le_target
artifacts['feature_selector.pkl'] = selector

for filename, artifact in artifacts.items():
    joblib.dump(artifact, os.path.join(model_dir, filename))

# Plain-text list of the selected feature names, one per line.
with open(os.path.join(model_dir, 'selected_features.txt'), 'w') as f:
    f.writelines(f"{feature}\n" for feature in selected_features)

print(f"Model and encoders saved to '{model_dir}' directory")

# Example prediction
def predict_track(sample_data):
    """Predict the track for a single student record.

    The model, target encoder and feature selector are loaded from ``model_dir``;
    the per-feature label encoders, ``available_features`` and
    ``experience_mapping`` are read from module scope.

    Args:
        sample_data: Mapping of raw column name to value for one student. It
            must contain 'Certification_Field', 'Faculty', 'Jop_Category',
            'Company_location' and 'Jop_Title' (used to build the engineered
            features) as well as every raw column in ``available_features``.

    Returns:
        A tuple ``(predicted_track, probabilities)`` where ``predicted_track``
        is the decoded class label and ``probabilities`` is the model's
        ``predict_proba`` row for the sample.

    Raises:
        KeyError: If a required column is missing from ``sample_data``.
    """
    # Load saved components
    model = joblib.load(os.path.join(model_dir, 'best_model.pkl'))
    target_encoder = joblib.load(os.path.join(model_dir, 'le_target.pkl'))
    feat_selector = joblib.load(os.path.join(model_dir, 'feature_selector.pkl'))

    # One-row frame so the same vectorised string operations used in training apply.
    sample_df = pd.DataFrame([sample_data])

    # Engineered features — these must mirror the training-time definitions exactly.
    is_tech_cert = sample_df['Certification_Field'].str.contains(
        'Computer|Data|AI|ML|Backend|Frontend|Database', case=False, na=False)
    is_tech_faculty = sample_df['Faculty'].str.contains(
        'Computer|Engineering', case=False, na=False)
    sample_df['Cert_Faculty_Match'] = (is_tech_cert & is_tech_faculty).astype(int)
    sample_df['Job_Cert_Alignment'] = (
        sample_df['Jop_Category'] == sample_df['Certification_Field']
    ).astype(int)
    sample_df['Is_Egypt'] = (sample_df['Company_location'] == 'Egypt').astype(int)

    # Derive Experience_Level from the job title with the same keyword rules and
    # 'Junior' default used in training. This was previously hard-coded to
    # 'Mid', so inference saw a value that did not match the training feature.
    sample_df['Experience_Level'] = 'Junior'
    for level, keywords in experience_mapping.items():
        for keyword in keywords:
            mask = sample_df['Jop_Title'].str.contains(keyword, case=False, na=False)
            sample_df.loc[mask, 'Experience_Level'] = level

    # Encode categorical variables with the encoders fitted during training.
    for col in sample_df.columns:
        if col in label_encoders and sample_df[col].dtype == 'object':
            try:
                sample_df[col] = label_encoders[col].transform(sample_df[col].astype(str))
            except ValueError:
                # Best-effort fallback for categories never seen in training:
                # code 0 is used, which aliases the value to whichever known
                # category the encoder mapped to 0.
                sample_df[col] = 0

    # Re-order to the training column order, then keep only the selected features.
    sample_selected = feat_selector.transform(sample_df[available_features])

    # Make prediction and decode it back to the original track name.
    prediction = model.predict(sample_selected)[0]
    predicted_track = target_encoder.inverse_transform([prediction])[0]

    # Class probabilities, ordered like the model's classes.
    probabilities = model.predict_proba(sample_selected)[0]

    return predicted_track, probabilities

# Smoke-test the saved pipeline on one hand-written student record.
sample_student = dict(
    Gender='M',
    Faculty='Computer Science',
    Certification_Field='Machine Learning',
    Certification_Soutce='Coursera',
    Company_Type='International',
    Company_location='Cairo',
    Jop_Category='Data Analysis',
    Jop_Title='Data Scientist',
)

predicted_track, probabilities = predict_track(sample_student)
print(f"Predicted Track: {predicted_track}")

# Pair every class name with its predicted probability for display.
class_probabilities = {label: prob for label, prob in zip(le_target.classes_, probabilities)}
print(f"Prediction Probabilities: {class_probabilities}")

# Closing summary of the run.
print("\n=== PIPELINE COMPLETED SUCCESSFULLY ===")
summary_lines = (
    f"Final model accuracy: {final_accuracy:.4f}",
    f"Number of features used: {len(selected_features)}",
    f"Dataset size after augmentation: {len(X_resampled)}",
)
for summary_line in summary_lines:
    print(summary_line)



Collecting pandas
  Using cached pandas-2.3.1-cp311-cp311-win_amd64.whl (11.3 MB)
Collecting numpy
  Using cached numpy-2.3.1-cp311-cp311-win_amd64.whl (13.0 MB)
Collecting matplotlib
  Using cached matplotlib-3.10.3-cp311-cp311-win_amd64.whl (8.1 MB)
Collecting seaborn
  Using cached seaborn-0.13.2-py3-none-any.whl (294 kB)
Collecting scikit-learn
  Using cached scikit_learn-1.7.0-cp311-cp311-win_amd64.whl (10.7 MB)
Collecting imbalanced-learn
  Using cached imbalanced_learn-0.13.0-py3-none-any.whl (238 kB)
Collecting pytz>=2020.1 (from pandas)
  Using cached pytz-2025.2-py2.py3-none-any.whl (509 kB)
Collecting tzdata>=2022.7 (from pandas)
  Using cached tzdata-2025.2-py2.py3-none-any.whl (347 kB)
Collecting contourpy>=1.0.1 (from matplotlib)
  Using cached contourpy-1.3.2-cp311-cp311-win_amd64.whl (222 kB)
Collecting cycler>=0.10 (from matplotlib)
  Using cached cycler-0.12.1-py3-none-any.whl (8.3 kB)
Collecting fonttools>=4.22.0 (from matplotlib)
  Using cached fonttools-4.58.5-cp31


[notice] A new release of pip is available: 23.1.2 -> 25.1.1
[notice] To update, run: python.exe -m pip install --upgrade pip



     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.9 kB/s eta 0:10:12
     -------------                           13.5/38.6 MB 40.8 kB/s eta 0:10:14
     -------------                           13.5/38.6 MB 40.8 kB/s eta 0:10:14
     -------------                     