<a href="https://colab.research.google.com/github/SimeonHristov99/ML_21-22/blob/main/pca.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Interview Success Prediction Challenge
- Goal: Predict whether a candidate will pass an interview.
- Data: https://www.kaggle.com/datasets/vingkan/strategeion-resume-skills
- Type: Binary Classification
> **Note**: This is a synthetic dataset.

## Imports and Constants

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC

from sklearn.metrics import log_loss

from sklearn.tree import plot_tree
from sklearn.metrics import f1_score

In [None]:
# Default (width, height) in inches applied to every matplotlib figure.
FIG_SIZE = (12, 10)
# URL of the CSV file that is read into `df` below.
DATA_PATH = 'https://raw.githubusercontent.com/SimeonHristov99/ML_21-22/main/Week_10%20-%20Unsupervised%20Learning.%20Dimensionality%20Reduction/data/resumes_development.csv'

plt.rc('figure', figsize=FIG_SIZE)
# Use the fully qualified option name. The abbreviated 'max_columns' pattern
# is ambiguous in recent pandas versions (it also matches
# 'styler.render.max_columns') and raises OptionError.
# None means "never truncate columns when displaying a DataFrame".
pd.set_option('display.max_columns', None)

## Get the data

In [None]:
# Download the CSV at DATA_PATH into a DataFrame and display it.
df = pd.read_csv(DATA_PATH)
df

## Exploratory Data Analysis

Since this is a synthetic dataset, we should not draw conclusions when exploring it.

In [None]:
# Print dtype and non-null count for every column; verbose=True prevents the
# per-column listing from being collapsed into a short summary on wide frames.
df.info(verbose=True, show_counts=True)

In [None]:
# Descriptive statistics (count, mean, std, quartiles, ...) per column.
df.describe()

In [None]:
# Sum of the per-column fractions of missing values.
# The result is 0 exactly when the frame contains no missing values at all.
df.isna().mean().sum()

In [None]:
# Class balance of the target: how many rows fall into each `Interview` value.
df['Interview'].value_counts()

In [None]:
# One line per column: its name, the number of distinct values it takes and,
# for low-cardinality columns only, the distinct values themselves.
for column_name in df.columns:
  distinct_values = df[column_name].unique()
  n_distinct = len(distinct_values)
  # Listing the values of high-cardinality columns would flood the output.
  shown_values = distinct_values if n_distinct < 15 else ""

  print(f'{column_name:40} | {n_distinct:5} | {shown_values}')

In [None]:
# For a few hand-picked columns, compare how their values are distributed
# across the `Interview` outcomes. One figure with two panels per column.
for col in ['Unix', 'SQL', 'Leadership', 'AutoCAD']:
  _, (ax_dodge, ax_fill) = plt.subplots(1, 2)

  # NOTE(review): ticks are pinned at 1 and 2 - confirm these are the values
  # the plotted columns actually take.
  ax_dodge.set_xticks([1, 2])
  ax_fill.set_xticks([1, 2])

  # Left panel: bars of each outcome side by side.
  sns.histplot(data=df, x=col, hue="Interview", stat="probability", multiple="dodge", ax=ax_dodge)
  # Right panel: bars stacked and normalised so every bin fills the full height.
  sns.histplot(data=df, x=col, hue="Interview", stat="probability", multiple="fill", ax=ax_fill)

  # plt.show() works on every backend and matches the rest of this notebook;
  # Figure.show() merely emits a warning on non-GUI (inline) backends.
  plt.show()

## Preprocessing

In [None]:
def preprocess_inputs(df, test_size=0.3, random_state=None):
  """Split `df` into standardised train/test features and their targets.

  Args:
    df: DataFrame with an 'Unnamed: 0' ID column and an 'Interview' target
      column; all other columns are used as features. Not modified.
    test_size: Forwarded to `train_test_split`. The default of 0.3 matches
      the value that used to be hard-coded here.
    random_state: Forwarded to `train_test_split`. With the default `None`
      every call produces a different split; pass an int to make the split
      (and everything trained on it) reproducible.

  Returns:
    `(X_train, X_test, y_train, y_test)`. The feature frames keep their
    original index and column labels; the targets are the matching
    'Interview' values.

  Raises:
    KeyError: If 'Unnamed: 0' or 'Interview' is missing from `df`.
  """
  # Separate the target from the features and discard the ID column.
  # `drop` returns a new frame, so the caller's `df` is left untouched.
  y = df['Interview']
  X = df.drop(['Unnamed: 0', 'Interview'], axis=1)

  # Train-test split
  X_train, X_test, y_train, y_test = train_test_split(
      X, y, test_size=test_size, random_state=random_state)

  # Learn the scaling statistics from the training split only and apply the
  # same transformation to both splits, so nothing about the test split
  # leaks into preprocessing.
  scaler = StandardScaler().fit(X_train)
  X_train = pd.DataFrame(scaler.transform(X_train), index=X_train.index, columns=X_train.columns)
  X_test = pd.DataFrame(scaler.transform(X_test), index=X_test.index, columns=X_test.columns)

  return X_train, X_test, y_train, y_test

In [None]:
# Build the scaled train/test splits used by every model below and display
# the training features.
X_train, X_test, y_train, y_test = preprocess_inputs(df)
X_train

In [None]:
# Class balance of the target within the training split.
y_train.value_counts()

## Model building without dimensionality reduction

In [None]:
# Candidate classifiers, all with default hyperparameters.
# The keys are left-padded to equal width so the printed report lines align.
# SVC needs probability=True to expose predict_proba, which the log-loss
# computation below calls.
models = {
    '      Logistic Regression': LogisticRegression(),
    ' Decision Tree Classifier': DecisionTreeClassifier(),
    ' Random Forest Classifier': RandomForestClassifier(),
    'Support Vector Classifier': SVC(probability=True)
}

In [None]:
# Fit each candidate on the scaled training data, reporting progress as we go.
for label, estimator in models.items():
  estimator.fit(X_train, y_train)
  print(f'{label} trained!')

In [None]:
# Report test-set accuracy, log loss and F1 score for every fitted model.
for label, estimator in models.items():
  accuracy = estimator.score(X_test, y_test)
  # Log loss is computed from predicted probabilities, F1 from hard labels.
  loss = log_loss(y_test, estimator.predict_proba(X_test))
  f1 = f1_score(y_test, estimator.predict(X_test))
  print(f'{label} | accuracy: {accuracy:.4f} | log loss: {loss:.4f} | f1: {f1:.4f}')

## Model building with dimensionality reduction

- Using all features and just switching the perspective

In [None]:
# Number of principal components to keep.
# NOTE(review): 222 is presumably the number of feature columns ("using all
# features") - confirm against X_train.shape[1].
n_components = 222

# The projection is learned from the training split only and then applied
# to both splits.
pca = PCA(n_components=n_components).fit(X_train)

# Column labels 'PC 1' ... 'PC <n_components>', shared by both frames.
pc_labels = [f'PC {i}' for i in range(1, n_components + 1)]

X_train_reduced = pd.DataFrame(
    pca.transform(X_train), index=X_train.index, columns=pc_labels)
X_test_reduced = pd.DataFrame(
    pca.transform(X_test), index=X_test.index, columns=pc_labels)

In [None]:
# Display the training features expressed in principal-component coordinates.
X_train_reduced

Notice how most of the variance is contained in the first few principal components.

In [None]:
# Amount of variance explained by each of the fitted components.
pca.explained_variance_

In [None]:
# The same information expressed as a fraction of the total variance.
pca.explained_variance_ratio_

In [None]:
# Fresh, unfitted estimators with the same padded labels as before, this
# time trained on the PCA-transformed features.
models = {
    '      Logistic Regression': LogisticRegression(),
    ' Decision Tree Classifier': DecisionTreeClassifier(),
    ' Random Forest Classifier': RandomForestClassifier(),
    'Support Vector Classifier': SVC(probability=True)
}

# Fit everything first ...
for label, estimator in models.items():
  estimator.fit(X_train_reduced, y_train)
  print(f'{label} trained!')

# ... then report accuracy, log loss and F1 on the transformed test split.
for label, estimator in models.items():
  accuracy = estimator.score(X_test_reduced, y_test)
  loss = log_loss(y_test, estimator.predict_proba(X_test_reduced))
  f1 = f1_score(y_test, estimator.predict(X_test_reduced))
  print(f'{label} | accuracy: {accuracy:.4f} | log loss: {loss:7.4f} | f1: {f1:.4f}')

The idea now is to drop the components with little variance and keep only the first few, as they hold most of the information.

In [None]:
# Keep only the first two principal components.
n_components = 2

# As before, the projection is learned from the training split only.
pca = PCA(n_components=n_components).fit(X_train)

# Column labels 'PC 1' and 'PC 2', shared by both frames.
pc_labels = [f'PC {i}' for i in range(1, n_components + 1)]

X_train_reduced = pd.DataFrame(
    pca.transform(X_train), index=X_train.index, columns=pc_labels)
X_test_reduced = pd.DataFrame(
    pca.transform(X_test), index=X_test.index, columns=pc_labels)

# Fresh, unfitted estimators trained on the two-component representation.
models = {
    '      Logistic Regression': LogisticRegression(),
    ' Decision Tree Classifier': DecisionTreeClassifier(),
    ' Random Forest Classifier': RandomForestClassifier(),
    'Support Vector Classifier': SVC(probability=True)
}

for label, estimator in models.items():
  estimator.fit(X_train_reduced, y_train)
  print(f'{label} trained!')

# Report accuracy, log loss and F1 on the transformed test split.
for label, estimator in models.items():
  accuracy = estimator.score(X_test_reduced, y_test)
  loss = log_loss(y_test, estimator.predict_proba(X_test_reduced))
  f1 = f1_score(y_test, estimator.predict(X_test_reduced))
  print(f'{label} | accuracy: {accuracy:.4f} | log loss: {loss:7.4f} | f1: {f1:.4f}')

In [None]:
# Test samples in the plane of the first two principal components,
# coloured by their true `Interview` label.
first_pc = X_test_reduced['PC 1']
second_pc = X_test_reduced['PC 2']
ax = sns.scatterplot(x=first_pc, y=second_pc, hue=y_test)
plt.show()

In [None]:
# Draw the structure of the decision tree held in `models`. When the cells are
# run in order, this is the tree fitted on the two principal components.
fitted_tree = models.get(' Decision Tree Classifier')
plot_tree(fitted_tree, fontsize=10)
plt.show()

# For Home

Try out clustering and PCA on [this dataset](https://www.kaggle.com/datasets/uciml/aps-failure-at-scania-trucks-data-set).