In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

df_train = pd.read_csv('./Suspects/Train_case_files80.csv')
df_test = pd.read_csv('./Suspects/New_suspects.csv')
df_validation = pd.read_csv('./Suspects/Validation_case_files20.csv')

In [2]:
df_validation = pd.read_csv('./Suspects/Validation_case_files20.csv')

suspect_truths = df_validation["suspect"]
print(df_validation["suspect"])

with open("Suspects/Validation_truths.txt", "w") as f:
    for truth in suspect_truths:
        f.write(f"{truth}\n")

0     innocent
1     innocent
2     innocent
3     innocent
4     innocent
        ...   
75    innocent
76    innocent
77    innocent
78      guilty
79      guilty
Name: suspect, Length: 80, dtype: object


In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

def evaluate_predictions(truth_file, predictions_file, positive_class='innocent', negative_class='guilty'):
    ground_truth = np.loadtxt(truth_file, dtype=str)
    predictions = np.loadtxt(predictions_file, dtype=str)

    # Compute confusion matrix values
    TP = sum((t == positive_class and p == positive_class) for t, p in zip(ground_truth, predictions))
    TN = sum((t == negative_class and p == negative_class) for t, p in zip(ground_truth, predictions))
    FP = sum((t == negative_class and p == positive_class) for t, p in zip(ground_truth, predictions))
    FN = sum((t == positive_class and p == negative_class) for t, p in zip(ground_truth, predictions))

    # Metrics for negative class
    Precision_1 = TN / (TN + FN)
    Recall_1 = TN / (TN + FP)
    F1_1 = 2 * (Precision_1 * Recall_1) / (Precision_1 + Recall_1)
    total_1 = sum(ground_truth == negative_class)

    # Metrics for positive class
    Precision_2 = TP / (TP + FP)
    Recall_2 = TP / (TP + FN)
    F1_2 = 2 * (Precision_2 * Recall_2) / (Precision_2 + Recall_2)
    total_2 = sum(ground_truth == positive_class)

    Accuracy = (TP + TN) / (TP + FP + FN + TN)
    total_3 = total_1 + total_2

    # Create and print metrics table
    data = {
        'Class': [negative_class, positive_class],
        'Precision': [Precision_1, Precision_2],
        'Recall': [Recall_1, Recall_2],
        'F1-Score': [F1_1, F1_2],
        'total': [total_1, total_2]
    }
    df = pd.DataFrame(data)
    print(df.round(2))
    print('accuracy', round(Accuracy, 2), 'total samples:', total_3)

    # Create and display confusion matrix
    conf_matrix = np.array([[TN, FP],
                            [FN, TP]])
    print(conf_matrix)

    plt.figure(figsize=(5, 4))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='viridis',
                xticklabels=[negative_class, positive_class],
                yticklabels=[negative_class, positive_class])
    plt.title("Confusion Matrix")
    plt.xlabel("Predicted")
    plt.ylabel("Actual")
    plt.tight_layout()
    plt.show()


In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

def preprocess_with_vif(df, target_col="suspect", corr_threshold=0.9, vif_threshold=5.0):
    """
    Preprocess a DataFrame by removing highly correlated features and features with high VIF.
    
    Args:
        df (pd.DataFrame): Input DataFrame with numeric features and target column.
        target_col (str): Name of the target column to keep.
        corr_threshold (float): Correlation threshold for removing features.
        vif_threshold (float): VIF threshold for removing features.
        
    Returns:
        reduced_df (pd.DataFrame): Preprocessed DataFrame with reduced features.
        removed_features (dict): Dictionary with lists of removed columns:
            {"correlation": [...], "vif": [...]}
    """
    # Copy DataFrame to avoid modifying original
    df_copy = df.copy()
    
    # Keep target column separate
    y = df_copy[target_col]
    X = df_copy.drop(columns=[target_col])
    
    # Select numeric columns only
    numeric_cols = X.select_dtypes(include="number").columns
    X_numeric = X[numeric_cols]
    
    # Standardize features
    scaler = StandardScaler()
    X_scaled = pd.DataFrame(scaler.fit_transform(X_numeric), columns=numeric_cols)
    
    # --- Step 1: Remove highly correlated features ---
    corr_matrix = X_scaled.corr().abs()
    upper_tri = corr_matrix.where(np.triu(np.ones(corr_matrix.shape), k=1).astype(bool))
    to_drop_corr = [col for col in upper_tri.columns if any(upper_tri[col] > corr_threshold)]
    
    X_reduced = X_scaled.drop(columns=to_drop_corr)
    
    # --- Step 2: Remove high VIF features ---
    X_with_const = add_constant(X_reduced)
    vif_data = pd.DataFrame()
    vif_data["feature"] = X_with_const.columns
    vif_data["VIF"] = [variance_inflation_factor(X_with_const.values, i) for i in range(X_with_const.shape[1])]
    vif_data["Tolerance"] = 1 / vif_data["VIF"]
    
    # Remove features with VIF above threshold (ignore the constant column)
    to_drop_vif = vif_data[(vif_data["VIF"] > vif_threshold) & (vif_data["feature"] != "const")]["feature"].tolist()
    
    X_final = X_reduced.drop(columns=to_drop_vif)
    
    # Reattach target column
    reduced_df = X_final.copy()
    reduced_df[target_col] = y.values
    
    # Collect removed features
    removed_features = {"correlation": to_drop_corr, "vif": to_drop_vif}
    
    return reduced_df, removed_features



# Run preprocessing on train set
df_train_reduced, removed_features = preprocess_with_vif(df_train)

# Collect all features to drop (from correlation + VIF)
to_drop = removed_features["correlation"] + removed_features["vif"]

# Apply the same drops to test set
df_test_reduced = df_test.drop(columns=to_drop, errors="ignore")

print("Train set after preprocessing:\n", df_train_reduced.head())
print("Test set after preprocessing:\n", df_test_reduced.head())

Train set after preprocessing:
    Deception Quotient  Reflex Score  Weapon Proficiency  Motor Control  \
0           -0.253842      0.612280            0.005993       0.947432   
1           -0.310706      0.662190           -0.630926      -0.428404   
2            0.208183     -0.754502           -0.426549      -0.734912   
3           -0.151959      0.497101           -0.553866      -0.378106   
4            0.224769      0.186120           -0.539459      -1.180874   

   Outdoor Skills  Reaction Time   Balance   suspect  
0        2.490677       1.046673 -0.285306  innocent  
1       -0.500369      -0.677356 -0.296015  innocent  
2       -0.694562       0.269002 -0.661435  innocent  
3        0.508186      -0.160114  0.553956  innocent  
4       -1.440346      -1.109268 -0.079172    guilty  
Test set after preprocessing:
    Deception Quotient  Reflex Score  Weapon Proficiency  Motor Control  \
0               20.38        0.2597              0.4956         1.1560   
1             

In [5]:
import sys

from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score

# Extract data from files
train_df = df_train.copy()
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

valid_df = df_validation.copy()
y_valid = valid_df.iloc[:, -1]
X_valid = valid_df[X_train.columns]

test_df = df_test.copy()
y_test = test_df.iloc[:, -1]
X_test = test_df[X_train.columns]

# Train model
clf = RandomForestClassifier(n_estimators=100,
                             max_depth=5,
                             max_features='sqrt',
                             max_leaf_nodes=12,
                             random_state=42)
clf.fit(X_train, y_train)

# Predict
y_valid_pred = clf.predict(X_valid)



# Save predictions to Tree.txt
f1 = f1_score(y_valid, y_valid_pred, average='weighted')
print(y_valid_pred)
with open("Suspects/Tree_predictions_validate.txt", "w") as f:
    for value in y_valid_pred:
        f.write(f"{value}\n")

# Evaluate on validation set
scores = cross_val_score(clf, X_valid, y_valid, cv=5)
print(f"Mean accuracy: {scores.mean():.3f}, Std: {scores.std():.3f}")
evaluate_predictions("Suspects/Validation_truths.txt", "Suspects/Tree_predictions_validate.txt")

# Display data on plot_tree graph
plt.figure(figsize=(20, 10))
tree.plot_tree(clf.estimators_[0], filled=True, feature_names=X_train.columns, class_names=clf.classes_)
plt.show()


['guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty'
 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty' 'guilty']
Mean accuracy: 0.887, Std: 0.047


ZeroDivisionError: division by zero

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.neighbors import KNeighborsClassifier


# Extract data from files
train_df = df_train.copy()
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

valid_df = df_validation.copy()
y_valid = valid_df.iloc[:, -1]
X_valid = valid_df[X_train.columns]


# Precision % according to the count of k-value
k_values = range(1, 10)
f1_scores = []
for k in k_values :
            knn = KNeighborsClassifier(n_neighbors=k)
            knn.fit(X_train, y_train)
            y_pred = knn.predict(X_valid)
            acc = accuracy_score(y_valid, y_pred)
            f1_scores.append(acc)

max_f1 = max(f1_scores)
min_f1 = min(f1_scores)
best_k = k_values[f1_scores.index(max_f1)]
worst_k = k_values[f1_scores.index(min_f1)]
print (f"Best k = {best_k}; f1_score = {max_f1}")
print (f"Worst k = {worst_k}; f1_score = {min_f1}")
plt.plot(k_values, f1_scores)
plt.ylabel("accuracy")
plt.xlabel("k values")
plt.show()

# Predict using best k values
k = best_k
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
prediction_best = knn.predict(X_valid)

# Predict using worst k values
k = worst_k
knn = KNeighborsClassifier(n_neighbors=k)
knn.fit(X_train, y_train)
prediction_worst = knn.predict(X_valid)

# Save predictions in a file
with open("Suspects/KNN_best_validate.txt", "w") as f:
        for value in prediction_best:
            f.write(f"{value}\n")

with open("Suspects/KNN_worst_validate.txt", "w") as f:
        for value in prediction_worst:
            f.write(f"{value}\n")

# Evaluate
print("Using k value: ", best_k)
evaluate_predictions("Suspects/Validation_truths.txt", "Suspects/KNN_best_validate.txt")
print("Using k value: ", worst_k)
evaluate_predictions("Suspects/Validation_truths.txt", "Suspects/KNN_worst_validate.txt")



In [None]:
from sklearn.ensemble import VotingClassifier, RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score

# Extract data from files
train_df = df_train.copy()
X_train = train_df.iloc[:, :-1]
y_train = train_df.iloc[:, -1]

valid_df = df_validation.copy()
y_valid = valid_df.iloc[:, -1]
X_valid = valid_df[X_train.columns]

# Define classifiers
clf1 = KNeighborsClassifier(n_neighbors=3)
clf2 = RandomForestClassifier(n_estimators=100, random_state=42)
clf3 = LogisticRegression(max_iter=3000)

#Create a voting classifier
voting_clf = VotingClassifier(estimators=[
    ('knn', clf1),
    ('rf', clf2),
    ('lr', clf3)
], voting='soft')

# Train model
voting_clf.fit(X_train, y_train)

# Evaluate on validation set
y_pred_val = voting_clf.predict(X_valid)
f1 = f1_score(y_valid, y_pred_val, pos_label='innocent')
with open("Suspects/Voting_predictions_validate.txt", "w") as f:
    for label in y_pred_val:
        f.write(f"{label}\n")
evaluate_predictions("Suspects/Validation_truths.txt", "Suspects/Voting_predictions_validate.txt")
print(f"F1-Score on validation set: {f1:.4f}")

In [None]:

import matplotlib.pyplot as plt
import seaborn as sns

y_pred_val_test = voting_clf.predict(X_test)
df_test = pd.read_csv('./Suspects/New_suspects.csv')
df_test_predicted = df_test.copy()
df_test_predicted['suspect'] = y_pred_val_test

# Select only numeric columns
numeric_cols = df_test_predicted.select_dtypes(include='number').columns

# Set up the grid size
num_items = len(numeric_cols)
cols = 5
rows = (num_items + cols - 1) // cols

fig, axes = plt.subplots(rows, cols, figsize=(3 * cols, 2 * rows))
axes = axes.flatten()

for i, col in enumerate(numeric_cols):
    sns.histplot(data=df_test,
                 x=col,
                 bins=50,
                 ax=axes[i],
                 kde=False,
                 color='green',
                 alpha=0.4)   
    axes[i].set_title(f'{col}')
    axes[i].set_xlabel(' ')
    axes[i].set_ylabel(' ')
    axes[i].legend(['Knight'], loc='upper right', fontsize=6)

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    axes[j].set_visible(False)

plt.tight_layout()
plt.show()

# Create subplots
fig, axes = plt.subplots(rows, cols, figsize=(3 * cols, 2 * rows))
axes = axes.flatten()  # Flatten 2D array of axes to 1D for easy indexing

# Plot each numeric column
for i, col in enumerate(numeric_cols):
    sns.histplot(data=df_test_predicted,
                 x=col,
                 bins=50,
                 ax=axes[i],
                 hue='suspect',
                 palette=['blue', 'red'],
                 kde=False,
                 alpha=0.4)   
    axes[i].set_title(f'{col}')
    axes[i].set_xlabel(' ')
    axes[i].set_ylabel(' ')
    axes[i].legend(['Guilty', 'Innocent'], loc='upper right', fontsize=6)

# Hide any unused subplots
for j in range(i + 1, len(axes)):
    axes[j].set_visible(False)

plt.tight_layout()
plt.show()

plt.figure(figsize=(12, 8))
plt.subplots_adjust(hspace=0.5)

plt.subplot(2,2,1)
sns.scatterplot(data=df_test_predicted, x='Dominance Score', y='Substance History', hue='suspect', alpha=0.6, palette={'innocent': 'blue', 'guilty': 'red'})
plt.legend()
plt.title("Dominance Score vs Substance History by Suspects Status")

plt.subplot(2,2,2)
sns.scatterplot(data=df_test_predicted, x='Outdoor Skills', y='Motor Control', hue='suspect', alpha=0.6, palette={'innocent': 'blue', 'guilty': 'red'})
plt.legend()
plt.title("Outdoor Skills vs Motor Control by Suspects Status")


plt.subplot(2,2,3)
sns.scatterplot(data=df_test, x='Dominance Score', y='Substance History', alpha=0.6, color='green')
plt.legend(['Suspects'], loc='upper left', fontsize=12)
plt.title("Dominance Score vs Substance History for Suspects")

plt.subplot(2,2,4)
sns.scatterplot(data=df_test, x='Outdoor Skills', y='Motor Control', alpha=0.6, color='green')
plt.legend(['Suspects'], loc='upper left', fontsize=12)
plt.title("Outdoor Skills vs Motor Control for Suspects")
plt.show()


## Part II — The Case Gets Messier

I lit another cigarette of logic and leaned over the scattered remnants of yesterday's code. The dataset had teeth and I knew how to read bite marks — patterns, outliers, the little lies that whispered the truth if you listened close enough.

This notebook keeps the same rhythm: short, blunt explanations, code that works, and the kind of commentary a tired detective would scribble in the margins. We'll make a tidy, reproducible pass: synthesize a small dataset to experiment with, fit a simple model, inspect failures, and close the case — for now.

Let's start with something concrete: a compact, realistic dataset (no external downloads), a quick feature-engineering step, a model that behaves, and a frank post-mortem that doesn't sugarcoat the results.


In [None]:
# Create a small synthetic dataset that feels "real" — noisy, slightly biased, with an outlier or two.
import numpy as np
import pandas as pd

rng = np.random.default_rng(42)
n = 300
age = rng.integers(18, 70, size=n)
experience = np.clip((age - 18) * rng.uniform(0.5, 1.2, size=n), 0, None).round(1)
hours_per_week = rng.normal(40, 8, size=n).clip(10, 80).round(1)

# target: "case_success_score" — higher is better
base = 0.5 * experience + 0.3 * hours_per_week - 0.2 * age
noise = rng.normal(0, 8, size=n)
case_success_score = (base + noise).round(2)

# introduce a systematic bias: applicants from 'District X' slightly outperform
district = rng.choice(['North', 'South', 'East', 'West', 'District X'], size=n, p=[0.2,0.2,0.2,0.2,0.2])
case_success_score += (district == 'District X') * 5

df = pd.DataFrame({
    'age': age,
    'experience': experience,
    'hours_per_week': hours_per_week,
    'district': district,
    'case_success_score': case_success_score
})

# add a clear outlier
df.loc[df.sample(1, random_state=7).index, 'case_success_score'] += 80

df.head()


In [None]:
# Quick EDA: summary stats and a scatter plot to show relationships
import matplotlib.pyplot as plt

display(df.describe(include='all'))

plt.figure(figsize=(8,5))
plt.scatter(df['experience'], df['case_success_score'])
plt.xlabel('Experience (approx years)')
plt.ylabel('Case Success Score')
plt.title('Experience vs Case Success Score — messy but telling')
plt.grid(True)
plt.show()


In [None]:
# Fit a simple model and inspect performance — the kind of model you'd trust in a pinch.
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score, mean_squared_error

X = df.drop(columns=['case_success_score'])
y = df['case_success_score']

# Preprocessing: one-hot for district
preprocessor = ColumnTransformer([
    ('ohe', OneHotEncoder(sparse=False, drop='first'), ['district'])
], remainder='passthrough')

pipe = Pipeline([
    ('pre', preprocessor),
    ('lr', LinearRegression())
])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
pipe.fit(X_train, y_train)

y_pred = pipe.predict(X_test)
r2 = r2_score(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred, squared=False)

print(f"R² on test set: {r2:.3f}")
print(f"RMSE on test set: {rmse:.3f}")

# Show a few predictions vs truth for inspection
pd.DataFrame({
    'truth': y_test.values,
    'pred': y_pred.round(2),
    'residual': (y_test.values - y_pred).round(2)
}).reset_index(drop=True).head(10)


### Post-mortem — The Truth in Plain English

The model did its best with the scraps we gave it. R² and RMSE tell a story: not a knockout, but not a con either. In the field — in the alleyways of production — you don't care for fancy metrics as much as you care for robustness and the ability to explain yourself to someone who can hit 'retrain' tomorrow and expect similar behavior.

Observations & next steps:
- The synthetic dataset had a clear bias from *District X*. If that's real data, document it and be ready to justify the source.
- Outliers matter. That one extreme score skews things. Consider robust scalers, trimming, or a separate treatment for anomalies.
- Feature engineering: interaction terms between experience and hours could matter, or nonlinear transforms. Try a small tree-based model next if linearity feels strained.
- Keep the narrative. When you explain the model to stakeholders, a plain-speech detective voice — honest, blunt, and precise — often lands better than polished but hollow reports.

And like any good case file, I'll tape this one closed for now, but leave the evidence tagged and the questions numbered. If you want, I can:
- run a small grid search for regularization,
- replace the synthetic data with a real CSV from your project directory,
- or convert these steps into a reusable script or function for your pipeline.

Tell me which you'd prefer and I'll carry on — in the same tired, stubborn tone.
