# DD2421 Machine Learning: Programming Challenge

Explored to encapsulate the preprocessing and predict step into pipeline, the same process otherwise

In [1]:
import pandas as pd
import scipy.stats as stats
from sklearn.model_selection import StratifiedKFold, cross_val_score, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score, f1_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import (RandomForestClassifier, GradientBoostingClassifier, 
                               ExtraTreesClassifier, AdaBoostClassifier, BaggingClassifier)
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.linear_model import RidgeClassifierCV
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sb
import time

In [2]:
# Load Train and Test set
df = pd.read_csv('TrainOnMe_orig.csv')
test_df = pd.read_csv('EvaluateOnMe.csv')
df.head()

Unnamed: 0,y,x1,x2,x3,x4,x5,x6,x7,x8,x9,x10,x11,x12,x13
0,Jorgsuto,99.62776,-0.7473,-106.04085,-1.21694,199.59889,-88.84034,Slängpolskorgris,1.36603,2.62967,12.1771,-3.38007,True,445.11836
1,Andjorg,100.22296,0.85161,-98.91119,-1.23724,200.1924,-89.88662,Polkagris,-2.23901,0.6537,10.72266,-2.38136,True,451.65919
2,Andsuto,100.12711,0.22396,-96.58029,-1.18725,200.10113,-89.18542,Polkagris,0.38295,-0.58657,12.60082,-1.28045,True,452.3454
3,Jorgsuto,100.81843,0.0212,-102.78871,-1.26271,200.78442,-92.007,Polkagris,-0.28207,0.81744,13.5745,0.12083,True,452.6978
4,Jorgsuto,101.07489,1.18863,-101.29639,-1.23077,201.04424,-92.83613,Schottisgris,1.31717,0.69887,12.50238,-0.71997,True,454.72627


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 14 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   y       1000 non-null   object 
 1   x1      1000 non-null   float64
 2   x2      1000 non-null   float64
 3   x3      1000 non-null   float64
 4   x4      1000 non-null   float64
 5   x5      1000 non-null   float64
 6   x6      1000 non-null   float64
 7   x7      1000 non-null   object 
 8   x8      1000 non-null   float64
 9   x9      1000 non-null   float64
 10  x10     1000 non-null   float64
 11  x11     1000 non-null   float64
 12  x12     1000 non-null   bool   
 13  x13     1000 non-null   float64
dtypes: bool(1), float64(11), object(2)
memory usage: 102.7+ KB


In [4]:
# Divide the train set into features and labels
X = df.drop(['y'], axis=1)
y = df['y']

## Pipeline for dropping data

In [5]:
# Given the analysis in preprocesssing_classify_manually.ipynb,
# I directly decide to select the following features for preprocessing in pipeline
features_to_drop = ['x12', 'x5', 'x13']
categorical_features = ['x7']
numerical_features = [col for col in X.columns if col not in categorical_features + features_to_drop]


In [6]:
# Create a preprocessor
# The preprocessor standardizes the numerical features
# and encodes the categorical features with OneHotEncoder

preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), numerical_features),
        ('cat', OneHotEncoder(handle_unknown='ignore', drop='first'), categorical_features),
        ('drop', 'drop', features_to_drop)
    ],
    remainder='passthrough' # 保留其他未指定的列（如果有的话）
)

In [7]:
RANDOM_STATE = 42
# cross validation
n_splits = 10
skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=RANDOM_STATE)

# Divide the train set into train set and validation set
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE, stratify=y
)


In [8]:
# All the classifers to compare
classifiers = {
    "K-neighbours": KNeighborsClassifier(),
    "Decision tree": DecisionTreeClassifier(random_state=RANDOM_STATE),
    "Random forest": RandomForestClassifier(random_state=RANDOM_STATE),
    "Gradient Boosting": GradientBoostingClassifier(random_state=RANDOM_STATE),
    "Extremely random forest": ExtraTreesClassifier(random_state=RANDOM_STATE),
    "Adaboost": AdaBoostClassifier(random_state=RANDOM_STATE),
    "Bagging": BaggingClassifier(random_state=RANDOM_STATE),
    "MLP": MLPClassifier(max_iter=2000, hidden_layer_sizes=(20,20), random_state=RANDOM_STATE),
    "SVM (rbf)": SVC(random_state=RANDOM_STATE),
    "SVM (linear)": SVC(kernel="linear", random_state=RANDOM_STATE),
    "SVM (polynomial)": SVC(kernel="poly", random_state=RANDOM_STATE),
    "Ridge Classifier": RidgeClassifierCV()
}



In [9]:
results = []

for clf_name, clf in classifiers.items():
    print(f"Training: {clf_name}...")
    
    # The full pipeline for preprocessor and classifier
    pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                               ('classifier', clf)])
    
    # Directly cross-validate on the original train set,
    # given that Pipeline handles every step accordingly
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=skf, n_jobs=-1)
    
    # Train and Evaluate
    pipeline.fit(X_train, y_train)
    y_val_pred = pipeline.predict(X_val)
    
    # Using accuracy and f1 score
    val_acc = accuracy_score(y_val, y_val_pred)
    val_f1 = f1_score(y_val, y_val_pred, average='weighted')
    
    results.append({
        'Classifier': clf_name,
        'CV_Mean': cv_scores.mean(),
        'CV_Std': cv_scores.std(),
        'Val_Accuracy': val_acc,
        'Val_F1': val_f1
    })

results_df = pd.DataFrame(results).sort_values('Val_Accuracy', ascending=False)
print("\n" + "="*70)
print(results_df.to_string(index=False))


Training: K-neighbours...
Training: Decision tree...
Training: Random forest...
Training: Gradient Boosting...
Training: Extremely random forest...
Training: Adaboost...
Training: Bagging...
Training: MLP...
Training: SVM (rbf)...
Training: SVM (linear)...
Training: SVM (polynomial)...
Training: Ridge Classifier...

             Classifier  CV_Mean   CV_Std  Val_Accuracy   Val_F1
      Gradient Boosting  0.86125 0.025890         0.840 0.832099
Extremely random forest  0.84125 0.020194         0.820 0.803224
          Random forest  0.87125 0.030644         0.810 0.801870
              SVM (rbf)  0.83375 0.018583         0.810 0.798370
           K-neighbours  0.75875 0.024399         0.800 0.775220
                Bagging  0.85000 0.021651         0.790 0.782729
           SVM (linear)  0.72500 0.037081         0.785 0.764494
                    MLP  0.79375 0.040408         0.780 0.775374
       SVM (polynomial)  0.76875 0.024527         0.775 0.731235
               Adaboost  0.78250

In [10]:
# Choose the best classifer
best_model_name = results_df.iloc[0]['Classifier']
best_classifier = classifiers[best_model_name]

print(f"\n Best Model: {best_model_name}")


 Best Model: Gradient Boosting


In [None]:
# Create the final pipeline with the best classifier
final_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                 ('classifier', best_classifier)])

# Train the final pipeline with full train set
final_pipeline.fit(X, y)

# Directly predict on the original test set
y_test_pred = final_pipeline.predict(test_df)

# Save the results into .txt
# The result is the same as y_pred.txt
np.savetxt("y_pred_pipeline.txt", y_test_pred, fmt='%s')

print("\nPrediction on EvaluateOnMe.csv is complete and saved to y_pred_pipeline.txt.")


Prediction on EvaluateOnMe.csv is complete and saved to y_pred_pipeline.txt.
