In [4]:
# Debug switch read by the `if DEBUG:` blocks in later cells; when True they
# print intermediate data frames and predictions.
# NOTE(review): the next cell reassigns DEBUG, so on a top-to-bottom run this
# value is overridden before it is ever read.
DEBUG = True

In [78]:
# Overrides the DEBUG = True assignment in the preceding cell: with DEBUG set
# to False, the `if DEBUG:` blocks in later cells are skipped.
DEBUG = False

# Introduction
- Models developed: 
    - SVM, Decision Tree (Ray)
    - KNN, Neural Network (Jamie)

# Feature Extraction

In [24]:
import pandas as pd
import matplotlib.pyplot as plt

from PIL import Image
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, ConfusionMatrixDisplay
from skimage import io, color
from skimage.feature import local_binary_pattern

In [None]:
# Load the training inputs: the metadata table (indexed by its "id" column)
# followed by the three feature tables stored as CSV files.
train_metadata = pd.read_csv('Data/train/train_metadata.csv', index_col="id")
train_color_hist, train_hog_pca, train_additional_features = (
    pd.read_csv(feature_csv)
    for feature_csv in (
        'Data/train/Features/color_histogram.csv',
        'Data/train/Features/hog_pca.csv',
        'Data/train/Features/additional_features.csv',
    )
)

In [5]:
# Feature tables that feed the models: HOG-PCA and the additional features
# (the colour histogram table is loaded but not part of this selection).
# Each table carries its own 'image_path' column, dropped here so it does not
# appear twice after the tables are joined.
train_data = [
    feature_table.drop(columns='image_path')
    for feature_table in (train_hog_pca, train_additional_features)
]

# Place the tables side by side and label the rows with the metadata ids.
# NOTE(review): this relies on the feature CSVs listing their rows in the same
# order as train_metadata — confirm against the feature-extraction step.
train_data_df = pd.concat(train_data, axis=1).set_axis(train_metadata.index, axis=0)

if DEBUG:
    print(train_data_df)

      hog_pca_0  hog_pca_1  hog_pca_2  hog_pca_3  hog_pca_4  hog_pca_5  \
id                                                                       
7218  -0.763458   0.927880   0.264329  -0.638673   0.831455  -0.527469   
6333   1.049284   3.608200  -1.817190   0.904380  -0.282195   0.623239   
6867  -1.552440  -0.432374  -0.318422   0.671877  -0.381869  -0.224374   
7730  -1.556871   0.214406   0.973758   0.613876  -0.657341   0.007032   
6338  -0.944294  -0.334833   0.415215  -0.607014  -1.004900   0.172754   
...         ...        ...        ...        ...        ...        ...   
456   -0.562495   0.643567   0.254784  -1.583481   0.202211   1.504130   
4084  -1.370223  -0.398313   0.378050   0.168035  -0.212531  -0.341123   
5117  -1.951729  -1.117028  -1.187795   0.350751  -0.016377  -0.663806   
3334  -2.264590  -1.266415  -0.831067   0.579172  -0.497001  -0.237512   
2403   0.984535   3.178361  -2.070263   0.763265   0.371095   0.249411   

      hog_pca_6  hog_pca_7  hog_pca_8

#### Split the training data into train/test subsets for model evaluation

In [15]:
# Hold out 20% of the labelled training data for development-time evaluation.
# random_state pins the shuffle, so the split — and every accuracy computed
# from it — is identical on each Restart & Run All.
# NOTE: the split is not stratified; pass stratify=train_metadata["ClassId"]
# if the class proportions must match between the two subsets.
X_train, X_test, y_train, y_test = train_test_split(
    train_data_df,
    train_metadata["ClassId"],
    test_size=0.2,      # proportion of the data to include in the test split
    random_state=42,    # fixed seed for a reproducible split
)


if DEBUG:
    # The first index label of the features and of the labels are printed
    # together so they can be compared by eye.
    print(X_train.index[0])
    print(y_train.index[0])

    print(f"X_train: \n{X_train}")
    print(f"X_test: \n{X_test}")
    print(f"y_train: \n{y_train}")
    print(f"y_test: \n{y_test}")

#### Import Testing Data

In [16]:
# Load the test inputs: the metadata table (indexed by its "id" column)
# followed by the three feature tables stored as CSV files.
test_metadata = pd.read_csv('Data/test/test_metadata.csv', index_col="id")
test_color_hist, test_hog_pca, test_additional_features = (
    pd.read_csv(feature_csv)
    for feature_csv in (
        'Data/test/Features/color_histogram.csv',
        'Data/test/Features/hog_pca.csv',
        'Data/test/Features/additional_features.csv',
    )
)

In [217]:
# Same feature selection as used for training: HOG-PCA plus the additional
# features, each with its own 'image_path' column removed before joining.
test_data = [
    feature_table.drop(columns='image_path')
    for feature_table in (test_hog_pca, test_additional_features)
]

# Join the tables column-wise and label the rows with the test metadata ids.
# NOTE(review): this relies on the feature CSVs listing their rows in the same
# order as test_metadata — confirm against the feature-extraction step.
test_data_df = pd.concat(test_data, axis=1).set_axis(test_metadata.index, axis=0)

if DEBUG:
    print(test_data_df)

# ML Models

## SVM

In [218]:
# Fit a linear-kernel support vector classifier on the development training
# split. fit() returns the estimator itself, so construction and training are
# chained into one statement.
SVMModel = SVC(kernel="linear").fit(X_train, y_train)
SVMModel  # show the fitted estimator as the cell output

In [219]:
# Score the SVM on the held-out split and report accuracy as a percentage.
y_pred = SVMModel.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

if DEBUG:
    # predicted labels first, then the true labels, each on its own line(s)
    print(y_pred, y_test, sep="\n")


Accuracy: 81.33%


In [208]:
# display confusion matrix
# cm = confusion_matrix(y_test, y_pred)
# matrix = ConfusionMatrixDisplay(cm)
# matrix.plot()
# plt.gcf().set_size_inches(15, 15)
# plt.title('Confusion Matrix')
# plt.show()


In [220]:
# Final SVM: same linear kernel, but trained on the full labelled training set
# (no hold-out) before predicting the test features.
SVMModelFinal = SVC(kernel="linear").fit(train_data_df, train_metadata["ClassId"])
predictions = SVMModelFinal.predict(test_data_df)

# One 'ClassId' column of predictions, with rows labelled by the test index.
result_df = pd.DataFrame(predictions, index=test_data_df.index, columns=['ClassId'])

# Persist the predictions; the index is written as the first CSV column.
result_df.to_csv("result.csv", index=True)

## Random Forest

In [189]:
from sklearn.ensemble import RandomForestClassifier, VotingClassifier

In [221]:
# Train a random forest on the development training split.
# random_state is fixed so the forest — and the accuracy computed from it —
# is the same on every run.
RFModel = RandomForestClassifier(
    n_estimators=100,   # number of trees
    max_depth=None,     # None = no limit on tree depth
    random_state=42,    # fixed seed for reproducibility
)
RFModel.fit(X_train, y_train)

In [222]:
# Score the random forest on the held-out split and report accuracy as a percentage.
y_pred = RFModel.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

Accuracy: 75.41%


In [None]:
# Confusion matrix of the held-out labels against y_pred, which holds the
# predictions of whichever model was evaluated most recently.
cm = confusion_matrix(y_test, y_pred)

# plot() draws the matrix and returns the display object itself.
matrix = ConfusionMatrixDisplay(cm).plot()

# Enlarge the figure that was just drawn so the cells stay readable.
plt.gcf().set_size_inches(15, 15)
plt.title('Confusion Matrix')
plt.show()

## Voting Classifier Combination

In [223]:
# Base estimators for the ensemble, both seeded with random_state=42.
# The SVC needs probability=True so it can supply class probabilities.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
svm = SVC(probability=True, random_state=42)

# Soft voting averages the members' predicted probabilities
# ('hard' would take a majority vote instead).
voting_clf = VotingClassifier(
    estimators=[
        ('rf', rf),
        ('svm', svm),
    ],
    voting='soft',
)

# Train the ensemble on the development training split.
voting_clf.fit(X_train, y_train)

In [224]:
# Score the voting ensemble on the held-out split and report accuracy as a percentage.
y_pred = voting_clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

print(f"Accuracy: {accuracy * 100:.2f}%")

# Confusion-matrix plotting is disabled for this model; it can be drawn from
# (y_test, y_pred) with confusion_matrix and ConfusionMatrixDisplay.


Accuracy: 74.50%


In [None]:
# Final voting ensemble, trained on the full labelled training set (no hold-out).
# This cell previously instantiated SVC(kernel="linear"), so it produced the
# SVM's predictions rather than the ensemble's. The ensemble below uses the
# same configuration as the development voting classifier: a seeded random
# forest and a probability-enabled SVC combined by soft voting.
votingCLFFinal = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('svm', SVC(probability=True, random_state=42)),
    ],
    voting='soft',  # average the members' predicted class probabilities
)
votingCLFFinal.fit(train_data_df, train_metadata["ClassId"])

predictions = votingCLFFinal.predict(test_data_df)

# Create DataFrame with index as 'id' and prediction as 'ClassId'
result_df = pd.DataFrame({'ClassId': predictions}, index=test_data_df.index)

# Write to CSV.
# NOTE: this is the same "result.csv" path the SVM cell writes to, so running
# this cell replaces the SVM predictions on disk.
result_df.to_csv("result.csv", index=True)