# Kaggle Fruits-360 Transfer Learning Based on PCA + Random Forest

## 1 import packages

In [1]:
import numpy as np
import cv2
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.offsetbox import OffsetImage, AnnotationBbox
import glob
import os
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import svm
from sklearn.cluster import KMeans

%matplotlib inline



## 2 load fruit images and labels

In [None]:
# Load every training image as a 45x45 RGB array, labelled with the name of
# the class folder it was found in.
fruit_images = []
labels = []
# glob returns paths in arbitrary, filesystem-dependent order; sorting keeps
# the sample order (and therefore the seeded train/validation split) stable.
for fruit_dir_path in sorted(glob.glob("/Users/xiaojialiang/Desktop/INFO6105/KaggleFruitRecognition/fruits-360/Training/*")):
    print(fruit_dir_path)
    # basename is path-separator agnostic, unlike splitting on "/".
    fruit_label = os.path.basename(fruit_dir_path)
    for image_path in sorted(glob.glob(os.path.join(fruit_dir_path, "*.jpg"))):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # rather than raising; skip it instead of crashing in cv2.resize.
            print("skipping unreadable image:", image_path)
            continue

        image = cv2.resize(image, (45, 45))
        # imread yields BGR; swap channels to RGB. (Numerically the same
        # channel swap as the previous COLOR_RGB2BGR call, named correctly.)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        fruit_images.append(image)
        labels.append(fruit_label)
fruit_images = np.array(fruit_images)
labels = np.array(labels)

## 3 build id and label dictionary

In [None]:
# Assign each distinct label an integer id, numbered in the order returned by
# np.unique, then build the reverse lookup from id back to label.
label_to_id_dict = dict(
    (label, index) for index, label in enumerate(np.unique(labels))
)
id_to_label_dict = dict(
    zip(label_to_id_dict.values(), label_to_id_dict.keys())
)

### 3.1 build labelID nparray

In [None]:
# Encode each sample's string label as its integer class id.
label_ids = np.array(list(map(label_to_id_dict.__getitem__, labels)))

## 4 preprocess image data

### 4.1 standard scaling

In [None]:
# Standardizer for the flattened pixel features. It is fitted on the training
# images below and the same fitted instance is reused to transform test images.
scaler = StandardScaler()

In [None]:
# Flatten each image into one row of pixel values, then fit the scaler on
# those rows and standardize them in a single step.
images_scaled = scaler.fit_transform(
    fruit_images.reshape(len(fruit_images), -1)
)

### 4.2 PCA

In [None]:
# Reduce the standardized pixel features to 50 principal components.
# For large inputs PCA's default solver may use a randomized SVD, so pin the
# seed to get the same components (and downstream accuracy) on every run.
pca = PCA(n_components=50, random_state=42)
pca_result = pca.fit_transform(images_scaled)

### 4.3 t-SNE

In [None]:
# Embed the PCA features in 2-D with t-SNE and standardize the embedding.
# t-SNE is stochastic; a fixed seed makes the embedding repeatable between runs.
tsne = TSNE(n_components=2, perplexity=40.0, random_state=42)
tsne_result = tsne.fit_transform(pca_result)
tsne_result_scaled = StandardScaler().fit_transform(tsne_result)

### 4.4 split train and validation datasets

In [None]:
# Hold out 25% of the PCA-projected samples for validation; the fixed seed
# makes the partition repeatable for a given sample order.
# NOTE(review): the scaler and PCA were fitted on all samples before this
# split, so the held-out rows influenced the preprocessing and the validation
# score may be optimistic.
X_train,X_valid,Y_train,Y_valid = train_test_split(pca_result,label_ids,test_size=0.25,random_state=42)

## 5 train the model

In [None]:
# Train a 10-tree random forest on the PCA features.
# Tree construction is randomized (bootstrap samples, feature subsets), so the
# seed is fixed to make the trained model and its reported accuracy repeatable.
forest = RandomForestClassifier(n_estimators=10, random_state=42).fit(X_train, Y_train)

## 6 validate the model on the validation dataset

In [None]:
# Score the held-out split. The printed value is accuracy as a percentage
# (the variable is named "precision", but accuracy_score is what is computed).
valid_predictions = forest.predict(X_valid)
precision = 100 * accuracy_score(y_true=Y_valid, y_pred=valid_predictions)
print(precision)

## 7 test the model on the test dataset

### 7.1 load test images and labels

In [None]:
# Load every test image as a 45x45 RGB array, labelled with the name of the
# class folder it was found in.
test_fruit_images = []
test_labels = []
# Read from the dataset's Test split. The previous path pointed at Training,
# so the "test" accuracy was measured on the images the model was trained on.
# NOTE(review): older Fruits-360 releases name this folder "Validation" —
# adjust the path if your copy uses that layout.
for fruit_dir_path in sorted(glob.glob("/Users/xiaojialiang/Desktop/INFO6105/KaggleFruitRecognition/fruits-360/Test/*")):
    # basename is path-separator agnostic, unlike splitting on "/".
    fruit_label = os.path.basename(fruit_dir_path)
    for image_path in sorted(glob.glob(os.path.join(fruit_dir_path, "*.jpg"))):
        image = cv2.imread(image_path, cv2.IMREAD_COLOR)
        if image is None:
            # cv2.imread signals an unreadable/corrupt file by returning None
            # rather than raising; skip it instead of crashing in cv2.resize.
            print("skipping unreadable image:", image_path)
            continue
        image = cv2.resize(image, (45, 45))
        # imread yields BGR; swap channels to RGB. (Numerically the same
        # channel swap as the previous COLOR_RGB2BGR call, named correctly.)
        image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)

        test_fruit_images.append(image)
        test_labels.append(fruit_label)

test_fruit_images = np.array(test_fruit_images)
test_labels = np.array(test_labels)
        



### 7.2 build test labelID array

In [None]:
# Encode the test labels with the id mapping built from the training labels.
test_label_ids = np.array(list(map(label_to_id_dict.__getitem__, test_labels)))

### 7.3 scale test image data

In [None]:
# Flatten each test image into one row and standardize it with the scaler
# that was fitted on the training images (transform only, no refit).
test_images_scaled = scaler.transform(
    test_fruit_images.reshape(len(test_fruit_images), -1)
)

### 7.4 perform PCA on scaled data and predict

In [None]:
# Project the scaled test images with the already-fitted PCA (no refit).
test_pca_result = pca.transform(test_images_scaled)

In [None]:
# Predict a class id for every projected test image with the trained forest.
test_predictions = forest.predict(test_pca_result)

### 7.5 calculate model's accuracy on the test data

In [None]:
# Accuracy on the test images as a fraction (not multiplied by 100, unlike
# the validation figure). The bare name on the last line displays the value.
precision2 = accuracy_score(y_true=test_label_ids, y_pred=test_predictions)
precision2