In [2]:
import glob
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import sklearn
import torch
from PIL import Image
from PIL import UnidentifiedImageError
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from torchvision.transforms import ToTensor

In [None]:
# Read the three tabular inputs from the local data/ directory.
# The reads are independent of each other, so their order does not matter.
fb_df = pd.read_json("data/cleaned_tabular_new.json")
images_csv = pd.read_csv("data/Images.csv")
merge_df = pd.read_csv("data/merge_df.csv")

In [None]:
def extract_image_name(path):
    """Return the file name of ``path`` without directory or extension.

    Uses ``os.path`` so the platform's path separator is honoured (splitting
    on a literal '/' breaks on Windows) and only the final extension is
    removed, so a name that itself contains dots is not truncated.
    """
    return os.path.splitext(os.path.basename(path))[0]


# glob returns files in arbitrary, filesystem-dependent order; sorting makes
# every later step that depends on row order reproducible across runs.
image_paths = sorted(glob.glob('data/cleaned_images/*.jpg'))
image_name = [extract_image_name(path) for path in image_paths]
image_df = pd.DataFrame({'image_name': image_name})
image_df

In [None]:
#create a df for labels and category numbers to easily spot the categorised label
#y_labelling = pd.DataFrame({'label': merge_df['main_category'],'category': merge_df['main_category']})
#y_labelling 

In [None]:
# Build a category-name -> integer-code lookup from merge_df['main_category'].
# The unique names are sorted before being numbered: iterating a bare set() of
# strings can yield a different order in every Python process (string hashing
# is randomised per process by default), which would give each kernel restart
# a different encoding.  key=str keeps the sort usable even if a non-string
# value (e.g. NaN) is present in the column.
category_names = sorted(set(merge_df['main_category']), key=str)
cat_dict = {name: code for code, name in enumerate(category_names)}

In [None]:
# Show the category-name -> integer-code mapping built from merge_df.
cat_dict

In [None]:
# Inner-join the image names onto merge_df (image_name matched against id_x)
# and keep only the two columns used later: image name and main category.
image_cat_df = pd.merge(
    image_df,
    merge_df,
    how='inner',
    left_on='image_name',
    right_on='id_x',
)[['image_name', 'main_category']]

In [None]:
# Show the merged image-name / main-category table.
image_cat_df

In [None]:
# Upper bound on the number of rows loaded (the value the notebook used as a
# hard-coded slice).  Set to None to load every row of image_cat_df.
MAX_IMAGES = 12370

# Look up each file by image name instead of by list position.  image_cat_df
# is the result of an inner merge, which can drop or duplicate rows, so the
# i-th path in image_paths is not guaranteed to belong to the i-th row of
# image_cat_df; pairing by position can attach the wrong category to an image.
path_by_name = dict(zip(image_name, image_paths))

# One converter instance is enough; it holds no per-image state.
to_tensor = ToTensor()

complete_img_arr = []   # (flattened pixels, category) pairs
image_array = []        # flattened pixel vectors
img_name = []           # image names, aligned with image_array
image_category = []     # category names, aligned with image_array

rows = image_cat_df if MAX_IMAGES is None else image_cat_df.head(MAX_IMAGES)
for name, category in zip(rows['image_name'], rows['main_category']):
    # The context manager closes the file handle as soon as the pixels have
    # been copied into a NumPy array.
    with Image.open(path_by_name[name]) as img:
        pixels = np.array(img)
    # ToTensor yields a torch tensor; it is flattened to 1-D and converted
    # back to NumPy so it can be stored in a DataFrame column.
    # NOTE(review): the model later needs every vector to have the same
    # length, i.e. all images the same size and mode -- confirm for the
    # contents of data/cleaned_images.
    flat = torch.flatten(to_tensor(pixels)).numpy()
    image_array.append(flat)
    img_name.append(name)
    image_category.append(category)
    complete_img_arr.append((flat, category))
    

In [None]:
# Column name -> list of values.  'category' and 'label' both start out as
# the same list of category names.
image_data_dict = dict(
    category=image_category,
    image_name=img_name,
    image_array=image_array,
    label=image_category,
)

In [None]:
# Assemble the per-image table with an explicit column order.
column_order = ['label', 'category', 'image_name', 'image_array']
image_data_df = pd.DataFrame(data=image_data_dict, columns=column_order)
image_data_df

In [None]:
# Encode category names as integer codes.
#
# * The unique names are sorted before numbering so the encoding is the same
#   on every run; enumerating a bare set() of strings gives a process-dependent
#   order.  key=str keeps the sort usable if a non-string value (e.g. NaN) is
#   present.
# * The mapping is derived from, and applied to, the 'label' column, which is
#   built from the same list as 'category' and is not overwritten by this
#   cell.  That makes the cell safe to re-run: mapping 'category' onto itself
#   would re-encode the integers produced by the previous run.
category_names = sorted(set(image_data_df['label']), key=str)
cat_dict = {name: code for code, name in enumerate(category_names)}
image_data_df['category'] = image_data_df['label'].map(cat_dict)

In [None]:
# Show the category-name -> integer-code mapping rebuilt from image_data_df.
cat_dict

In [None]:
# Show image_data_df after 'category' was replaced by integer codes.
image_data_df

In [None]:

# Features are the flattened image vectors; targets are the integer codes.
X = image_data_df['image_array'].tolist()
y = image_data_df['category'].tolist()

# Hold out 20% of the samples for evaluation, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Report the size of each split, one number per line.
for split in (X_train, X_test, y_train, y_test):
    print(len(split))

In [None]:
# Unregularised multinomial logistic regression fitted with the SAGA solver.
# SAGA is a stochastic solver, so a fixed random_state is needed for the fit
# (and therefore the reported accuracy) to be reproducible; 42 matches the
# seed used for the train/test split.
# NOTE(review): recent scikit-learn releases deprecate the string
# penalty='none' in favour of penalty=None and deprecate multi_class -- confirm
# against the installed version before upgrading.
model = LogisticRegression(
    penalty='none',
    tol=0.1,
    solver='saga',
    multi_class='multinomial',
    random_state=42,
)

In [None]:
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training split only, then apply the same fitted
# transform to the test split.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
print('train scaled')
X_test = scaler.transform(X_test)
print('test scaled')

In [None]:
# fit() returns the fitted estimator, so training and test-set prediction can
# be chained in one expression.
predictions = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Spot-check two test samples: true code first, then the predicted code.
for sample_idx in (10, 44):
    print(y_test[sample_idx])
    print('Predicted Class is: %d' % predictions[sample_idx])
    if sample_idx == 10:
        # Blank line between the two samples.
        print()

In [None]:
from sklearn import metrics

# Accuracy on the held-out split and the confusion matrix of true vs
# predicted codes.
score = model.score(X_test, y_test)
confusion_matrix = metrics.confusion_matrix(y_test, predictions)

# Accuracy, test-set size and the matrix, one item per line.
print(score, len(y_test), confusion_matrix, sep='\n')

In [None]:
import seaborn as sns

# Heatmap of the raw confusion-matrix counts.
# The cells hold integer counts, so they are annotated with fmt="d"; the
# previous ".3f" rendered every count with three meaningless decimals.
fig, ax = plt.subplots(figsize=(13, 13))
sns.heatmap(
    confusion_matrix,
    annot=True,
    fmt="d",
    linewidths=.5,
    square=True,
    cmap='Blues_r',
    ax=ax,
)
ax.set_ylabel('Actual label')
ax.set_xlabel('Predicted label')
all_sample_title = 'Accuracy Score: {0}'.format(score)
# Trailing semicolon suppresses the Text(...) repr in the cell output.
ax.set_title(all_sample_title, size=15);

In [None]:
# Row-normalised cross-tabulation: each row (actual code) sums to 1, so a
# cell is the fraction of that class predicted as the column's code.
cm = pd.crosstab(
    index=y_test,
    columns=predictions,
    rownames=['Actual'],
    colnames=['Predicted'],
    normalize='index',
)

# Open a new 10x10 figure; the heatmap draws onto it and p ends up holding
# the Axes returned by seaborn.
plt.figure(figsize=(10, 10))
p = sns.heatmap(cm, annot=True, fmt=".2f", cbar=False)
p.set_title(f'Overall accuracy score: {score}')

# Show the mapping as the cell output so codes on the axes can be read off.
cat_dict

In [None]:
# List the column names of image_data_df.
image_data_df.columns

In [None]:
# Show image_data_df again for reference.
image_data_df

In [None]:
# Invert the lookup so it maps integer code -> category name.
# NOTE: this rebinding is not idempotent -- running the cell a second time
# flips cat_dict back to name -> code and the lookup below raises KeyError.
cat_dict = {value: key for key, value in cat_dict.items()}

# Map every image name to its category name.  The two columns are walked in
# parallel instead of doing a label-based Series lookup per row, and the
# per-row debug prints (three lines for every image) are gone; display
# uuid_category_dict to inspect the result.
uuid_category_dict = {
    uuid: cat_dict[code]
    for uuid, code in zip(image_data_df['image_name'], image_data_df['category'])
}


In [None]:
# Show the image-name -> category-name lookup built in the previous cell.
uuid_category_dict

In [None]:
import json

# Serialise the code -> category-name lookup to JSON text, then write it out.
category_label_json = json.dumps(cat_dict)
with open("data/category_label.json", "w") as f:
    f.write(category_label_json)


In [None]:
import json

# Serialise the image-name -> category-name lookup to JSON text, then write
# it out.
uuid_category_json = json.dumps(uuid_category_dict)
with open("data/uuid_category_img.json", "w") as f:
    f.write(uuid_category_json)


In [None]:
# Show cat_dict, which an earlier cell inverted to map integer code -> category name.
cat_dict

In [None]:
# Keep only the columns that get exported ('category' and 'image_array').
# DataFrame.drop(..., inplace=True) returns None, so the previous assignment
# left X_data as None.  The frame is rebound instead, so later cells that read
# image_data_df still see the reduced table, and errors='ignore' lets the cell
# be re-run after the columns are already gone.
image_data_df = image_data_df.drop(columns=['label', 'image_name'], errors='ignore')
X_data = image_data_df

In [None]:
# Show image_data_df after the 'label' and 'image_name' columns were dropped.
image_data_df

In [None]:
#save_path = "data/image_model_y.pkl"
#y_data = image_data_df['category']
#y_data = y_data.to_pickle(save_path)

In [None]:
# Write the reduced table to CSV.
# to_csv() returns None when given a path, so its result is no longer
# assigned back to y_data (which previously ended up as None).
# NOTE(review): 'image_array' holds NumPy arrays, which are written as their
# str() form; NumPy abbreviates arrays above its print threshold (1000
# elements by default) with "...", so large image vectors would not survive a
# round trip through this file -- confirm, and consider to_pickle instead.
save_path = "data/image_model_data.csv"
y_data = image_data_df
y_data.to_csv(save_path)