In [35]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from statsmodels.formula.api import ols, logit
from sklearn.linear_model import LinearRegression, LogisticRegression
import scipy.stats as sp
from statsmodels.discrete.discrete_model import MNLogit
from sklearn.model_selection import train_test_split

from skimage.io import imread, imshow
import skimage.filters as filter
import skimage.feature as feature
from skimage.measure import label
from tqdm import tqdm
import os
glow='GLOW Data2.xlsx'; lbws='LBWSdata.xlsx'
main='Reduced MNIST Data'; training='/Reduced Trainging data'; testing='/Reduced Testing data'

In [4]:
def find_corner(img):
    X_tl=np.where(img.sum(axis=1)!=0)[0][0]
    Y_tl=np.where(img[X_tl]!=0)[0][0]
    Y_bl=np.where(img.sum(axis=0)!=0)[0][0]
    X_bl=np.where(img[:,Y_bl]!=0)[0][0]
    X_tr=np.where(img.sum(axis=1)!=0)[0][-1]
    Y_tr=np.where(img[X_tr]!=0)[0][-1]
    Y_br=np.where(img.sum(axis=0)!=0)[0][-1]
    X_br=np.where(img[:,Y_br]!=0)[0][-1]
    return {'X_tl':X_tl,'Y_tl':Y_tl,'X_bl':X_bl,'Y_bl':Y_bl,'X_tr':X_tr,'Y_tr':Y_tr,'X_br':X_br,'Y_br':Y_br}
def segment_sum(img,origin,dist,side):
    row_sum=img[origin-dist:origin+dist].sum(axis=1).sum()
    col_sum=img[:,origin-dist:origin+dist].sum(axis=0).sum()
    return {side+'_row_sum':row_sum,side+'_col_sum':col_sum}
def center_sum(img,originX,originY,dist):
    return img[originX-dist:originX+dist,originY-dist:originY+dist].sum()



In [5]:
def img_data(img_og):
    img=feature.canny(img_og)
    
    outp={}
    
    #functions from the data science lesson
    outp.update({'max_col':max(img.sum(axis=0))})
    outp.update({'max_row':max(img.sum(axis=1))})
    outp.update({'total_pixel':img.sum()})
    outp.update({'width':(img == True).any(axis = 0).sum()})
    outp.update({'height':(img == True).any(axis = 1).sum()})
    outp.update({'edges':label(img).max()})

    #apply some to the image before appling canny
    outp.update({'max_col_og':max(img_og.sum(axis=0))})
    outp.update({'max_row_og':max(img_og.sum(axis=1))})
    outp.update({'total_pixel_og':img_og.sum()})

    #find the top left,top right, bottom left and bottom right most pixel on an image
    corners=find_corner(img)
    outp.update({name:value for name,value in zip(corners.keys(),corners.values())})

    #create columns and count the number of white pixels in these
    top=segment_sum(img,7,3,'top')
    outp.update({name:value for name,value in zip(top.keys(),top.values())})
    middle=segment_sum(img,14,3,'mid')
    outp.update({name:value for name,value in zip(middle.keys(),middle.values())})
    bottom=segment_sum(img,21,3,'bot')
    outp.update({name:value for name,value in zip(bottom.keys(),bottom.values())})

    #same prosces for the original images
    outp.update({name:value for name,value in zip(corners.keys(),corners.values())})
    top=segment_sum(img_og,7,3,'top_og')
    outp.update({name:value for name,value in zip(top.keys(),top.values())})
    middle=segment_sum(img_og,14,3,'mid_og')
    outp.update({name:value for name,value in zip(middle.keys(),middle.values())})
    bottom=segment_sum(img_og,21,3,'bot_og')
    outp.update({name:value for name,value in zip(bottom.keys(),bottom.values())})

    #create a square of a specific radious and count the number of pixels
    outp.update({'center':center_sum(img,14,14,4)})
    outp.update({'top_left':center_sum(img,7,7,4)})
    outp.update({'bot_left':center_sum(img,21,7,4)})
    outp.update({'top_right':center_sum(img,7,21,4)})
    outp.update({'bot_right':center_sum(img,21,21,4)})

    #same prosces for the original images
    outp.update({'center_og':center_sum(img_og,14,14,4)})
    outp.update({'top_left_og':center_sum(img_og,7,7,4)})
    outp.update({'bot_left_og':center_sum(img_og,21,7,4)})
    outp.update({'top_right_og':center_sum(img_og,7,21,4)})
    outp.update({'bot_right_og':center_sum(img_og,21,21,4)})
    return outp


In [6]:
def create_image_dataframe(train_or_test,from_folder=range(10)):
    imgdata=pd.DataFrame()
    folder_list=[f'/{item}/' for item in from_folder]
    for folder,i in zip(folder_list,range(len(folder_list))):
        path = main+train_or_test+folder
        files = os.listdir(path)
        for file in files:
            imgdata=pd.concat([imgdata,pd.DataFrame(img_data(imread(main+train_or_test+folder+file)),index=[i])])
        print(f'finnished {train_or_test} {folder}.',end='\r')
    print(f'finnished {train_or_test}.    ')
    return imgdata.reset_index().rename(columns={'index':'true_value'})

In [18]:
def make_1D_image_array(train_or_test,from_folder=range(10),safety=False):
    img_1D_array=pd.DataFrame()
    folder_list=[f'/{item}/' for item in from_folder]
    for folder,i in zip(folder_list,range(len(folder_list))):
        path = main+train_or_test+folder
        files = os.listdir(path)
        for file in files:
            img_1D_array=pd.concat([img_1D_array,pd.DataFrame({folder+file:feature.canny(imread(main+train_or_test+folder+file)).reshape(784).astype(int)})],axis=1)
    return img_1D_array.T

In [19]:
make_1D_image_array(training,from_folder=[3])

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,774,775,776,777,778,779,780,781,782,783
/3/5132.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/3/5133.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/3/5134.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/3/5135.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/3/5136.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
/3/6127.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/3/6128.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/3/6129.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
/3/6130.jpg,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## exercise 2

In [9]:
imgdata_train=create_image_dataframe(training); imgdata_test=create_image_dataframe(testing)

finnished /Reduced Trainging data.    
finnished /Reduced Testing data.    


In [10]:
model=MNLogit.from_formula('true_value ~ '+' + '.join(imgdata_train.drop(columns='true_value').columns),imgdata_train).fit(disp=0)

  eXB = np.column_stack((np.ones(len(X)), np.exp(X)))
  return eXB/eXB.sum(1)[:,None]


In [11]:
preded=model.predict(imgdata_test)

In [12]:
a=(preded.idxmax(axis=1)==imgdata_test.true_value).mean()
print(f'{a:.2%} succes rate')

0.00% succes rate


  a=(preded.idxmax(axis=1)==imgdata_test.true_value).mean()


In [13]:
preded

Unnamed: 0,0,1,2,3,4,5,6,7,8,9
0,,,,,,,,,,
1,,,,,,,,,,
2,,,,,,,,,,
3,,,,,,,,,,
4,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...
1995,,,,,,,,,,
1996,,,,,,,,,,
1997,,,,,,,,,,
1998,,,,,,,,,,


## exercise 3


The black box method may perform better than the descriptive method because we have a large dataset and by 'forcing' the machine to learn image features through image labels, it may detect some features that we humans may miss. However, compared to the descriptive model, we will not be able to reason why the model has chosen its results. 

In [None]:
#imgdata_train=create_image_dataframe(training); imgdata_test=create_image_dataframe(testing)

finnished /Reduced Trainging data.    
finnished /Reduced Testing data.    


',true_value,max_col,max_row,total_pixel,width,height,edges,max_col_og,max_row_og,total_pixel_og,X_tl,Y_tl,X_bl,Y_bl,X_tr,Y_tr,X_br,Y_br,top_row_sum,top_col_sum,mid_row_sum,mid_col_sum,bot_row_sum,bot_col_sum,top_og_row_sum,top_og_col_sum,mid_og_row_sum,mid_og_col_sum,bot_og_row_sum,bot_og_col_sum,center,top_left,bot_left,top_right,bot_right,center_og,top_left_og,bot_left_og,top_right_og,bot_right_og\r\n0,0,14,8,102,18,21,2,5036,3271,47637,4,7,6,6,24,15,17,23,31,24,28,39,29,29,14882,11319,11846,14152,16142,14139,15,13,9,7,15,3702,7197,4504,3182,6867\r\n1,0,8,8,86,16,21,2,4786,3062,44903,4,16,16,6,24,13,13,21,20,15,29,40,26,21,11061,7343,14585,17622,15138,12177,23,1,9,11,8,8850,149,5870,6897,3660\r\n2,0,12,10,109,20,22,2,4803,3800,49680,3,9,8,5,24,14,15,24,34,35,26,28,33,33,14151,16457,12426,11938,18049,16060,3,17,12,10,16,492,6756,6613,4157,7574\r\n3,0,12,9,105,17,22,2,3811,2795,36262,4,15,15,6,25,16,17,22,25,23,25,30,36,37,11895,7742,8238,11818,11199,12568,7,5,16,12,19,1219,1155,4454,

In [None]:
#imgdata_train.to_csv('training_img.csv'); imgdata_test.to_csv('testing_img.csv')

In [47]:
training_set = pd.read_csv('testing_img.csv').iloc[:,1:]

display(training_set)
train, val = train_test_split(training_set, test_size=0.2)

Unnamed: 0,true_value,max_col,max_row,total_pixel,width,height,edges,max_col_og,max_row_og,total_pixel_og,...,center,top_left,bot_left,top_right,bot_right,center_og,top_left_og,bot_left_og,top_right_og,bot_right_og
0,0,9,10,103,22,22,2,4812,4552,68242,...,18,9,15,13,11,3169,5906,10368,9126,6843
1,0,9,14,93,21,22,2,4809,4737,73711,...,17,8,12,14,12,8358,4873,10339,8635,9290
2,0,9,10,89,18,21,2,5076,3795,67054,...,16,9,11,11,11,10378,4055,7430,8892,6351
3,0,8,8,89,18,21,2,5025,3825,59634,...,20,5,11,13,9,8612,1165,7189,7374,5894
4,0,8,10,105,22,20,2,4041,4296,65737,...,22,7,13,19,9,6872,3113,9655,11155,4856
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1995,9,11,7,90,14,22,3,3606,2539,34046,...,21,4,5,12,1,10674,812,1338,4903,169
1996,9,9,7,67,14,20,2,4542,3047,34262,...,11,1,4,9,6,15604,35,853,3541,1415
1997,9,9,7,75,13,21,2,4109,2795,33502,...,20,4,0,7,4,12920,834,74,3197,1207
1998,9,9,6,67,14,20,2,4586,2810,37903,...,14,2,6,8,3,15331,313,2969,3906,435


In [48]:
x_train = train.iloc[:,1:]
y_train = train['true_value']
model = LogisticRegression(multi_class='multinomial',solver ='newton-cg').fit(x_train,y_train)




In [49]:
prediction = model.predict(val.iloc[:,1:])
prediction

array([7, 2, 5, 2, 1, 7, 8, 9, 1, 6, 6, 5, 5, 1, 7, 4, 0, 5, 5, 6, 8, 8,
       3, 6, 5, 9, 6, 4, 6, 7, 1, 0, 3, 9, 1, 4, 5, 9, 7, 5, 0, 0, 6, 9,
       3, 6, 0, 2, 8, 9, 3, 5, 8, 8, 8, 7, 2, 9, 6, 7, 7, 7, 5, 4, 8, 5,
       5, 1, 1, 4, 0, 3, 4, 1, 7, 0, 8, 9, 9, 1, 0, 6, 9, 0, 7, 0, 6, 2,
       0, 6, 0, 3, 7, 4, 8, 9, 6, 2, 6, 4, 4, 3, 6, 3, 9, 9, 7, 4, 0, 3,
       8, 8, 3, 1, 6, 4, 0, 2, 1, 7, 2, 6, 2, 1, 2, 0, 3, 6, 4, 5, 8, 0,
       6, 7, 2, 3, 1, 7, 5, 3, 3, 5, 4, 2, 8, 8, 0, 1, 0, 5, 9, 9, 4, 0,
       5, 9, 2, 9, 6, 6, 0, 8, 1, 9, 3, 4, 0, 1, 3, 3, 0, 8, 2, 6, 4, 7,
       8, 0, 8, 9, 7, 2, 7, 6, 4, 3, 7, 9, 9, 4, 0, 3, 8, 9, 9, 6, 5, 1,
       4, 4, 7, 2, 4, 9, 0, 7, 6, 3, 2, 9, 6, 9, 7, 7, 8, 0, 4, 3, 7, 6,
       9, 0, 5, 6, 1, 1, 6, 7, 6, 8, 4, 4, 2, 9, 3, 0, 2, 7, 2, 4, 5, 4,
       5, 2, 4, 4, 4, 8, 7, 2, 7, 6, 0, 9, 6, 4, 7, 7, 2, 7, 1, 1, 3, 4,
       6, 2, 1, 3, 7, 8, 3, 8, 6, 7, 9, 7, 0, 3, 4, 1, 2, 5, 5, 4, 7, 9,
       7, 5, 1, 1, 8, 0, 0, 7, 3, 2, 0, 3, 0, 5, 9,

Unnamed: 0,true_value,max_col,max_row,total_pixel,width,height,edges,max_col_og,max_row_og,total_pixel_og,...,center,top_left,bot_left,top_right,bot_right,center_og,top_left_og,bot_left_og,top_right_og,bot_right_og
1590,7,7,11,106,20,21,1,3118,4312,44171,...,19,9,8,12,5,4682,3617,3746,6574,1431
1457,7,11,14,75,18,21,1,4050,4559,37215,...,16,16,0,8,4,4731,6401,117,4960,1175
632,3,11,12,93,18,22,3,5041,3826,51162,...,12,8,12,10,9,12569,2113,4416,4918,6853
361,1,18,6,51,8,20,1,5070,2031,31432,...,12,0,0,3,8,11665,66,63,205,2653
1699,8,9,8,93,18,21,3,4261,3534,49101,...,15,5,16,16,0,11356,1595,8213,8817,159
417,2,9,7,81,16,21,1,3319,3300,33235,...,14,11,5,13,10,4205,3676,1038,4884,2679
1186,5,14,8,75,20,21,1,5086,3311,38022,...,16,0,16,16,0,8179,45,7154,7935,0
1066,5,8,9,81,21,17,1,3822,4053,38248,...,14,5,15,16,5,9644,1110,5093,7572,1631
1271,6,10,7,77,16,21,2,4557,4078,50616,...,11,5,9,8,10,13890,922,5851,1672,3917
32,0,9,12,114,22,18,1,3064,3778,42075,...,14,7,16,24,8,2915,1904,6626,6656,2671
