## Principal component analysis for Grouped data sets - Difficulty

To choose the operator to analyze, type its code in the input box below:
- Relocate: type "REL"
- 2-Opt Family: type "OPT"
- Exchange: type "EX"


In [1]:
# Read the operator code typed by the user (see the list of codes in the cell above).
# `raw_input` only exists on Python 2; on Python 3 the same function is called `input`,
# so pick whichever is available instead of failing with NameError on a Python 3 kernel.
try:
    _read_line = raw_input
except NameError:
    _read_line = input

# Surrounding whitespace is stripped because the code is later compared with `==`
# against the 'OPER' column, where a stray space would silently match no rows.
operator = _read_line().strip()

REL


To choose the representation to analyze, type its code in the input box below:
- Delimiters: type "DEL"
- Giant-Tour: type "GT"

In [2]:
# Read the representation code typed by the user (see the list of codes in the cell above).
# `raw_input` only exists on Python 2; on Python 3 the same function is called `input`,
# so pick whichever is available instead of failing with NameError on a Python 3 kernel.
try:
    _read_line = raw_input
except NameError:
    _read_line = input

# Surrounding whitespace is stripped because the code is later compared with `==`
# against the 'REP' column, where a stray space would silently match no rows.
representation = _read_line().strip()

DEL


In [3]:
from numpy import eye, asarray, dot, sum, diag
from numpy.linalg import svd
def varimax(Phi, gamma = 1.0, q = 20, tol = 1e-6):
    """Rotate the loading matrix `Phi` by an iteratively refined orthogonal rotation.

    Parameters
    ----------
    Phi : 2-D array with p rows and k columns (unpacked from `Phi.shape`).
    gamma : weight of the column sum-of-squares term subtracted from the cubed
        loadings; presumably 1.0 is the classic varimax setting -- confirm if
        other values are needed.
    q : maximum number of rotation updates.
    tol : the loop stops once the ratio between the current and the previous
        sum of singular values is below ``1 + tol``.

    Returns
    -------
    ``dot(Phi, R)``, where ``R`` is the k-by-k rotation reached when the loop ends.
    """
    n_rows, n_factors = Phi.shape
    rotation = eye(n_factors)
    criterion = 0
    for _ in range(q):
        previous = criterion
        rotated = dot(Phi, rotation)
        # Diagonal matrix holding the sum of squares of every rotated column.
        column_ss = diag(diag(dot(rotated.T, rotated)))
        # asarray guarantees an element-wise cube even if Phi is a numpy matrix.
        target = asarray(rotated)**3 - (gamma/n_rows) * dot(rotated, column_ss)
        left, singular_values, right_h = svd(dot(Phi.T, target))
        # Product of the two orthogonal SVD factors gives the updated rotation.
        rotation = dot(left, right_h)
        criterion = sum(singular_values)
        # The very first pass has no previous value to compare with (previous == 0).
        if previous != 0 and criterion/previous < 1 + tol:
            break
    return dot(Phi, rotation)

In [4]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
#from dmba import classificationSummary
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix

# Load the full results table from the working directory.
datos = pd.read_csv('Data.csv')

# Integer parsed from every instance name: the second '-'-separated field with its
# 'n' characters removed (a name shaped like 'X-n101-k25' would give 101).
# NOTE(review): `array` is not referenced again in the visible cells -- confirm
# whether it is still needed.
array = [int(name.split('-')[1].replace('n', "")) for name in datos['INS']]

# Keep only the rows of the requested representation/operator pair. The explicit
# .copy() turns the selection into an independent frame, so adding the SA_CLASS
# column below does not trigger pandas' SettingWithCopyWarning.
datos = datos[(datos['REP'] == representation) & (datos['OPER'] == operator)].copy()

# Fail early with a readable message instead of an obscure error from qcut/PCA
# when the typed codes do not match any row.
if datos.empty:
    raise ValueError(
        "No rows in Data.csv with REP == {!r} and OPER == {!r}".format(representation, operator)
    )

# Split SA_MEAN into two equal-sized quantile bins: lower half 'Low', upper half 'High'.
datos['SA_CLASS'] = pd.qcut(datos['SA_MEAN'], 2, labels = ['Low','High'])

# Fresh 0..n-1 index so the frame lines up row by row with the PCA scores later on.
datos = datos.reset_index(drop=True)

import numpy as np

# Columns of `datos` that enter the PCA.
features = ['CL','PIC','IC','DB','SIC','MIC','FSR','RFB']

# Feature matrix (one column per entry of `features`) and the class-label column,
# both as plain NumPy arrays.
x = datos[features].values
y = datos[['SA_CLASS']].values

# Standardize every feature with StandardScaler before the decomposition.
x = StandardScaler().fit_transform(x)

# Fit a 3-component PCA and keep the scores of every row.
pca = PCA(n_components=3)
principalComponents = pca.fit_transform(x)
score_columns = ['PC 1', 'PC 2', 'PC 3']
principalDf = pd.DataFrame(principalComponents, columns=score_columns)

# Put scores, class label and instance name side by side; the concat aligns on the
# index, so `datos` must carry the same 0..n-1 index as `principalDf`.
labels_and_names = datos[['SA_CLASS','INS']]
finalDf = pd.concat([principalDf, labels_and_names], axis = 1)

# Fraction of the total variance explained by each of the three components.
var = pca.explained_variance_ratio_

# Loadings: component vectors scaled by the square root of their explained variance.
component_scale = np.sqrt(pca.explained_variance_)
loadings = pca.components_.T * component_scale

loading_matrix = pd.DataFrame(loadings, index=features, columns=['PC1', 'PC2','PC3'])


# Human-readable names used in the plot title. Any code that is not listed
# falls back to the last option, exactly like the `else` branch of an if-chain.
representation_names = {'DEL': 'Delimiters'}
operator_names = {'REL': 'Relocate', 'OPT': '2-Opt Family'}

representation2 = representation_names.get(representation, 'Giant-Tour')
operator2 = operator_names.get(operator, 'Exchange')


import matplotlib.pylab as plt
from mpl_toolkits.mplot3d import Axes3D, proj3d

# One 8x8 figure holding a single 3-D axes.
fig = plt.figure(figsize = (8,8))
ax = plt.axes(projection='3d')

# Explained variance of the three components as percentages with two decimals.
pct_1, pct_2, pct_3 = [round(var[k]*100,2) for k in range(3)]
ax.set_xlabel('Principal Component 1 ({}%)'.format(pct_1), fontsize = 9)
ax.set_ylabel('Principal Component 2 ({}%)'.format(pct_2), fontsize = 9)
ax.set_zlabel('Principal Component 3 ({}%)'.format(pct_3), fontsize = 9)

plot_title = 'Projection of {0} - {1}'.format(operator2, representation2)
ax.set_title(plot_title, fontsize = 11, fontweight="bold", y=1.07, pad=-14)

# One scatter call per class; the legend below relies on this drawing order.
targets = ['Low','High']
colors = ['g', 'r']

for target, color in zip(targets,colors):
    indicesToKeep = finalDf['SA_CLASS'] == target
    class_scores = finalDf.loc[indicesToKeep, ['PC 1', 'PC 2', 'PC 3']]
    ax.scatter3D(class_scores['PC 1'], class_scores['PC 2'], class_scores['PC 3'],
                 c=color, s = 40)

# Corner of the point cloud (largest PC 1, smallest PC 2 and PC 3) used to place the legend.
max_x = finalDf['PC 1'].max()
min_y = finalDf['PC 2'].min()
min_z = finalDf['PC 3'].min()


def f(x, y, z):
    """Project a 3-D data point through the current axes projection; keep the 2-D part."""
    return proj3d.proj_transform(x, y, z, ax.get_proj())[:2]


legend_anchor = f(max_x,min_y,min_z)
ax.legend(targets, loc="lower right", bbox_to_anchor=legend_anchor,
          bbox_transform=ax.transData)

ax.grid()

# Write the figure next to the notebook; the file name encodes operator and representation.
output_name = 'PCA_DIF_{0}_{1}.png'.format(operator,representation)
plt.savefig(output_name, dpi = 300, bbox_inches = 'tight')


In [5]:
from IPython.display import Image
# Show the PNG saved by the plotting cell. `filename=` makes IPython read the file and
# embed its bytes in the cell output; `url=` only stores a link to the local path, which
# is lost when the notebook is exported or shared without the PNG file.
Image(filename = "PCA_DIF_{0}_{1}.png".format(operator,representation), width = 400, height = 400)