In [1]:
# loading packages

import os

import pandas as pd
import numpy as np
from numpy import linalg as LA

# plotting packages
%matplotlib inline
from mpl_toolkits.mplot3d import Axes3D
import matplotlib.pyplot as plt
import matplotlib.cm as cm
import matplotlib.colors as clrs

# PCA algorithm from scikit-learn
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

# Load raw data

In [2]:
# load raw data
raw = pd.read_csv('alldata.csv')

# check the raw data
print(raw.shape)

(29755, 11)


In [3]:
raw.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J,RESULT
0,0,1,0,1,7,13000,0,10,1,34,0
1,1,0,0,0,7,19000,0,1,1,828,0
2,0,0,0,1,7,0,0,9,1,259,1
3,0,0,1,1,3,0,0,5,1,43259,0
4,0,0,0,1,7,10000,0,4,1,47,0


In [4]:
raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 29755 entries, 0 to 29754
Data columns (total 11 columns):
A         29755 non-null int64
B         29755 non-null int64
C         29755 non-null int64
D         29755 non-null int64
E         29755 non-null int64
F         29755 non-null int64
G         29755 non-null int64
H         29755 non-null int64
I         29755 non-null int64
J         29755 non-null int64
RESULT    29755 non-null int64
dtypes: int64(11)
memory usage: 2.5 MB


In [5]:
raw.drop(['RESULT'], axis=1, inplace=True)

# Simple exploratory analysis

# Print summary statistics

In [6]:
# print summary statistics
raw.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
A,29755.0,0.042884,0.202598,0.0,0.0,0.0,0.0,1.0
B,29755.0,0.161754,0.368231,0.0,0.0,0.0,0.0,1.0
C,29755.0,0.469703,0.49909,0.0,0.0,0.0,1.0,1.0
D,29755.0,0.680255,0.466385,0.0,0.0,1.0,1.0,1.0
E,29755.0,6.075416,1.763556,1.0,6.0,7.0,7.0,7.0
F,29755.0,4678.214082,24161.156774,0.0,0.0,0.0,7000.0,1488000.0
G,29755.0,0.023962,0.152934,0.0,0.0,0.0,0.0,1.0
H,29755.0,7.159503,12.466791,0.0,1.0,3.0,8.0,374.0
I,29755.0,0.923576,0.26568,0.0,1.0,1.0,1.0,1.0
J,29755.0,17663.41882,21085.402086,0.0,37.0,599.0,43253.0,43260.0


# Standardization Feature Scaling 

In [7]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X = scaler.fit_transform(raw)
X_scaled = pd.DataFrame(X, columns = [raw.columns])
X_scaled.head()

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
0,-0.211672,2.276448,-0.941135,0.685592,0.524282,0.344434,-0.156686,0.227849,0.28766,-0.83611
1,4.724295,-0.439281,-0.941135,-1.458594,0.524282,0.592771,-0.156686,-0.494081,0.28766,-0.798453
2,-0.211672,-0.439281,-0.941135,0.685592,0.524282,-0.193629,-0.156686,0.147634,0.28766,-0.825439
3,-0.211672,-0.439281,1.062547,0.685592,-1.743902,-0.193629,-0.156686,-0.173223,0.28766,1.213921
4,-0.211672,-0.439281,-0.941135,0.685592,0.524282,0.220266,-0.156686,-0.253438,0.28766,-0.835493


# Correlation matrix

In [8]:
a = X_scaled.corr()
a

Unnamed: 0,A,B,C,D,E,F,G,H,I,J
A,1.0,0.025049,-0.05961,-0.077543,-0.107068,-0.003696,0.038425,-0.037518,-0.029024,0.077597
B,0.025049,1.0,-0.035785,-0.003732,0.029087,0.006799,-0.026457,0.099292,0.072084,-0.036248
C,-0.05961,-0.035785,1.0,-0.537593,0.23766,-0.002221,0.018538,0.202142,0.138925,-0.178639
D,-0.077543,-0.003732,-0.537593,1.0,-0.135887,0.010801,0.019309,0.044356,-0.053461,0.059704
E,-0.107068,0.029087,0.23766,-0.135887,1.0,0.051941,-0.026514,0.203232,0.283445,-0.330355
F,-0.003696,0.006799,-0.002221,0.010801,0.051941,1.0,-0.006523,0.014606,0.038609,-0.074969
G,0.038425,-0.026457,0.018538,0.019309,-0.026514,-0.006523,1.0,-0.028358,-0.089755,0.043139
H,-0.037518,0.099292,0.202142,0.044356,0.203232,0.014606,-0.028358,1.0,0.123974,-0.227114
I,-0.029024,0.072084,0.138925,-0.053461,0.283445,0.038609,-0.089755,0.123974,1.0,-0.320382
J,0.077597,-0.036248,-0.178639,0.059704,-0.330355,-0.074969,0.043139,-0.227114,-0.320382,1.0


# PCA- Principal Components Analysis

In [9]:
pca = PCA(n_components=10)
pca.fit(X_scaled)
rc = raw.columns
rp = pca.explained_variance_ratio_
rdata = {'Feature':rc, 'Individual Variance':rp} 
df = pd.DataFrame(rdata)
df["Cumulative Variance"] = 0.00000
df["Cumulative Variance"][0]=df["Individual Variance"][0]
for x in range(1,len(df["Individual Variance"])):
    df["Cumulative Variance"][x] = df["Cumulative Variance"][x-1]+df["Individual Variance"][x]
df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,Feature,Individual Variance,Cumulative Variance
0,A,0.205671,0.205671
1,B,0.13652,0.342191
2,C,0.105254,0.447445
3,D,0.100849,0.548294
4,E,0.100205,0.648499
5,F,0.092994,0.741494
6,G,0.087173,0.828667
7,H,0.06762,0.896287
8,I,0.064698,0.960985
9,J,0.039015,1.0
