In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib
import matplotlib.pyplot as plt
from scipy import stats
import math
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [2]:
train = pd.read_csv('TrainClean.csv')
test = pd.read_csv('TestClean.csv')
train = train.drop(labels='Unnamed: 0', axis=1) #there is this new misterious and useless column
all_data_index = train.columns.drop(['NumberOfSales'])
all_data = pd.concat((train.loc[:,all_data_index],test.loc[:,all_data_index]))

In [3]:
numeric_variables=pd.DataFrame()
for col in all_data.columns:
    if (all_data[col].value_counts().index.isin([0,1]).all()==0): #if the column is not binary
        numeric_variables[col]=all_data[col]

In [4]:
numeric_variables.head()

Unnamed: 0,NearestCompetitor,Region_AreaKM2,Region_GDP,Region_PopulationK,CloudCover,Max_Dew_PointC,Max_Humidity,Max_Sea_Level_PressurehPa,Max_TemperatureC,Max_VisibilityKm,...,Mean_Sea_Level_PressurehPa,Mean_TemperatureC,Mean_VisibilityKm,Mean_Wind_SpeedKm_h,Min_Dew_PointC,Min_Humidity,Min_Sea_Level_PressurehPa,Min_TemperatureC,Min_VisibilitykM,Precipitationmm
0,326,9643,17130,2770,1000.0,1,100.0,1032,2,19.0,...,1030,1,11.0,16,-2,70,1029,1,6.0,0.0
1,326,9643,17130,2770,1000.0,0,87.0,1030,5,23.0,...,1027,3,13.0,10,-2,58,1025,1,10.0,0.0
2,326,9643,17130,2770,1000.0,0,81.0,1026,4,31.0,...,1024,3,11.0,10,-3,55,1023,2,8.0,0.0
3,326,9643,17130,2770,1000.0,-3,80.0,1027,8,31.0,...,1024,3,15.0,10,-6,25,1022,-1,10.0,0.0
4,326,9643,17130,2770,1000.0,-2,93.0,1024,11,31.0,...,1020,4,22.0,11,-4,26,1016,-2,16.0,0.0


Principal component analysis is affected by attribute scale so we normalize all the attributes by eliminating the mean and scaling to unit variance.

In [5]:
x = StandardScaler().fit_transform(numeric_variables)

In [6]:
n_components = 10
column_names = []
for i in range (0, n_components):
    col_name = "component {}".format(i+1);
    column_names.append(col_name)

In [7]:
pca = PCA(n_components)
new_data = pca.fit_transform(x)
pca_all_data = pd.DataFrame(data = new_data, 
                        columns = column_names)

In [8]:
print("Explained Variance")
for i in range (0, n_components):
    string = "component {} {}".format(i+1, pca.explained_variance_ratio_[i])
    print(string)
print("   Total Explained Variance %3.2f"%sum(pca.explained_variance_ratio_))

Explained Variance
component 1 0.2713414343178465
component 2 0.15425379418974533
component 3 0.14290450064912327
component 4 0.09116742396572779
component 5 0.06351193121047954
component 6 0.051037064820285805
component 7 0.04285431461052587
component 8 0.040334146790142966
component 9 0.031261551396593826
component 10 0.029473814101129567
   Total Explained Variance 0.92


In [9]:
print("Components")
for i,c in enumerate(pca.components_):
    print("Component %d\t%s"%(i,str(c)))

Components
Component 0	[ 0.00524303  0.0614998   0.01445104  0.0066605  -0.00211693 -0.35217793
  0.12209576  0.02523689 -0.38489073 -0.13264459  0.02030514 -0.34702633
  0.22529865 -0.00256972 -0.3925278  -0.17975039  0.06730438 -0.33264421
  0.23530813 -0.02080838 -0.36817666 -0.16018428 -0.02077574]
Component 1	[-0.01416014 -0.04950546 -0.03079082 -0.01653942  0.00071035 -0.19813281
 -0.27523409  0.32682057 -0.00078367  0.10338633 -0.07861828 -0.21510295
 -0.332584    0.33261562 -0.06163204  0.2969181  -0.05966381 -0.22411662
 -0.29551112  0.3204472  -0.13879509  0.28506613 -0.23158764]
Component 2	[-0.02671798 -0.10460026 -0.02292548 -0.00122493  0.01101013 -0.11934613
 -0.26539922 -0.35368643 -0.05145331  0.18952337  0.33590308 -0.13627819
 -0.20155761 -0.38389961 -0.04816683  0.22153717  0.34291208 -0.14083967
 -0.14695343 -0.39331621 -0.04329435  0.22037668  0.06627396]
Component 3	[-0.20688024  0.54096269  0.46706324  0.57780419  0.08977412  0.01758757
 -0.02709681  0.03024732 

In [10]:
pca_all_data.head(5)

Unnamed: 0,component 1,component 2,component 3,component 4,component 5,component 6,component 7,component 8,component 9,component 10
0,3.24388,1.701923,-1.911056,0.158195,-1.623565,0.047775,1.892706,-0.784457,-0.523832,0.396982
1,2.191409,2.77604,-1.007928,0.030726,-0.396286,-0.323661,1.87261,-0.723922,-0.090476,-0.079121
2,2.091733,2.563472,-0.060951,-0.001038,-0.706405,-0.316452,1.861476,-0.794191,0.082356,0.168332
3,1.757692,4.219805,0.745435,0.084325,0.078918,-0.511495,1.892931,-0.720817,0.996274,0.512124
4,1.358421,3.553073,1.406075,0.160967,0.226484,-1.183458,2.134629,-0.061236,-0.134542,0.641845


In [37]:
test_offset = train.shape[0]
trainPCA = pca_all_data.iloc[0:test_offset,:]
trainPCA['NumberOfSales'] = train["NumberOfSales"]
testPCA = pca_all_data.iloc[test_offset:pca_all_data.shape[0],:]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until


In [39]:
trainPCA.to_csv("TrainClean10PCA.csv")
testPCA.to_csv("TestClean10PCA.csv")