#### Import relevant libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

#### Load dataset

In [2]:
# Load the campaign data; the path is relative to the notebook's working directory.
marketing_data = pd.read_csv("data/marketing_campaign.csv")

#### Subset for relevant columns

In [3]:
# Keep only the Mnt* and Num* columns that feed the scaling and PCA steps below.
feature_columns = [
    'MntWines', 'MntFruits', 'MntMeatProducts', 'MntFishProducts',
    'MntSweetProducts', 'MntGoldProds',
    'NumDealsPurchases', 'NumWebPurchases', 'NumCatalogPurchases',
    'NumStorePurchases', 'NumWebVisitsMonth',
]
# .copy() makes the subset an independent frame, so later in-place
# modifications do not trigger pandas' SettingWithCopyWarning.
marketing_data = marketing_data[feature_columns].copy()

#### Inspect the first 5 rows, shape, and data types of the dataset

In [4]:
# Preview the first five rows of the subset.
marketing_data.head(n=5)

Unnamed: 0,MntWines,MntFruits,MntMeatProducts,MntFishProducts,MntSweetProducts,MntGoldProds,NumDealsPurchases,NumWebPurchases,NumCatalogPurchases,NumStorePurchases,NumWebVisitsMonth
0,635,88,546,172,88,88,3,8,10,4,7
1,11,1,6,2,1,6,2,1,1,2,5
2,426,49,127,111,21,42,1,8,2,10,4
3,11,4,20,10,3,5,2,2,0,4,6
4,173,43,118,46,27,15,5,5,3,6,5


In [5]:
# (rows, columns) of the subset frame.
marketing_data.shape

(2240, 11)

In [6]:
# Per-column dtypes: confirm every selected column is numeric before scaling.
marketing_data.dtypes

MntWines               int64
MntFruits              int64
MntMeatProducts        int64
MntFishProducts        int64
MntSweetProducts       int64
MntGoldProds           int64
NumDealsPurchases      int64
NumWebPurchases        int64
NumCatalogPurchases    int64
NumStorePurchases      int64
NumWebVisitsMonth      int64
dtype: object

#### Check for missing values and remove them

In [7]:
# Number of missing entries in each column.
marketing_data.isna().sum()

MntWines               0
MntFruits              0
MntMeatProducts        0
MntFishProducts        0
MntSweetProducts       0
MntGoldProds           0
NumDealsPurchases      0
NumWebPurchases        0
NumCatalogPurchases    0
NumStorePurchases      0
NumWebVisitsMonth      0
dtype: int64

In [8]:
# Reassign instead of using inplace=True: marketing_data was produced by column
# selection, and mutating such a frame in place can raise pandas'
# SettingWithCopyWarning. Rows containing any missing value are removed.
marketing_data = marketing_data.dropna()

In [9]:
# Re-check the dimensions after the missing-value removal above.
marketing_data.shape

(2240, 11)

#### Scale the data

In [10]:
# Standardise every feature so columns on larger numeric scales do not dominate the PCA.
# to_numpy() is the pandas-recommended replacement for the .values attribute.
x = marketing_data.to_numpy()
# Keep a handle on the fitted scaler so the same transformation can later be
# reused (transform on new rows) or undone (inverse_transform).
marketing_scaler = StandardScaler()
marketing_data_scaled = marketing_scaler.fit_transform(x)

#### Apply PCA to the dataset

In [11]:
# Fit PCA on the standardised features and project them onto the components.
pca_marketing = PCA(
    n_components=6,  # number of principal components to keep
    random_state=1,  # fixed seed for reproducible results
)
principalComponents_marketing = pca_marketing.fit_transform(
    marketing_data_scaled
)

In [12]:
# Wrap the component scores in a DataFrame. The column labels are generated from
# the number of columns PCA actually returned, so they stay in sync with
# n_components instead of being hard-coded for exactly six components.
component_labels = [
    'principal component {}'.format(i)
    for i in range(1, principalComponents_marketing.shape[1] + 1)
]
principal_marketing_data = pd.DataFrame(
    data=principalComponents_marketing,
    columns=component_labels,
)
principal_marketing_data

Unnamed: 0,principal component 1,principal component 2,principal component 3,principal component 4,principal component 5,principal component 6
0,3.800461,0.572973,1.254630,1.083547,0.274886,2.368660
1,-2.175610,-0.928702,-0.117578,0.292224,0.323580,-0.105413
2,1.501507,0.123894,0.096791,-0.992810,-1.071276,-0.602728
3,-2.016701,-0.518668,0.025703,0.070743,-0.181590,-0.227872
4,-0.044173,0.763401,0.238572,1.149119,-0.334696,-0.495866
...,...,...,...,...,...,...
2235,2.660651,1.308848,2.151732,-2.178308,1.453732,0.526411
2236,-1.063664,2.738997,-0.463307,0.821222,-0.336517,-0.070777
2237,1.130411,0.004491,-1.519866,-0.539346,-0.947537,-0.493189
2238,1.749883,0.079894,-0.509966,-0.305776,-0.035570,-0.746510
