In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import scipy.stats as stats # If want to run stat tests
import pandas_profiling   # If want to come up with a pandas profiling report

from matplotlib.backends.backend_pdf import PdfPages # Optional : If plan to export graphs as pdf

from sklearn.cluster import KMeans # For running a K means algorithm
from sklearn.metrics import silhouette_score # For finding the silhouette_score 
from sklearn.preprocessing import StandardScaler, MinMaxScaler # For Standardization
from sklearn.decomposition import PCA   # For running PCA

from sklearn import metrics

In [3]:
telco = pd.read_csv("telco_csv.csv")

In [4]:
#Handling Outliers

def outlier_capping(x):
    x = x.clip_upper(x.quantile(0.99))
    x = x.clip_lower(x.quantile(0.01))
    return x

telco_new=telco.apply(lambda x: outlier_capping(x))

  after removing the cwd from sys.path.
  """


In [5]:
telco_new = pd.get_dummies(telco_new, columns=['region'], drop_first=True, prefix='region')

telco_new = pd.get_dummies(telco_new, columns=['custcat'], drop_first=True, prefix='cust_cat')

In [11]:
# Initialize the StandardScaler model

sc = StandardScaler()

# Fit the model on the train dataset

std_model = sc.fit(telco_new)

# Perfrom Transformation using the model object

telco_scaled = std_model.transform(telco_new)

telco_scaled ## The output is not a DataFrame

pd.DataFrame(telco_scaled).head(5) ## No columns names

telco_scaled = pd.DataFrame(telco_scaled, columns=telco_new.columns)

# Converting the Scaled output into a df and providing the column names

In [13]:
telco_scaled.shape

(1000, 33)

In [15]:
pca_model = PCA(n_components=33)

In [16]:
pca_model = pca_model.fit(telco_scaled)

In [17]:
# Amount of variance explained by each Principal Compoenet aka Eigen Value

pca_model.explained_variance_

array([7.37594377, 5.3818386 , 3.40079648, 1.84413613, 1.53395795,
       1.24065502, 1.18417401, 1.07621638, 1.00474643, 0.88746091,
       0.69579606, 0.64912595, 0.62999801, 0.59740224, 0.5328508 ,
       0.49399359, 0.46546653, 0.45715771, 0.42521153, 0.4072122 ,
       0.3723491 , 0.35726153, 0.33375878, 0.30301828, 0.26756922,
       0.23446072, 0.21730833, 0.19970755, 0.18901051, 0.11533482,
       0.0680384 , 0.06405373, 0.02702175])

In [18]:
np.cumsum(pca_model.explained_variance_ratio_)

array([0.22328993, 0.38621287, 0.48916425, 0.54499128, 0.59142837,
       0.62898638, 0.66483456, 0.69741456, 0.72783098, 0.75469684,
       0.77576049, 0.7954113 , 0.81448306, 0.83256805, 0.8486989 ,
       0.86365343, 0.87774437, 0.89158378, 0.9044561 , 0.91678352,
       0.92805554, 0.93887082, 0.94897461, 0.9581478 , 0.96624785,
       0.97334562, 0.97992413, 0.98596983, 0.99169169, 0.99518319,
       0.9972429 , 0.99918198, 1.        ])

- We consider the Principal Component = 10 to be the best value of PC

In [19]:
pca_model = PCA(n_components=10) # initalize the model

pca_model = pca_model.fit(telco_scaled) # fit the model

In [20]:
pca_model.explained_variance_

array([7.37594377, 5.3818386 , 3.40079648, 1.84413593, 1.53395013,
       1.24060467, 1.18412156, 1.07617706, 1.0047033 , 0.88736921])

In [21]:
np.cumsum(pca_model.explained_variance_ratio_)

array([0.22328993, 0.38621287, 0.48916425, 0.54499127, 0.59142813,
       0.62898462, 0.6648312 , 0.69741002, 0.72782513, 0.75468821])

In [22]:
reduced_cr=pca_model.transform(telco_scaled)

In [24]:
dimensions = pd.DataFrame(reduced_cr)

In [26]:
dimensions.columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']

In [27]:
dimensions.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
0,-2.26624,-0.088029,-0.980796,0.309196,1.511157,0.627602,0.391339,-0.950629,0.873658,-0.613191
1,3.456679,-0.773759,-2.418446,1.864013,-1.258068,2.196285,0.550412,-1.014846,0.558347,1.234956
2,0.306759,4.374563,0.098687,0.43232,-1.581057,0.655989,-0.407108,0.334168,-0.462745,-1.134717
3,-3.493635,-0.371592,-0.897152,-1.111257,1.541956,0.302072,0.676718,-0.30391,-1.324545,0.487135
4,-1.943231,0.746403,-2.963879,1.314854,1.620612,0.15493,-0.164593,0.665599,1.223219,-0.049443


In [29]:
pca_model.components_.T

array([[ 7.41177889e-02,  2.27728931e-01,  3.40386014e-01,
         1.57078053e-01, -1.64624771e-02, -8.09487885e-02,
        -1.75739394e-02, -6.99750260e-02, -3.28374832e-02,
         4.87874529e-02],
       [ 5.39364889e-02,  2.42568778e-01,  2.71948321e-01,
        -2.13383013e-01,  2.70347535e-02,  1.89467997e-01,
         9.37103849e-02,  1.70303358e-01,  1.69939908e-01,
        -6.85586623e-02],
       [ 2.66771542e-02,  6.52340084e-03,  2.86298758e-02,
         5.87618394e-01, -5.88208323e-03,  2.71329614e-01,
         4.83957808e-02,  2.33198902e-01,  1.74860161e-01,
        -1.15897312e-01],
       [ 4.28509565e-02,  2.18137424e-01,  2.71667871e-01,
        -1.31408285e-01,  2.03200182e-02,  5.54454284e-02,
         1.13651700e-01,  1.15987622e-01,  1.27601638e-01,
        -7.82347893e-03],
       [ 9.93549586e-02,  1.21553020e-01,  1.67837030e-01,
        -1.36483840e-01,  1.69058035e-02,  4.70282885e-01,
        -4.94211857e-01, -1.44604685e-01, -6.95176197e-03,
        -7.

In [30]:
np.sqrt(pca_model.explained_variance_)

array([2.71586888, 2.31987901, 1.84412485, 1.35798967, 1.23852741,
       1.11382435, 1.0881735 , 1.03738954, 1.00234889, 0.94200277])

In [31]:
#variable reduction
Loadings =  pd.DataFrame((pca_model.components_.T * np.sqrt(pca_model.explained_variance_)).T,columns=telco_new.columns).T

In [33]:
Loadings.columns = ['PC1', 'PC2', 'PC3', 'PC4', 'PC5', 'PC6', 'PC7', 'PC8', 'PC9', 'PC10']

In [35]:
Loadings.head()

Unnamed: 0,PC1,PC2,PC3,PC4,PC5,PC6,PC7,PC8,PC9,PC10
tenure,0.201294,0.528304,0.627714,0.21331,-0.020389,-0.090163,-0.019123,-0.072591,-0.032915,0.045958
age,0.146484,0.56273,0.501507,-0.289772,0.033483,0.211034,0.101973,0.176671,0.170339,-0.064582
marital,0.072452,0.015134,0.052797,0.79798,-0.007285,0.302214,0.052663,0.241918,0.175271,-0.109176
address,0.116378,0.506052,0.500989,-0.178451,0.025167,0.061756,0.123673,0.120324,0.127901,-0.00737
income,0.269835,0.281988,0.309512,-0.185344,0.020938,0.523813,-0.537788,-0.150011,-0.006968,-0.0712


In [36]:
Loadings.to_excel('Loadings.xlsx')