# Python Lab 04a: Introduction to Scikit-Learn (PCA, Kmeans, etc.) and to Pandas

## Francesco Della Santa, Computational Linear Algebra for Large Scale Problems, Politecnico di Torino

In [3]:
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler


## Initialize PCA objects

### Parameters:
n_component: the number of component you want <ul>
<li>can be an integer, </li>
<li>can be none for all component </li>
<li>can be a float value x to save x percent or more of explained variance (in percentage)</li>
</ul>
The solver indicates which solver can be used for the svd decomposition: (svd_solver)
<ul>
<li>'auto': the full svd is computed if and only if the min dimension is lower than 500 elements, otherwise uses randomized svd</li>
<li>'full': the pca computes the full svd</li>
<li>'randomized': computes the randomized svd</li>
<li>'truncated': you use this when you know the number of pca wanted</li>
</ul>




In [None]:
pca_general = PCA() # computes all PCA
pca_perc = PCA(n_components=0.5, svd_solver='full')  # computes pca to explain 50% of the explained variance
pca_ncomp = PCA(n_components=7) # only creates the first 7 pca

### And let's use them...

you obtain the pca matrix using the fit method which creates the matrix: <br>
The pca centers the data using the mean $\mu$ and the variance.
<br>
will create the matrix Pm to perform the first m PC, <br>
Also the explained variance and the eigenvalues are saved (also as a percentage). <br>
<br>
<b>Transform</b> will transform the data given rescaling them by the mean and then computes the PCA applied to the data matrix returning the new matrix.
<b>Inverse Transform</b>: will inverse transform the data from the pca matrix to the initial matrix 

In [None]:
N, n = 1000, 100
S = np.random.rand(N, n)

# Start using the pca object
pca_ncomp.fit(S)
pca_perc.fit(S)

# Representation of S in the m-dim space of PCs
Qm_ncomp = pca_ncomp.transform(S)
Qm_perc = pca_perc.transform(S)

# Approximation of S obtained using m PCs
Stilde_ncomp = pca_ncomp.inverse_transform(Qm_ncomp)
Stilde_perc = pca_perc.inverse_transform(Qm_perc)

print('*********************** DATASET S ***********************')
display(S)
print('*********************************************************')
print('')
print(f'*********************** DATASET PROJECTED (ncomp: {pca_ncomp.n_components_} PCs) ***********************')
display(Qm_ncomp)
print('*********************************************************************************************************')
print('')
print(f'*********************** DATASET PROJECTED (perc: {pca_perc.n_components_} PCs) ************************')
display(Qm_perc)
print('********************************************************************************************************')
print('')
print('*********************** RECOVERED DATASET S (ncomp) ***********************')
display(Stilde_ncomp)
print('***************************************************************************')
print('')
print('*********************** RECOVERED DATASET S (perc) ***********************')
display(Stilde_perc)
print('**************************************************************************')

*********************** DATASET S ***********************


array([[0.40936818, 0.09320249, 0.57537812, ..., 0.96595136, 0.94303054,
        0.50849994],
       [0.57280536, 0.91323463, 0.90188807, ..., 0.40806125, 0.11022807,
        0.43731257],
       [0.74159556, 0.70002704, 0.79311179, ..., 0.48669656, 0.55638231,
        0.75534397],
       ...,
       [0.91667107, 0.35880506, 0.64589064, ..., 0.52542091, 0.33460778,
        0.31873929],
       [0.16336167, 0.00734295, 0.92095168, ..., 0.93186668, 0.27058155,
        0.96089828],
       [0.23308429, 0.05381277, 0.98827004, ..., 0.17953776, 0.38574099,
        0.60653913]])

*********************************************************

*********************** DATASET PROJECTED (ncomp: 7 PCs) ***********************


array([[-0.40937743, -0.26611326, -0.40609986, ...,  0.37950042,
         0.19493742, -0.51249813],
       [ 0.42666812,  0.04381765,  0.22389823, ..., -0.65733858,
         0.12929418, -0.13721852],
       [ 0.02676388,  0.34981673, -0.18429466, ...,  0.15043274,
        -0.01054024, -0.54434622],
       ...,
       [ 0.58224033, -0.62356961, -0.04487258, ...,  0.19014883,
         0.09729572, -0.00117706],
       [ 0.62787018,  0.26823698, -0.3712405 , ...,  0.23386477,
        -0.03370365, -0.17875339],
       [-0.25675158,  0.44556844,  0.28468298, ..., -0.04095999,
         0.30127984,  0.27345359]])

*********************************************************************************************************

*********************** DATASET PROJECTED (perc: 38 PCs) ************************


array([[-0.40937743, -0.26611326, -0.40609986, ...,  0.18820616,
        -0.78960675,  0.01140831],
       [ 0.42666812,  0.04381765,  0.22389823, ..., -0.07804432,
         0.07099695,  0.61449283],
       [ 0.02676388,  0.34981673, -0.18429466, ...,  0.2419083 ,
         0.19820794,  0.20089049],
       ...,
       [ 0.58224033, -0.62356961, -0.04487258, ...,  0.11321253,
         0.11260971, -0.21693969],
       [ 0.62787018,  0.26823698, -0.3712405 , ..., -0.0049759 ,
        -0.30290133, -0.01616229],
       [-0.25675158,  0.44556844,  0.28468298, ...,  0.49297435,
        -0.23160452,  0.00326047]])

********************************************************************************************************

*********************** RECOVERED DATASET S (ncomp) ***********************


array([[0.45785966, 0.29378528, 0.52510831, ..., 0.51045078, 0.66272543,
        0.53322153],
       [0.73269816, 0.45725428, 0.56529887, ..., 0.37308305, 0.51532669,
        0.36964062],
       [0.40170563, 0.58541985, 0.67854428, ..., 0.44955675, 0.40473417,
        0.61522569],
       ...,
       [0.76081662, 0.31614223, 0.40424648, ..., 0.43739715, 0.57559219,
        0.48034855],
       [0.56645253, 0.54473555, 0.6158926 , ..., 0.40526306, 0.42096171,
        0.66906413],
       [0.42623915, 0.6412378 , 0.45071894, ..., 0.59140196, 0.47352894,
        0.43096693]])

***************************************************************************

*********************** RECOVERED DATASET S (perc) ***********************


array([[0.48195631, 0.18270689, 0.55920419, ..., 0.59202694, 0.55718914,
        0.65594237],
       [0.54881227, 0.86371613, 0.71689015, ..., 0.33725961, 0.27945703,
        0.48276414],
       [0.87887106, 0.89958813, 0.71296122, ..., 0.63022463, 0.49571213,
        0.66349043],
       ...,
       [0.86935172, 0.367208  , 0.47088281, ..., 0.43325331, 0.51538368,
        0.42250968],
       [0.3499021 , 0.24654321, 0.89494775, ..., 0.67705348, 0.2805559 ,
        0.92212927],
       [0.39530349, 0.19595388, 0.63564172, ..., 0.25342957, 0.43922659,
        0.67253674]])

**************************************************************************


In [4]:
display(pca_ncomp.explained_variance_ratio_)

array([0.01706731, 0.0166428 , 0.01615093, 0.01544766, 0.01529891,
       0.01518226, 0.01501369])

## Initialize the Standard Scaler

StandardScaler() is used for normalization with regards to the standard deviation and the mean

In [11]:
scaler_recent = StandardScaler(with_std=False)
scaler_znorm = StandardScaler()
# Start using the scaler objects
scaler_recent.fit(S)
scaler_znorm.fit(S)
# recentered S
Sbar = scaler_recent.transform(S)
# standardized S
Shat = scaler_znorm.transform(S)

print(f'*********************** DATASET RECENTERED ***********************')
print('SAMPLE MEAN OF RECENTERED DATA:')
display(Sbar.mean(axis=1)[:10])
print('SAMPLE ST.DEV. OF RECENTERED DATA:')
display(Sbar.std(axis=1)[:10])
print('')
display(Sbar)
print('*******************************************************************')
print('')
print(f'*********************** DATASET STANDARDIZED ***********************')
print('SAMPLE MEAN OF STANDARDIZED DATA:')
display(Shat.mean(axis=1)[:10])
print('SAMPLE ST.DEV. OF STANDARDIZED DATA:')
display(Shat.std(axis=1)[:10])
print('')
display(Shat)
print('**********************************************************************')

*********************** DATASET RECENTERED ***********************
SAMPLE MEAN OF RECENTERED DATA:


array([ 0.00534408,  0.00037233,  0.01381404,  0.02089265, -0.01204571,
        0.04691859, -0.02392384, -0.02242484, -0.0224381 ,  0.00634499])

SAMPLE ST.DEV. OF RECENTERED DATA:


array([0.30786594, 0.28867339, 0.2636256 , 0.28233008, 0.28355555,
       0.26244561, 0.28225062, 0.27823944, 0.27164394, 0.28984261])




array([[-0.0899163 , -0.40906994,  0.07443991, ...,  0.47274189,
         0.44731128,  0.02580004],
       [ 0.07352088,  0.4109622 ,  0.40094985, ..., -0.08514822,
        -0.38549119, -0.04538733],
       [ 0.24231108,  0.19775461,  0.29217358, ..., -0.00651291,
         0.06066306,  0.27264406],
       ...,
       [ 0.41738659, -0.14346737,  0.14495243, ...,  0.03221144,
        -0.16111147, -0.16396062],
       [-0.33592281, -0.49492949,  0.42001346, ...,  0.43865721,
        -0.2251377 ,  0.47819838],
       [-0.26620019, -0.44845967,  0.48733183, ..., -0.31367171,
        -0.10997827,  0.12383922]])

*******************************************************************

*********************** DATASET STANDARDIZED ***********************
SAMPLE MEAN OF STANDARDIZED DATA:


array([ 1.95024870e-02, -8.20676023e-05,  4.77401737e-02,  7.14221943e-02,
       -4.24780029e-02,  1.62224275e-01, -8.18754343e-02, -7.89526111e-02,
       -7.64891015e-02,  2.27108253e-02])

SAMPLE ST.DEV. OF STANDARDIZED DATA:


array([1.06529283, 1.00031263, 0.9131252 , 0.97937983, 0.98059979,
       0.90765667, 0.97665009, 0.96600687, 0.94225995, 1.00624478])




array([[-0.31661275, -1.38311068,  0.25486926, ...,  1.66891531,
         1.55079483,  0.08957458],
       [ 0.25888132,  1.38950861,  1.37278232, ..., -0.30059778,
        -1.33646919, -0.15757929],
       [ 0.8532244 ,  0.66863018,  1.00035134, ..., -0.02299244,
         0.21031429,  0.94658692],
       ...,
       [ 1.46969928, -0.48507903,  0.49629181, ...,  0.11371568,
        -0.55856146, -0.56925126],
       [-1.18284948, -1.67341129,  1.4380528 , ...,  1.54858654,
        -0.78053562,  1.66024643],
       [-0.93734257, -1.5162917 ,  1.66853912, ..., -1.10735165,
        -0.38128645,  0.42995468]])

**********************************************************************


### Let's apply the PCA to standardized data

In [12]:
pca = PCA(n_components=7)

# Start with PCA
pca.fit(Shat)
Qm = pca.transform(Shat)

# Recovering of Shat_tilde
Shat_tilde = pca.inverse_transform(Qm)

# Recovering of S_tilde
S_tilde = scaler_znorm.inverse_transform(Shat_tilde)

print('*********************** RECOVERED DATASET Shat ***********************')
display(Shat_tilde)
print('**********************************************************************')
print('*********************** RECOVERED DATASET S ***********************')
display(Shat)
print('*******************************************************************')

*********************** RECOVERED DATASET Shat ***********************


array([[-0.23464222, -0.6299307 , -0.01333154, ..., -0.01843686,
         0.6271358 ,  0.29260957],
       [ 0.83132994, -0.16760574,  0.110843  , ..., -0.43523653,
         0.12905152, -0.31947582],
       [-0.38747436,  0.22250918,  0.41307938, ..., -0.15495601,
        -0.34365703,  0.52440816],
       ...,
       [ 0.9865518 , -0.47062231, -0.30190596, ..., -0.26109109,
         0.3127003 , -0.01578982],
       [ 0.22237577,  0.08254034,  0.42007762, ..., -0.42397078,
        -0.23361445,  0.65227072],
       [-0.27948591,  0.39537447, -0.1342625 , ...,  0.43758571,
        -0.08486184, -0.21542825]])

**********************************************************************
*********************** RECOVERED DATASET S ***********************


array([[-0.31661275, -1.38311068,  0.25486926, ...,  1.66891531,
         1.55079483,  0.08957458],
       [ 0.25888132,  1.38950861,  1.37278232, ..., -0.30059778,
        -1.33646919, -0.15757929],
       [ 0.8532244 ,  0.66863018,  1.00035134, ..., -0.02299244,
         0.21031429,  0.94658692],
       ...,
       [ 1.46969928, -0.48507903,  0.49629181, ...,  0.11371568,
        -0.55856146, -0.56925126],
       [-1.18284948, -1.67341129,  1.4380528 , ...,  1.54858654,
        -0.78053562,  1.66024643],
       [-0.93734257, -1.5162917 ,  1.66853912, ..., -1.10735165,
        -0.38128645,  0.42995468]])

*******************************************************************


## Initialize $k$-Means

### Parameters:
The number of cluster (to be specified: by default 8), must be selected at first<br>
init argument: you can set many different starting vectors, they are generated randomly or can be even given the data or a function to generate them <br>
n_init: how many times the Kmean is executed wrt different initialization points <br>
max_iter: the number of maximum iteration <br>
tol: the tolerance for stopping criteria <br>
random_state: important for replicability (random seed generation) <br>
algorithm: there can be different algorithm to use (ex elkan or lloyd) 

In [13]:
kmeans_default = KMeans()
kmeans_3c = KMeans(n_clusters=3, init='random', algorithm='lloyd')

W0 = np.random.rand(3, 10)
kmeans_3cW0 = KMeans(n_clusters=3, init=W0, algorithm='lloyd')

### And let's use one of them...

In [14]:
Snew = np.random.rand(N, n)

km = kmeans_3c  # change the KMeans object here if you want to try another one

# Start using the km object, fitting it on the data S
km.fit(S)

# Prediction of cluster belonging w.r.t. S
S_labels = km.labels_

# Prediction of cluster belonging w.r.t. Snew
Snew_labels = km.predict(Snew)

print(f'*********************** S labels ***********************')
display(S_labels[:10])
print('*********************************************************')
print('')
print(f'*********************** Snew labels ***********************')
display(Snew_labels[:10])
print('************************************************************')

*********************** S labels ***********************


array([0, 2, 2, 0, 2, 2, 1, 1, 0, 0], dtype=int32)

*********************************************************

*********************** Snew labels ***********************


array([0, 1, 1, 2, 0, 2, 1, 0, 1, 0], dtype=int32)

************************************************************


## Initialize Serieses

### Using Arrays

In [15]:
x = np.random.rand(10)
x

array([0.51513492, 0.55963512, 0.53185097, 0.24593504, 0.17933056,
       0.60014438, 0.17106858, 0.69983748, 0.25105184, 0.94877271])

In [16]:
s1 = pd.Series(x, index=[f'index{i}' for i in range(1,11)], name='my_series1')
s2 = pd.Series(x, name='my_series2')

In [17]:
s1 

index1     0.515135
index2     0.559635
index3     0.531851
index4     0.245935
index5     0.179331
index6     0.600144
index7     0.171069
index8     0.699837
index9     0.251052
index10    0.948773
Name: my_series1, dtype: float64

In [18]:
s1['index1'] = 0

In [19]:
x

array([0.        , 0.55963512, 0.53185097, 0.24593504, 0.17933056,
       0.60014438, 0.17106858, 0.69983748, 0.25105184, 0.94877271])

In [20]:
s2

0    0.000000
1    0.559635
2    0.531851
3    0.245935
4    0.179331
5    0.600144
6    0.171069
7    0.699837
8    0.251052
9    0.948773
Name: my_series2, dtype: float64

### Using Dictionaries

In [21]:
d = {'Age':30, 'Height':185, 'Weight':90}
d

{'Age': 30, 'Height': 185, 'Weight': 90}

In [22]:
s1d = pd.Series(d)

In [23]:
s1d

Age        30
Height    185
Weight     90
dtype: int64

## Initialize DataFrames

### Using Dictionaries

In [24]:
D = {'Float_random':np.random.rand(10), 'Integer_random':np.random.permutation(10)}
D

{'Float_random': array([0.81922945, 0.00878946, 0.30267626, 0.01557354, 0.51682395,
        0.83181199, 0.98668088, 0.66344031, 0.20693318, 0.38770378]),
 'Integer_random': array([8, 3, 7, 5, 1, 2, 0, 6, 9, 4], dtype=int32)}

In [25]:
df1d = pd.DataFrame(D)
df1d

Unnamed: 0,Float_random,Integer_random
0,0.819229,8
1,0.008789,3
2,0.302676,7
3,0.015574,5
4,0.516824,1
5,0.831812,2
6,0.986681,0
7,0.66344,6
8,0.206933,9
9,0.387704,4


In [26]:
df1d.dtypes

Float_random      float64
Integer_random      int32
dtype: object

### Using Arrays

In [27]:
X = np.random.rand(10,5)
X

array([[0.58540722, 0.15425591, 0.35273849, 0.9785178 , 0.21466462],
       [0.16959672, 0.69741365, 0.01555778, 0.74767698, 0.33614261],
       [0.57812177, 0.22349622, 0.72779877, 0.30738751, 0.10307989],
       [0.65739025, 0.82900994, 0.8950966 , 0.59244314, 0.62186875],
       [0.25784866, 0.75683938, 0.23062865, 0.98044461, 0.31268443],
       [0.98576568, 0.25717255, 0.6657422 , 0.05317074, 0.72783585],
       [0.40264159, 0.43587433, 0.90330786, 0.35234904, 0.92420414],
       [0.19019985, 0.93275091, 0.15924309, 0.33503523, 0.80996114],
       [0.31474585, 0.9148154 , 0.56711042, 0.60085817, 0.26034682],
       [0.90671481, 0.662426  , 0.38140756, 0.85384391, 0.08611771]])

In [28]:
df1 = pd.DataFrame(X, index=range(1, X.shape[0] + 1), columns=[f'column_{i}' for i in range(1, X.shape[1] + 1)])
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5
1,0.585407,0.154256,0.352738,0.978518,0.214665
2,0.169597,0.697414,0.015558,0.747677,0.336143
3,0.578122,0.223496,0.727799,0.307388,0.10308
4,0.65739,0.82901,0.895097,0.592443,0.621869
5,0.257849,0.756839,0.230629,0.980445,0.312684
6,0.985766,0.257173,0.665742,0.053171,0.727836
7,0.402642,0.435874,0.903308,0.352349,0.924204
8,0.1902,0.932751,0.159243,0.335035,0.809961
9,0.314746,0.914815,0.56711,0.600858,0.260347
10,0.906715,0.662426,0.381408,0.853844,0.086118


## Extract/Add Column

In [29]:
df1['column_2']

1     0.154256
2     0.697414
3     0.223496
4     0.829010
5     0.756839
6     0.257173
7     0.435874
8     0.932751
9     0.914815
10    0.662426
Name: column_2, dtype: float64

In [30]:
df1['column_6'] = np.random.rand(10)
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.585407,0.154256,0.352738,0.978518,0.214665,0.988534
2,0.169597,0.697414,0.015558,0.747677,0.336143,0.725114
3,0.578122,0.223496,0.727799,0.307388,0.10308,0.431186
4,0.65739,0.82901,0.895097,0.592443,0.621869,0.801384
5,0.257849,0.756839,0.230629,0.980445,0.312684,0.362059
6,0.985766,0.257173,0.665742,0.053171,0.727836,0.128897
7,0.402642,0.435874,0.903308,0.352349,0.924204,0.645924
8,0.1902,0.932751,0.159243,0.335035,0.809961,0.053032
9,0.314746,0.914815,0.56711,0.600858,0.260347,0.673258
10,0.906715,0.662426,0.381408,0.853844,0.086118,0.497009


## DataFrame Attributes

We use the DataFrame df1 defined above.

In [34]:
df1.at[6, 'column_2']

np.float64(0.25717255268044004)

In [26]:
df1.iat[5, 1]

0.09607329873473758

In [27]:
df1.index

RangeIndex(start=1, stop=11, step=1)

In [28]:
df1.columns

Index(['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6'], dtype='object')

In [29]:
df1.axes

[RangeIndex(start=1, stop=11, step=1),
 Index(['column_1', 'column_2', 'column_3', 'column_4', 'column_5', 'column_6'], dtype='object')]

In [30]:
df1.loc[[1,7,10], :]

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.999094,0.647682,0.031766,0.390485,0.004092,0.023134
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [31]:
df1.loc[[1,7,10], ['column_1', 'column_3']]

Unnamed: 0,column_1,column_3
1,0.999094,0.031766
7,0.313547,0.075076
10,0.20931,0.97937


In [32]:
df1.iloc[[0,6,9],[0,2]]

Unnamed: 0,column_1,column_3
1,0.999094,0.031766
7,0.313547,0.075076
10,0.20931,0.97937


In [33]:
df1.loc[(df1.index > 3) & (df1.index <= 7), df1.columns != 'column_3']

Unnamed: 0,column_1,column_2,column_4,column_5,column_6
4,0.861012,0.551957,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.959145,0.590979,0.688038


In [34]:
df1.shape

(10, 6)

In [35]:
df1.ndim

2

In [36]:
df1.size

60

In [37]:
df1.values

array([[0.99909389, 0.64768242, 0.03176591, 0.39048473, 0.00409248,
        0.02313397],
       [0.50413484, 0.05074405, 0.13910772, 0.8948549 , 0.19483533,
        0.2827974 ],
       [0.78727386, 0.16430398, 0.94017413, 0.13657167, 0.41491704,
        0.50077079],
       [0.86101183, 0.55195654, 0.74982833, 0.71549127, 0.45377093,
        0.40345295],
       [0.63320454, 0.22403617, 0.69504948, 0.80948003, 0.52804335,
        0.46993146],
       [0.79848569, 0.0960733 , 0.72708292, 0.8099438 , 0.26473611,
        0.94514745],
       [0.31354707, 0.19216042, 0.07507637, 0.95914538, 0.59097892,
        0.68803845],
       [0.82938146, 0.91215625, 0.40943282, 0.67031839, 0.82330516,
        0.91840485],
       [0.25536228, 0.10967436, 0.40358192, 0.93015562, 0.15731178,
        0.5728634 ],
       [0.20930988, 0.96835213, 0.97936965, 0.70992882, 0.62037765,
        0.06813211]])

## DataFrame Methods

We use the DataFrame df1 defined above.

### Exploration Methods

In [38]:
df1.head(3)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,0.999094,0.647682,0.031766,0.390485,0.004092,0.023134
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771


In [39]:
df1.tail(2)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [40]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 1 to 10
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   column_1  10 non-null     float64
 1   column_2  10 non-null     float64
 2   column_3  10 non-null     float64
 3   column_4  10 non-null     float64
 4   column_5  10 non-null     float64
 5   column_6  10 non-null     float64
dtypes: float64(6)
memory usage: 612.0 bytes


In [41]:
df1.nunique()

column_1    10
column_2    10
column_3    10
column_4    10
column_5    10
column_6    10
dtype: int64

In [42]:
df1.nunique(axis=1)

1     6
2     6
3     6
4     6
5     6
6     6
7     6
8     6
9     6
10    6
dtype: int64

In [43]:
df1.isna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,False,False,False,False,False,False
2,False,False,False,False,False,False
3,False,False,False,False,False,False
4,False,False,False,False,False,False
5,False,False,False,False,False,False
6,False,False,False,False,False,False
7,False,False,False,False,False,False
8,False,False,False,False,False,False
9,False,False,False,False,False,False
10,False,False,False,False,False,False


In [44]:
df1.count()

column_1    10
column_2    10
column_3    10
column_4    10
column_5    10
column_6    10
dtype: int64

In [45]:
df1.value_counts()

column_1  column_2  column_3  column_4  column_5  column_6
0.209310  0.968352  0.979370  0.709929  0.620378  0.068132    1
0.255362  0.109674  0.403582  0.930156  0.157312  0.572863    1
0.313547  0.192160  0.075076  0.959145  0.590979  0.688038    1
0.504135  0.050744  0.139108  0.894855  0.194835  0.282797    1
0.633205  0.224036  0.695049  0.809480  0.528043  0.469931    1
0.787274  0.164304  0.940174  0.136572  0.414917  0.500771    1
0.798486  0.096073  0.727083  0.809944  0.264736  0.945147    1
0.829381  0.912156  0.409433  0.670318  0.823305  0.918405    1
0.861012  0.551957  0.749828  0.715491  0.453771  0.403453    1
0.999094  0.647682  0.031766  0.390485  0.004092  0.023134    1
Name: count, dtype: int64

In [46]:
df1.value_counts(normalize=True)

column_1  column_2  column_3  column_4  column_5  column_6
0.209310  0.968352  0.979370  0.709929  0.620378  0.068132    0.1
0.255362  0.109674  0.403582  0.930156  0.157312  0.572863    0.1
0.313547  0.192160  0.075076  0.959145  0.590979  0.688038    0.1
0.504135  0.050744  0.139108  0.894855  0.194835  0.282797    0.1
0.633205  0.224036  0.695049  0.809480  0.528043  0.469931    0.1
0.787274  0.164304  0.940174  0.136572  0.414917  0.500771    0.1
0.798486  0.096073  0.727083  0.809944  0.264736  0.945147    0.1
0.829381  0.912156  0.409433  0.670318  0.823305  0.918405    0.1
0.861012  0.551957  0.749828  0.715491  0.453771  0.403453    0.1
0.999094  0.647682  0.031766  0.390485  0.004092  0.023134    0.1
Name: proportion, dtype: float64

### Statistical Analysis (Basic) and Operations

In [47]:
df1.describe()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.619081,0.391714,0.515047,0.702637,0.405237,0.487267
std,0.281782,0.349195,0.352901,0.25746,0.24933,0.31333
min,0.20931,0.050744,0.031766,0.136572,0.004092,0.023134
25%,0.361194,0.123332,0.205226,0.680221,0.212311,0.312961
50%,0.710239,0.208098,0.552241,0.762486,0.434344,0.485351
75%,0.821658,0.623751,0.744142,0.873627,0.575245,0.659245
max,0.999094,0.968352,0.97937,0.959145,0.823305,0.945147


In [48]:
df1.describe(percentiles=[0.13, 0.87, 0.99])

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
count,10.0,10.0,10.0,10.0,10.0,10.0
mean,0.619081,0.391714,0.515047,0.702637,0.405237,0.487267
std,0.281782,0.349195,0.352901,0.25746,0.24933,0.31333
min,0.20931,0.050744,0.031766,0.136572,0.004092,0.023134
13%,0.265254,0.098385,0.085962,0.438056,0.163691,0.104625
50%,0.710239,0.208098,0.552241,0.762486,0.434344,0.485351
87%,0.855635,0.867196,0.907815,0.924154,0.61538,0.879243
99%,0.986667,0.963295,0.975842,0.956536,0.805042,0.942741
max,0.999094,0.968352,0.97937,0.959145,0.823305,0.945147


In [49]:
df1.mean()

column_1    0.619081
column_2    0.391714
column_3    0.515047
column_4    0.702637
column_5    0.405237
column_6    0.487267
dtype: float64

In [50]:
df1.mean(axis=1)

1     0.349376
2     0.344412
3     0.490669
4     0.622585
5     0.559958
6     0.606912
7     0.469824
8     0.760500
9     0.404825
10    0.592578
dtype: float64

In [51]:
df1.corr()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
column_1,1.0,0.133574,-0.010126,-0.586533,-0.166328,0.101891
column_2,0.133574,1.0,0.166417,-0.237617,0.472292,-0.2839
column_3,-0.010126,0.166417,1.0,-0.303855,0.356664,0.043391
column_4,-0.586533,-0.237617,-0.303855,1.0,0.099108,0.272884
column_5,-0.166328,0.472292,0.356664,0.099108,1.0,0.376828
column_6,0.101891,-0.2839,0.043391,0.272884,0.376828,1.0


In [52]:
df1.cov()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
column_1,0.079401,0.013143,-0.001007,-0.042552,-0.011686,0.008996
column_2,0.013143,0.121937,0.020508,-0.021363,0.04112,-0.031062
column_3,-0.001007,0.020508,0.124539,-0.027608,0.031383,0.004798
column_4,-0.042552,-0.021363,-0.027608,0.066286,0.006362,0.022014
column_5,-0.011686,0.04112,0.031383,0.006362,0.062166,0.029439
column_6,0.008996,-0.031062,0.004798,0.022014,0.029439,0.098175


In [53]:
df1.sample(3, random_state=10)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147


### Attributes, Operation Methods, and Transformation Methods

In [54]:
df1_copy = df1.copy()
df1_fakecopy = df1

In [55]:
df1_fakecopy.at[1, 'column_1'] = 10

In [56]:
df1_copy.at[1, 'column_1'] = np.nan

In [57]:
df1_copy

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,,0.647682,0.031766,0.390485,0.004092,0.023134
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [58]:
df1

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.647682,0.031766,0.390485,0.004092,0.023134
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


**N.B.:** modification applied to df1_fakecopy modified df1, too!

In [59]:
pd.concat([df1, df1_copy])

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.647682,0.031766,0.390485,0.004092,0.023134
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [60]:
df1.drop([1, 3], axis=0)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [61]:
df1_copy.dropna()

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [62]:
df1_copy.fillna(1000)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,1000.0,0.647682,0.031766,0.390485,0.004092,0.023134
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [63]:
df1_copy.rename(index={1:'nuovo_index'}, columns={'column_1':'COLONNA_1'})

Unnamed: 0,COLONNA_1,column_2,column_3,column_4,column_5,column_6
nuovo_index,,0.647682,0.031766,0.390485,0.004092,0.023134
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [64]:
df1_copy.reset_index()

Unnamed: 0,index,column_1,column_2,column_3,column_4,column_5,column_6
0,1,,0.647682,0.031766,0.390485,0.004092,0.023134
1,2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
2,3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
3,4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
4,5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
5,6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
6,7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
7,8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
8,9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
9,10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


In [65]:
df1_copy.sort_values('column_1')

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
1,,0.647682,0.031766,0.390485,0.004092,0.023134


(Notare il NaN messo in fondo; per maggiori informazioni, guardare la documentazione ufficiale)

### Exportation Methods

In [66]:
df1.to_csv('df1.csv', columns=['column_1', 'column_5', 'column_2'], index_label='ID')

In [67]:
df1.to_pickle('df1.pkl')

## Loading a DataFrame

In [68]:
pd.read_csv('df1.csv')

Unnamed: 0,ID,column_1,column_5,column_2
0,1,10.0,0.004092,0.647682
1,2,0.504135,0.194835,0.050744
2,3,0.787274,0.414917,0.164304
3,4,0.861012,0.453771,0.551957
4,5,0.633205,0.528043,0.224036
5,6,0.798486,0.264736,0.096073
6,7,0.313547,0.590979,0.19216
7,8,0.829381,0.823305,0.912156
8,9,0.255362,0.157312,0.109674
9,10,0.20931,0.620378,0.968352


In [69]:
pd.read_csv('df1.csv', usecols=['ID', 'column_1', 'column_2'], index_col='ID')

Unnamed: 0_level_0,column_1,column_2
ID,Unnamed: 1_level_1,Unnamed: 2_level_1
1,10.0,0.647682
2,0.504135,0.050744
3,0.787274,0.164304
4,0.861012,0.551957
5,0.633205,0.224036
6,0.798486,0.096073
7,0.313547,0.19216
8,0.829381,0.912156
9,0.255362,0.109674
10,0.20931,0.968352


In [70]:
pd.read_pickle('df1.pkl')

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6
1,10.0,0.647682,0.031766,0.390485,0.004092,0.023134
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132


## Concatenation of DataFrames

In [71]:
pd.concat([df1, df1.reset_index()], axis=1)

Unnamed: 0,column_1,column_2,column_3,column_4,column_5,column_6,index,column_1.1,column_2.1,column_3.1,column_4.1,column_5.1,column_6.1
1,10.0,0.647682,0.031766,0.390485,0.004092,0.023134,2.0,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797
2,0.504135,0.050744,0.139108,0.894855,0.194835,0.282797,3.0,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771
3,0.787274,0.164304,0.940174,0.136572,0.414917,0.500771,4.0,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453
4,0.861012,0.551957,0.749828,0.715491,0.453771,0.403453,5.0,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931
5,0.633205,0.224036,0.695049,0.80948,0.528043,0.469931,6.0,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147
6,0.798486,0.096073,0.727083,0.809944,0.264736,0.945147,7.0,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038
7,0.313547,0.19216,0.075076,0.959145,0.590979,0.688038,8.0,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405
8,0.829381,0.912156,0.409433,0.670318,0.823305,0.918405,9.0,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863
9,0.255362,0.109674,0.403582,0.930156,0.157312,0.572863,10.0,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132
10,0.20931,0.968352,0.97937,0.709929,0.620378,0.068132,,,,,,,
