In [2]:
import pandas as pd
import numpy as np
# Seed NumPy's global RNG so that Restart & Run All draws the same sample
# (and therefore flags the same outlier rows) on every run.
SEED = 12345
np.random.seed(SEED)

# 1000 rows x 4 columns of standard-normal draws; columns are labelled 0..3.
data = pd.DataFrame(np.random.randn(1000, 4))
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.022759,0.028171,-0.02058,-0.016249
std,0.982624,1.018218,1.005689,1.025799
min,-2.790342,-3.328531,-3.12227,-3.191551
25%,-0.679149,-0.653589,-0.738181,-0.720807
50%,-0.002729,0.011415,-0.037573,-0.006453
75%,0.658089,0.715809,0.658348,0.702808
max,2.733913,3.22453,3.294324,3.013884


In [3]:
# Pull out the column labelled 2 and show its entries lying more than 3 away from zero.
col = data.loc[:, 2]
col[col.abs() > 3]

264   -3.122270
914    3.294324
Name: 2, dtype: float64

In [4]:
# Select every row that contains at least one value whose magnitude exceeds 3.
# `axis` is passed by keyword because pandas >= 2.0 rejects the positional form `.any(1)`.
data[(np.abs(data) > 3).any(axis=1)]

Unnamed: 0,0,1,2,3
7,-0.431843,-3.328531,0.518739,-0.221043
62,-0.629054,-3.003814,0.465665,-1.449405
135,-1.510877,0.961152,1.940851,-3.097943
264,1.345714,-0.251713,-3.12227,0.32163
276,-0.228522,-0.370088,-0.353556,3.013884
826,-0.137057,3.22453,-0.71307,-1.029819
828,-0.81227,-0.439204,-0.139248,-3.191551
914,0.865227,0.391879,3.294324,1.259472


In [None]:
# Step 1: (np.abs(data) > 3) creates a boolean DataFrame with the same shape as data, where each
# element is True if the corresponding element in data has an absolute value greater than 3,
# and False otherwise.
# Step 2: (np.abs(data) > 3).any(1) checks, for each row, whether there is at least one True value
# along the columns axis (axis=1), meaning that the row has at least one absolute value greater
# than 3. The result is a boolean Series with length equal to the number of rows in data.
# Step 3: data[(np.abs(data) > 3).any(1)] selects only the rows in data that have at least one
# absolute value greater than 3, as identified by the boolean Series obtained in step 2.

In [5]:
# Clamp in place: any entry with magnitude above 3 becomes +3 or -3 according to its sign.
data[data.abs() > 3] = 3 * np.sign(data)
data.describe()

Unnamed: 0,0,1,2,3
count,1000.0,1000.0,1000.0,1000.0
mean,-0.022759,0.028279,-0.020752,-0.015973
std,0.982624,1.016493,1.00439,1.024893
min,-2.790342,-3.0,-3.0,-3.0
25%,-0.679149,-0.653589,-0.738181,-0.720807
50%,-0.002729,0.011415,-0.037573,-0.006453
75%,0.658089,0.715809,0.658348,0.702808
max,2.733913,3.0,3.0,3.0


In [None]:
# Step 1: np.abs(data) > 3 creates a boolean DataFrame with the same shape as data, where each
# element is True if the absolute value of the corresponding element in data is greater than 3,
# and False otherwise.
# Step 2: np.sign(data) creates a DataFrame with the same shape as data, where each element is the
# sign of the corresponding element in data: -1 for negative values, 0 for zeros, and 1 for positive values.
# Step 3: np.sign(data) * 3 multiplies each element of the DataFrame obtained in step 2 by 3, so that
# the result is a DataFrame holding -3 where data is negative, 3 where data is positive, and 0 where data is 0.
# Step 4: data[np.abs(data) > 3] = np.sign(data) * 3 replaces the elements in data that have an absolute
# value greater than 3 with the corresponding elements of the DataFrame obtained in step 3, thus capping
# the magnitude of extreme values at 3 while preserving their sign.
# Step 5: data.describe() returns a summary of the statistics of the DataFrame data, including the count,
# mean, standard deviation, minimum, quartiles (25%, 50%, 75%), and maximum values for each column.

In [6]:
# Preview the element-wise sign of data for the first five rows.
np.sign(data).iloc[:5]

Unnamed: 0,0,1,2,3
0,1.0,-1.0,1.0,1.0
1,-1.0,1.0,1.0,-1.0
2,1.0,-1.0,1.0,-1.0
3,-1.0,1.0,-1.0,1.0
4,1.0,1.0,-1.0,-1.0


In [None]:
# np.sign(data) returns a DataFrame of the same shape as data (with the same index and columns),
# where each element is 1 (positive), -1 (negative), or 0 (zero) depending on
# the sign of the corresponding element in data.