# Outliers, Skewness, Pandas

In [105]:
import pandas as pd
import numpy as np

In [107]:
#generating large data set

In [109]:
# Seed NumPy's global random generator so the synthetic columns drawn
# below come out the same on every run.
np.random.seed(0)

In [111]:
# Synthetic dataset: each column is 10,000 draws from a normal distribution,
# specified as (column name, mean, standard deviation). The columns are drawn
# in the listed order (Age, Income, Spending_Score), and each is stored as a
# plain Python list so more values can be appended later.
data = {
    name: np.random.normal(mean, std_dev, 10000).tolist()
    for name, mean, std_dev in (
        ('Age', 30, 10),
        ('Income', 50000, 15000),
        ('Spending_Score', 60, 20),
    )
}

In [113]:
# Append three hand-picked extreme values to the end of every column.
# Each column receives the same number of extra values, so all lists stay
# the same length and can still be combined into one DataFrame.
data['Age'] += [120, 130, 150]  # implausibly high ages
data['Income'] += [200000, 300000, 500000]  # very high incomes
data['Spending_Score'] += [5, 99, 120]  # scores near or beyond the extremes

In [115]:
# Build the DataFrame from the dict of equal-length lists: one column per key.
df=pd.DataFrame(data)

In [117]:
# Display the DataFrame as the cell's output.
df

Unnamed: 0,Age,Income,Spending_Score
0,47.640523,46968.244593,66.600918
1,34.001572,37501.534990,59.990400
2,39.787380,76004.003712,76.362318
3,52.408932,52859.735061,68.564274
4,48.675580,47332.844095,9.921054
...,...,...,...
9998,29.670793,34116.031075,79.536877
9999,42.981114,45102.073364,85.110013
10000,120.000000,200000.000000,5.000000
10001,130.000000,300000.000000,99.000000


In [119]:
# Preview the first five rows.
df.head()

Unnamed: 0,Age,Income,Spending_Score
0,47.640523,46968.244593,66.600918
1,34.001572,37501.53499,59.9904
2,39.78738,76004.003712,76.362318
3,52.408932,52859.735061,68.564274
4,48.67558,47332.844095,9.921054


### Outliers 

In [122]:
#In the Age column, extreme values like 120, 130, and 150 will be identified as outliers.
#In the Income column, high values like 200,000, 300,000, and 500,000 will be flagged as outliers.
#Similarly, extreme values for Spending_Score will be detected (e.g., 5 and 120); 99 is not flagged because it lies within the IQR bounds for that column.

In [124]:
#In a large dataset, outliers could represent rare events or errors. For example, in financial data, 
#extreme high incomes might represent high-net-worth individuals.

In [126]:
# OUTLIER DETECTION using IQR (Interquartile Range) method

In [135]:
def detect_outliers_large(column, frame=None, whisker=1.5):
    """Return the rows whose value in ``column`` lies outside the IQR fences.

    The fences are ``Q1 - whisker * IQR`` and ``Q3 + whisker * IQR``, where
    Q1 and Q3 are the 25th and 75th percentiles of the column and
    ``IQR = Q3 - Q1``. Rows strictly below the lower fence or strictly above
    the upper fence are returned; values exactly on a fence are kept out.

    Args:
        column: Label of the numeric column to test.
        frame: DataFrame to inspect. When omitted, the notebook-level ``df``
            is used, so existing one-argument calls behave as before.
        whisker: Multiplier applied to the IQR when placing the fences.
            Defaults to 1.5.

    Returns:
        A DataFrame with every column of the inspected frame, restricted to
        the flagged rows (original index labels preserved).
    """
    # Resolve the global lazily so the function also works when a frame is
    # passed explicitly and no notebook-level ``df`` exists.
    source = df if frame is None else frame
    values = source[column]

    q1 = values.quantile(0.25)  # first quartile
    q3 = values.quantile(0.75)  # third quartile
    iqr = q3 - q1

    lower_bound = q1 - whisker * iqr
    upper_bound = q3 + whisker * iqr

    return source[(values < lower_bound) | (values > upper_bound)]

In [137]:
# Print the rows whose Age falls outside the IQR bounds.
print(detect_outliers_large('Age'))

              Age         Income  Spending_Score
271      2.274072   46348.060478       81.516312
427      2.603228   68784.035540       53.916671
494     56.962241   37737.494444       39.735798
589     -0.461431   46404.531331       36.650246
685      1.654455   56906.176084       59.632355
...           ...            ...             ...
9849     2.543346   21150.013345       31.243349
9901    58.652035   44474.143355       51.013196
10000  120.000000  200000.000000        5.000000
10001  130.000000  300000.000000       99.000000
10002  150.000000  500000.000000      120.000000

[67 rows x 3 columns]


In [139]:
# Print the rows whose Income falls outside the IQR bounds.
print(detect_outliers_large('Income'))

              Age         Income  Spending_Score
24      52.697546    7951.488730       46.854104
140     15.087424   90071.954110       47.300213
241     26.027282   98307.526836       64.693060
728     38.126740   93505.317112       63.750558
752     34.481953   95856.522431      111.883662
...           ...            ...             ...
9917    18.121470   91004.048821       81.262760
9993    51.330500   91047.129717       44.747549
10000  120.000000  200000.000000        5.000000
10001  130.000000  300000.000000       99.000000
10002  150.000000  500000.000000      120.000000

[82 rows x 3 columns]


In [141]:
# Print the rows whose Spending_Score falls outside the IQR bounds.
print(detect_outliers_large('Spending_Score'))

              Age         Income  Spending_Score
265     31.887786   41848.335304      -11.037105
304     36.663831   38510.292342      114.230942
619     14.930016   31171.880032      -10.517796
658     26.292960   30308.148238        5.131008
899     25.959677   59471.474386      118.929500
...           ...            ...             ...
9627    25.777066   59549.260292        1.683162
9663    21.386644   44653.558198        4.439867
9870    37.330581   63693.961479      113.512795
10000  120.000000  200000.000000        5.000000
10002  150.000000  500000.000000      120.000000

[82 rows x 3 columns]


### Skewness

In [144]:
#We calculate skewness using Pandas’ df.skew() method, which tells us if the
#data is skewed to the left (negative value) or right (positive value).

In [150]:
#Close to 0: Symmetrical distribution.

In [152]:
#Positive value: Right skewed (longer right tail).

In [154]:
#Negative value: Left skewed (longer left tail).

In [156]:
#df.skew() might show a positive skew for Income if there are high-income outliers (e.g., 200,000, 300,000).

In [163]:
#Skewness values closer to 0 suggest symmetrical data

In [165]:
# Print the skewness of each column (one value per column).
print(df.skew())

Age               0.359620
Income            2.777153
Spending_Score   -0.037771
dtype: float64


In [167]:
#Skewness measures the asymmetry of the data distribution.
#Positive Skew (Right Skewed): The tail on the right side of the distribution is longer. The mean is greater than the median.
#Negative Skew (Left Skewed): The tail on the left side is longer. The median is greater than the mean.
#Skewness helps analysts understand whether the data is normally distributed or not, and how it may impact models that assume normality.

### Kurtosis 

In [170]:
#Kurtosis measures the "tailedness" of the data distribution.

In [None]:
#Leptokurtic (positive kurtosis): Heavy tails with more extreme outliers.
#Platykurtic (negative kurtosis): Light tails with fewer outliers.
#Kurtosis helps understand how outliers influence the data and indicates the shape of the distribution, especially the presence of outliers.

In [176]:
#The df.kurt() function calculates excess kurtosis (Fisher's definition, where a normal distribution scores 0). Positive values indicate heavier tails than a normal distribution, and negative values indicate lighter tails.

In [178]:
# If the kurtosis for Income is high, it means the distribution has heavier tails (more outliers), 
#which may require special attention during data cleaning or model building.

In [185]:
# Print the kurtosis of each column. pandas reports excess kurtosis
# (Fisher's definition), so a normal distribution scores about 0.
print(df.kurt())

Age                3.476649
Income            71.596729
Spending_Score     0.064317
dtype: float64


In [187]:
#If the kurtosis for Income is high, it means the distribution has heavier tails (more outliers), 
#which may require special attention during data cleaning or model building.

In [189]:
#High kurtosis in financial returns could indicate high risk due to frequent extreme gains or losses.

In [191]:
#Kurtosis will indicate whether the dataset has heavy or light tails, hinting at the distribution of outliers.