In [1]:
import numpy as np
import pandas as pd

## Median-Based Anomaly Detection

In [2]:
# Median-based anomaly detection:
# take the absolute difference between each value and the median; if it is greater than a reasonable threshold, treat that value as an anomaly

In [3]:
# Sample readings to scan for anomalies.
x = pd.Series([2.1, 2.3, 2.2, 4.5, 2.4])
median = np.median(x)
threshold = 2  # assumed cut-off for the absolute deviation from the median
# Keep every value whose distance from the median exceeds the threshold.
outliers = [value for value in x if abs(value - median) > threshold]
print(outliers)

[4.5]


In [4]:
# the cell above found the outlier/anomaly

## Mean-Based Anomaly Detection

In [6]:
# a value is NOT an anomaly when:
# (mean - std) <= value <= (mean + std)

# if the value falls outside that range, it is considered an anomaly
# std is the standard deviation

In [7]:
mean = np.mean(x)
std = np.std(x)
# One standard deviation either side of the mean is the accepted band.
lower_bound = mean - std
upper_bound = mean + std
# Anything below the band or above it is reported as an outlier.
outliers = [value for value in x if value < lower_bound or value > upper_bound]
outliers

[4.5]

## Z-score-based Anomaly Detection

In [8]:
# formula:
# z = (value - mean) / std
# if the z-score is beyond a reasonable threshold, the value is said to be an outlier

In [18]:
# z-score of a value: z = (value - mean) / std, using the mean and std computed above.
z_threshold = 1.5  # assumed cut-off, expressed in standard deviations
# Compare the magnitude of z so that unusually LOW values (negative z) are flagged
# as well as unusually high ones; a bare `z > threshold` misses the low side.
outliers = [value for value in x if abs((value - mean) / std) > z_threshold]
outliers

[4.5]

## Interquartile Range for Anomaly Detection

In [20]:
# Quartiles are the 3 cut points that divide the sorted data into 4 intervals.
# the first quartile (Q1) is at the 25% point of our data
# the second (Q2, the median) is at 50%
# the third (Q3) is at 75%
# The interquartile range is the distance between the 3rd and the 1st quartile
# IQR = Q3 - Q1

# any value < (Q1 - 1.5*IQR) or value > (Q3 + 1.5*IQR) is considered an anomaly

In [21]:
# Values at the 25th and 75th percentiles of x.
Q1, Q3 = np.percentile(x, [25, 75])
IQR = Q3 - Q1
# Fences sit 1.5 * IQR outside the quartiles.
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = [value for value in x if value < lower_fence or value > upper_fence]
outliers

[4.5]

In [22]:
# All of the anomaly detection methods above flag the same outlier here,
# but that is not the case for every dataset; with more data, each method can flag different outliers

# Dealing with missing Values

In [23]:
# a missing value is represented as NaN in pandas

In [43]:
# Small example table; the last Age is deliberately missing.
data1 = {
    'Name': ['Edison', 'Tesla', 'Elon', 'Newton'],
    'Age': [92, 88, 21, None],
}
data = pd.DataFrame(data1)
data

Unnamed: 0,Name,Age
0,Edison,92.0
1,Tesla,88.0
2,Elon,21.0
3,Newton,


In [32]:
# Boolean frame of the same shape: True marks each cell that holds a null value.
data.isna()

Unnamed: 0,Name,Age
0,False,False
1,False,False
2,False,False
3,False,True


In [34]:
# Summing the boolean null mask column-wise counts the null values per column.
data.isna().sum()

Name    0
Age     1
dtype: int64

In [35]:
# if there are only a few null values, we can simply delete the rows that contain them
# but if there are many, deleting every row that contains a null value
# discards too much data and we won't get accurate results

# in such cases we replace the missing values with the mean/median/mode

In [41]:
# Delete the rows that contain a missing value.
# data1 keeps an independent copy of the unmodified frame; a plain `data1 = data`
# only adds a second name for the same object, so it would not act as a backup.
data1 = data.copy()
# dropna() returns a new frame without the NaN rows; rebinding `data` avoids
# mutating in place the frame that earlier cells displayed.
data = data.dropna()
data

Unnamed: 0,Name,Age
0,Edison,92.0
1,Tesla,88.0
2,Elon,21.0


In [60]:
# Rebuild the example table, again with one missing Age.
data = pd.DataFrame({
    'Name': ['Edison', 'Tesla', 'Elon', 'Newton'],
    'Age': [92, 88, 21, None],
})
# Keep an independent copy of the untouched frame. `data1 = data` would merely alias
# the same object, so any later change to `data` would show up in `data1` as well.
data1 = data.copy()
data

Unnamed: 0,Name,Age
0,Edison,92.0
1,Tesla,88.0
2,Elon,21.0
3,Newton,


In [62]:
# Fill the missing Age with the mean of the known ages (mean() skips NaN by default).
# The result is assigned back to the column: calling fillna(inplace=True) on
# data['Age'] is a chained in-place update, which pandas warns about and which
# does not modify `data` when Copy-on-Write is enabled.
data['Age'] = data['Age'].fillna(data['Age'].mean())
data

Unnamed: 0,Name,Age
0,Edison,92.0
1,Tesla,88.0
2,Elon,21.0
3,Newton,67.0


# Regular Expression

In [63]:
# these are used to match patterns in a string
# re module is used for regular expression

In [64]:
import re

In [68]:
txt = 'Python is my fav programming language. I love Python'
# findall returns a list with every occurrence of the pattern.
# NOTE(review): this rebinds `x`, which held the Series used by the anomaly-detection cells.
x = re.compile('Python').findall(txt)
# Show the matches, then how many there are, one per line.
print(x, len(x), sep='\n')

['Python', 'Python']
2


In [70]:
# A leading ^ anchors the pattern, so it only matches when the string starts with it.

re.compile('^Python').findall(txt)

['Python']

In [73]:
y = 'I am Jack'
# The string does not start with 'python', so the result is an empty list.
re.findall(pattern='^python', string=y)

[]

In [76]:
# \d matches a single digit.
# The pattern is written as a raw string (r'...') so the backslash reaches the regex
# engine untouched; '\d' in a normal string literal is an invalid escape sequence.
txt = 'Python was released in 1991, now it is 2022'
re.findall(r'\d', txt)   # every digit as its own list element

['1', '9', '9', '1', '2', '0', '2', '2']

In [78]:
# \d+ matches a run of consecutive digits, so each whole number comes back as one item.
# Raw string: '\d+' in a normal string literal is an invalid escape sequence.
re.findall(r'\d+', txt)

['1991', '2022']

In [79]:
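# re functions only work on strings
# to apply a pattern to a pandas Series, either convert the whole Series into one string with to_string(),
# or, to match element by element, use the Series.str methods (e.g. Series.str.findall)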

In [82]:
txt = 'Hello World'
# search looks for the pattern anywhere in the string and returns a match object.
match_obj = re.compile('Hello').search(txt)
match_obj

<re.Match object; span=(0, 5), match='Hello'>

In [84]:
# (start, end) position of the matched text within the whole string
(match_obj.start(), match_obj.end())

(0, 5)

In [91]:
# Replace each occurrence of 'World' in txt with 'Gibbs'.
a = re.compile('World').sub('Gibbs', txt)
a

'Hello Gibbs'

# Feature Scaling

In [93]:
# the columns in a dataset might not all be in the same range (e.g. 1-100)
# sometimes we want all columns in the same range; this can be achieved by feature scaling

In [106]:
# Two numeric columns on very different scales.
data = pd.DataFrame({
    'Age': [26, 25, 35, 42, 96, 78],
    'Salary': [10000, 13000, 22000, 31000, 41000, 14000],
})
data

Unnamed: 0,Age,Salary
0,26,10000
1,25,13000
2,35,22000
3,42,31000
4,96,41000
5,78,14000


In [111]:
# Min-max scaling (normalization): subtract each column's minimum and divide by its range.

col_min = data.min()
col_range = data.max() - col_min
data = (data - col_min) / col_range
data

Unnamed: 0,Age,Salary
0,0.014085,0.0
1,0.0,0.096774
2,0.140845,0.387097
3,0.239437,0.677419
4,1.0,1.0
5,0.746479,0.129032


In [112]:
# Standardization: subtract each column's mean, then divide by its standard deviation.

data = data.sub(data.mean()).div(data.std())
data

Unnamed: 0,Age,Salary
0,-0.821273,-0.978775
1,-0.855024,-0.730635
2,-0.517514,0.013786
3,-0.281258,0.758206
4,1.541292,1.58534
5,0.933776,-0.647922


In [113]:
# Check that the columns are now on the same scale: a standard deviation of 1 in every column means they are.
data.std(ddof=1)   # ddof=1 is the pandas default, written out explicitly

Age       1.0
Salary    1.0
dtype: float64