In [9]:
import pandas as pd
import seaborn as sns
from pyod.models.mad import MAD

# Fetch the "diamonds" example dataset through seaborn
diamonds = sns.load_dataset("diamonds")

# Keep only the price column, as a single-column DataFrame
X = diamonds[["price"]]

# Build the MAD detector, then fit it on the price column
mad = MAD()
mad.fit(X)

# labels_ holds one 0/1 flag per row after fitting
labels = mad.labels_

# Tally how many rows received each label
pd.Series(labels).value_counts()

0    49708
1     4232
Name: count, dtype: int64

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) for the price column
diamonds.loc[:, ["price"]].describe()

Unnamed: 0,price
count,53940.0
mean,3932.799722
std,3989.439738
min,326.0
25%,950.0
50%,2401.0
75%,5324.25
max,18823.0


In [11]:
# Print the frame's schema: column names, non-null counts, dtypes and memory usage
diamonds.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [12]:
# Peek at the first two rows to see the raw (still categorical) values
diamonds.head(2)

Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,Ideal,E,SI2,61.5,55.0,326,3.95,3.98,2.43
1,0.21,Premium,E,SI1,59.8,61.0,326,3.89,3.84,2.31


In [13]:
from sklearn.preprocessing import OrdinalEncoder

# Initialize the encoder.
# NOTE: with default settings the learned category order is alphabetical
# (inspect oe.categories_), not the diamonds' quality ranking.
oe = OrdinalEncoder()

# Extract the categorical feature names
cats = diamonds.select_dtypes(include="category").columns.tolist()

# Encode the categorical features (returns a 2-D float array, one column per feature)
cats_encoded = oe.fit_transform(diamonds[cats])

# Replace the categorical columns with their encoded values.
# Plain column assignment swaps in brand-new float columns. The previous
# `diamonds.loc[:, cats] = ...` form emitted one pandas warning per column,
# because .loc whole-column assignment is moving to in-place semantics,
# which cannot store float codes inside category-typed columns.
diamonds[cats] = cats_encoded

diamonds.head()

  diamonds.loc[:, cats] = cats_encoded
  diamonds.loc[:, cats] = cats_encoded
  diamonds.loc[:, cats] = cats_encoded


Unnamed: 0,carat,cut,color,clarity,depth,table,price,x,y,z
0,0.23,2.0,1.0,3.0,61.5,55.0,326,3.95,3.98,2.43
1,0.21,3.0,1.0,2.0,59.8,61.0,326,3.89,3.84,2.31
2,0.23,1.0,1.0,4.0,56.9,65.0,327,4.05,4.07,2.31
3,0.29,3.0,5.0,5.0,62.4,58.0,334,4.2,4.23,2.63
4,0.31,1.0,6.0,3.0,63.3,58.0,335,4.34,4.35,2.75


In [15]:
# Show the category values the fitted encoder stored for each encoded column;
# a value's position in its array is the integer code it is mapped to
oe.categories_

[array(['Fair', 'Good', 'Ideal', 'Premium', 'Very Good'], dtype=object),
 array(['D', 'E', 'F', 'G', 'H', 'I', 'J'], dtype=object),
 array(['I1', 'IF', 'SI1', 'SI2', 'VS1', 'VS2', 'VVS1', 'VVS2'],
       dtype=object)]

In [16]:
# Encode a single example row. The encoder was fitted on a DataFrame, so the
# input is wrapped in a DataFrame carrying the same column names; a bare nested
# list makes scikit-learn warn that X lacks valid feature names.
oe.transform(pd.DataFrame([["Fair", "D", "IF"]], columns=cats))



array([[0., 0., 1.]])

In [17]:
# Split the frame into features (everything except price) and the price target.
# Note: X and labels are rebound here, replacing the values from the MAD cell.
X = diamonds.drop("price", axis=1)
y = diamonds[["price"]]
# Now, let’s build and fit the model:
from pyod.models.iforest import IForest

# Create a model with 10000 trees.
# random_state pins the randomized tree construction so the labels (and every
# downstream count) are reproducible after a kernel restart; n_jobs=-1 spreads
# the fit across all available cores.
iforest = IForest(n_estimators=10000, random_state=42, n_jobs=-1)
iforest.fit(X)  # This will take a minute

# Extract the labels
labels = iforest.labels_

In [18]:
# Keep only the rows whose label is 0.
# The target is filtered from y (it was previously taken from X by mistake),
# so features and prices stay aligned row for row.
inlier_mask = labels == 0
X_outlier_free = X[inlier_mask]
y_outlier_free = y[inlier_mask]



In [19]:
# Row count after filtering vs. row count of the full dataset
X_outlier_free.shape[0], diamonds.shape[0]


(48546, 53940)

In [25]:
from pyod.models.ecod import ECOD
# Fit an ECOD detector on the current feature frame X
clf = ECOD()
clf.fit(X)

# NOTE(review): despite its name, y_train_scores stores the 0/1 labels_,
# not the raw scores. The raw scores are exposed as clf.decision_scores_
# (training data) and clf.decision_function(...) (new data).
y_train_scores = clf.labels_

# Count how many rows fall under each label
label_counts = pd.Series(y_train_scores).value_counts()
label_counts

0    48546
1     5394
Name: count, dtype: int64