In [2]:
import numpy as np
np.set_printoptions(threshold=10000, suppress=True)
import pandas as pd
import matplotlib.pyplot as plt

# Partie 1

## Chargement et visualisation des données

In [None]:
# Load the numeric table from disk and keep its first two columns as the
# plotting coordinates.
mickey = np.loadtxt('mouse.txt')
x1, x2 = mickey[:, 0], mickey[:, 1]

# Scatter of the raw samples, before any outlier detection is applied.
plt.scatter(x1, x2)
plt.gca().set(xlabel='x1', ylabel='x2', title='Mouse data')
plt.show()

## Isolation Forest

In [None]:
from sklearn.ensemble import IsolationForest

# Isolation Forest builds randomised trees; pinning random_state makes the
# scores and labels below identical on every "Restart & Run All".
iforest = IsolationForest(n_estimators=100, max_samples='auto', random_state=42)
iforest.fit(mickey)

# One row per sample: the original columns plus the detector's outputs.
df = pd.DataFrame(mickey)
# decision_function: the lower the value, the more abnormal the sample
# (negative scores correspond to outliers).
df['scores'] = iforest.decision_function(mickey)
# -1 for outliers, 1 for inliers
df['anomaly'] = iforest.predict(mickey)
df.head(20)

In [None]:
# Keep only the rows labelled -1 (outliers), remember their row labels, and
# report how many were flagged.
anomaly = df[df['anomaly'].eq(-1)]
anomaly_index = anomaly.index.tolist()
print(len(anomaly))

## Local Outlier Factor

In [None]:
from sklearn.neighbors import LocalOutlierFactor

# Local Outlier Factor with 20 neighbours per point and a contamination
# (expected outlier proportion) of 0.1.
lof = LocalOutlierFactor(contamination=0.1, n_neighbors=20)
# fit_predict fits the model and labels the very samples it was fitted on.
pred = lof.fit_predict(mickey)

# -1 for outliers, 1 for inliers
print(pred)


## Mieux choisir le seuil de contamination

In [None]:
# Candidate contamination levels (expected proportion of outliers) to compare.
contamination = [0.01,0.02,0.03,0.04,0.05,0.1,0.2,0.3,0.4,0.5]

# The sweep uses its own local names and never writes to `df`, so the
# results of the other cells are not silently replaced by the last
# iteration of these loops.

print('Isolation Forest')
for c in contamination:
    # Fixed seed: without it the flagged set changes from run to run.
    sweep_forest = IsolationForest(
        n_estimators=100, max_samples='auto', contamination=c, random_state=42
    )
    # fit_predict == fit(...) followed by predict(...) on the same data.
    sweep_labels = sweep_forest.fit_predict(mickey)
    # Labels are -1 (outlier) / 1 (inlier); count the outliers vectorially.
    print(f'contamination={c}: {int((sweep_labels == -1).sum())} outliers')

print('Local Outlier Factor')
for c in contamination:
    sweep_lof = LocalOutlierFactor(n_neighbors=20, contamination=c)
    sweep_labels = sweep_lof.fit_predict(mickey)
    print(f'contamination={c}: {int((sweep_labels == -1).sum())} outliers')

## Visualisation des données aberrantes

In [None]:
# Refit Isolation Forest at contamination=0.02 for the figure. The seed is
# fixed so the highlighted points are the same on every run.
iforest = IsolationForest(
    n_estimators=100, max_samples='auto', contamination=0.02, random_state=42
)
iforest.fit(mickey)

# Refresh the per-sample outputs stored in df with this model's results.
df['scores'] = iforest.decision_function(mickey)
df['anomaly'] = iforest.predict(mickey)

# Colour by label: with 'coolwarm', -1 (outlier) is drawn blue and
# 1 (inlier) red.
plt.scatter(x1, x2, c=df['anomaly'], cmap='coolwarm', s=20, edgecolors='k')
plt.xlabel('x1')
plt.ylabel('x2')
plt.title('Anomalies detected by Isolation Forest')
plt.show()

In [None]:
# Local Outlier Factor at contamination=0.02; labels are -1 (outlier) or
# 1 (inlier) for each fitted sample.
lof = LocalOutlierFactor(contamination=0.02, n_neighbors=20)
pred = lof.fit_predict(mickey)

# Colour each point by its label and annotate the axes in one call.
plt.scatter(x1, x2, c=pred, cmap='coolwarm', s=20, edgecolors='k')
plt.gca().set(
    xlabel='x1',
    ylabel='x2',
    title='Anomalies detected by Local Outlier Factor',
)
plt.show()

## Comparaison des méthodes

# Partie 2

## Préparation du jeu de données

In [None]:
from sklearn.preprocessing import StandardScaler

# Load the transactions and discard the 'Time' column in a single chain.
credit_card = pd.read_csv('creditcard.csv').drop(columns=['Time'])

# Standardise 'Amount'. The scaler works on 2-D input, so the column is
# selected as a one-column frame and converted to an (n, 1) array.
# NOTE(review): the scaler is fitted on the whole table; if a train/test
# split is introduced later, confirm whether it should be fitted on the
# training part only.
scaler = StandardScaler()
credit_card['Amount'] = scaler.fit_transform(credit_card[['Amount']].to_numpy())

credit_card.head()

## Undersampling

## Oversampling

## Balancing

## Isolation Forest

## Local Outlier Factor