# Using Isolation Forests to perform data anomaly tests
We use an Isolation Forest to demonstrate running data anomaly tests on Sentiment140, a public NLP dataset from Kaggle.

In [1]:
from collections import Counter
import pandas as pd
from sklearn.ensemble import IsolationForest
import numpy as np


In [2]:
# Read the raw Sentiment140 CSV. The file is read with header=None, so the
# column labels are assigned by hand right after loading.
dataset_path = '../data/training.1600000.processed.noemoticon.csv'
column_names = ['sentiment', 'id', 'date', 'query', 'user', 'text']
df = pd.read_csv(dataset_path, encoding='latin-1', header=None)
df.columns = column_names
# Perform term frequency analysis: count how often each whitespace-separated
# term appears under each sentiment label, keyed by (term, sentiment).
#
# Iterate over the two needed columns directly instead of DataFrame.iterrows():
# iterrows builds a full Series for every row, which is very slow on a
# dataset of this size, while zip over the columns yields the same values.
term_sentiment_counter = Counter()
for text, sentiment in zip(df['text'], df['sentiment']):
    # str.split() with no argument splits on any run of whitespace.
    term_sentiment_counter.update((term, sentiment) for term in text.split())


In [3]:
# Build one feature row per term: [count under label 4, count under label 0].
# Label 4 is treated as positive and label 0 as negative (assumption carried
# over from the dataset's encoding); any other label is ignored.
term_features = {}
for (term, sentiment), count in term_sentiment_counter.items():
    counts = term_features.setdefault(term, [0, 0])
    if sentiment == 4:
        counts[0] = count
    elif sentiment == 0:
        counts[1] = count

# Stack the per-term rows into a 2-column array; row order follows the
# insertion order of term_features.
X = np.array(list(term_features.values()))

In [5]:
# Perform anomaly detection using Isolation Forest.
# The forest is built from random sub-samples and random splits, so the seed
# is fixed to make the scores (and the ranking derived from them) reproducible
# across kernel restarts.
clf = IsolationForest(contamination=0.01, random_state=42)
clf.fit(X)
# decision_function: the lower the score, the more abnormal the sample.
anomaly_scores = clf.decision_function(X)

In [6]:
# Rank terms from most to least anomalous. Ascending order of the scores puts
# the lowest (most abnormal) scores first.
term_names = np.array(list(term_features))
sorted_indices = anomaly_scores.argsort()
anomalous_terms = term_names[sorted_indices]
print("Most Anomalous Terms:", anomalous_terms[:10])

Most Anomalous Terms: ['of' 'it' 'that' 'have' 'is' 'so' 'my' 'to' 'the' 'me']
