In [2]:
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from tqdm import tqdm

## Importing Data into code

In [3]:
# One position dump per day; rows that cannot be parsed are skipped instead of aborting the load.
daily_files = (
    "position-data-2024-02-01.csv",
    "position-data-2024-02-02.csv",
    "position-data-2024-02-03.csv",
)
df1, df2, df3 = [pd.read_csv(csv_path, on_bad_lines='skip') for csv_path in daily_files]

In [4]:
# Stack the three daily frames row-wise and renumber the rows 0..n-1.
df = pd.concat((df1, df2, df3), ignore_index=True)

#### defining missing values percentage function

In [5]:
def get_missed_percentage(df):
    """Print the percentage of missing values in each column, highest first.

    For every column of ``df`` the number of null entries is expressed as a
    percentage of the row count and rounded to two decimals. The resulting
    table (columns ``column`` and ``percent``) is printed; nothing is returned.
    """
    row_total = len(df)
    null_share = [
        round(100 * df[name].isnull().sum() / row_total, 2)
        for name in df.columns
    ]
    report = pd.DataFrame({'column': df.columns, 'percent': null_share})
    print(report.sort_values('percent', ascending=False))

In [6]:
# Print the per-column share of missing values in the combined frame `df`.
get_missed_percentage(df)

  column  percent
6     gs    15.44
4    alt    11.29
5     hd     0.06
0     id     0.00
1      t     0.00
2     la     0.00
3     lo     0.00


## Data Preprocessing 

Sort the values by plane id (`id`) and time (`t`). The data covers 1234 unique planes over the 3 days, so arranging the dataframe this way makes it easy to see, for each flight, its altitude, speed, latitude and longitude at each point in time.

In [7]:
# Order rows by aircraft id first, then by timestamp within each aircraft.
df = df.sort_values(['id', 't'], ascending=True)

By sorting with respect to time we can follow each flight's movement in chronological order. Because this is continuous, sequential data, sorting by time is the most suitable way to get the expected result.

Here we interpolate the missing values from the nearest available values. This gives a good approximation because the data doesn't vary much between consecutive seconds.

In [8]:
# Fill missing gs / alt / hd readings flight by flight, taking the nearest
# observed value within the same flight.

interp_cols = ['gs', 'alt', 'hd']

flights = list(df['id'].unique())

# groupby(sort=False) yields the flights in order of first appearance (the same
# order as `flights`) without re-scanning the whole frame once per flight.
by_flight = df.groupby('id', sort=False)

filled_parts = []
for _, df_a in tqdm(by_flight, total=by_flight.ngroups):
    # assign() builds a new frame, so the grouped slice is never written to and
    # no SettingWithCopyWarning has to be silenced with a global warnings filter.
    filled_parts.append(
        df_a.assign(**{col: df_a[col].interpolate(method='nearest') for col in interp_cols})
    )

# Concatenate once at the end: growing a DataFrame inside the loop re-copies
# every accumulated row on each iteration.
if filled_parts:
    final_df = pd.concat(filled_parts, axis=0, ignore_index=True)
else:
    # No flights at all: keep the schema but no rows.
    final_df = df.iloc[0:0].copy()



100%|██████████| 1234/1234 [09:31<00:00,  2.16it/s]


Checking the missing values again

In [9]:
# Print the per-column share of missing values that remain in `final_df`.
get_missed_percentage(final_df)

  column  percent
4    alt     1.17
6     gs     0.72
5     hd     0.01
0     id     0.00
1      t     0.00
2     la     0.00
3     lo     0.00


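Remove any rows that still contain null values after interpolation (typically gaps at the very start or end of a flight's track, which have no neighbouring value on one side to interpolate from).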

In [10]:
# Final dataset: keep only the rows in which every column is populated.
final_df1 = final_df.dropna(axis=0, how='any')

## Model Selection and Preparation

In [11]:
from sklearn.ensemble import IsolationForest
from sklearn.preprocessing import StandardScaler


# Keep only the numeric feature columns; the identifier and timestamp are dropped.
feature_cols = ['la', 'lo', 'alt', 'hd', 'gs']

# Explicit copy so `data` is an independent frame: downstream cells attach score
# and label columns to it, and doing that on a bare selection of `final_df1`
# triggers pandas' SettingWithCopyWarning.
data = final_df1[feature_cols].copy()

# Pairwise correlation of the features (displayed as the cell output).
data.corr()

Unnamed: 0,la,lo,alt,hd,gs
la,1.0,-0.318204,0.05011,-0.059221,0.049382
lo,-0.318204,1.0,-0.054795,-0.03764,-0.055987
alt,0.05011,-0.054795,1.0,-0.042261,0.831798
hd,-0.059221,-0.03764,-0.042261,1.0,-0.172675
gs,0.049382,-0.055987,0.831798,-0.172675,1.0


Defining accuracy function

In [12]:
def accuracy_score(data,anamolies):
    """Print the rows labelled -1 in ``data`` as a percentage of ``anamolies``.

    Note that this is a self-consistency ratio, not accuracy against ground
    truth: when ``anamolies`` is exactly the subset of ``data`` whose
    ``'anomaly'`` label is -1, the result is 100 by construction.

    Parameters
    ----------
    data : mapping or DataFrame with an ``'anomaly'`` entry holding the labels.
    anamolies : object with a ``shape`` whose first element is the number of
        rows treated as anomalies.

    Prints ``nan`` when ``anamolies`` is empty, because the ratio is undefined
    there. Returns None.
    """
    anomaly_count = anamolies.shape[0]
    # Vectorised count instead of materialising every label in a Python list.
    flagged = int(np.count_nonzero(np.asarray(data['anomaly']) == -1))
    if anomaly_count == 0:
        accuracy = float('nan')
    else:
        accuracy = 100 * flagged / anomaly_count
    print("Accuracy of the model:", accuracy)

Defining the percentage of anomalies found

In [13]:
def anamoly_percentage(df,ana):
    """Return the row count of ``ana`` as a percentage of the row count of ``df``.

    Both arguments only need a ``shape`` attribute whose first element is the
    number of rows. Returns ``nan`` when ``df`` has no rows, because the
    percentage is undefined there.
    """
    s1, s2 = df.shape[0], ana.shape[0]
    if s1 == 0:
        return float('nan')
    return (s2 / s1) * 100

Modeling and fitting the data

In [14]:
# Columns the detector is trained on. Selecting them by name keeps this cell
# re-runnable: the 'scores' / 'anomaly' columns attached below can never leak
# back into the training matrix on a second execution.
model_features = ['la', 'lo', 'alt', 'hd', 'gs']

# Standardize the data
scaler = StandardScaler()
data_scaled = scaler.fit_transform(data[model_features])

# Train the Isolation Forest model
random_state = np.random.RandomState(42)
model = IsolationForest(
    n_estimators=100,
    max_samples='auto',
    contamination=0.1,
    random_state=random_state,
)

model.fit(data_scaled)
predictions = model.predict(data_scaled)

# Score the same standardized matrix the model was fitted on. Scoring the raw
# frame would feed unscaled values to a model trained on scaled ones.
scores = model.decision_function(data_scaled)

# assign() returns a new frame, so nothing is written onto a selection of an
# earlier frame and no warnings filter is needed.
data = data.assign(scores=scores, anomaly=predictions)

anomalies = data[data['anomaly'] == -1]


In [15]:
anomalies.shape

(310267, 7)

In [16]:
accuracy_score(data,anomalies)

Accuracy of the model: 100.0


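From the above we can see that Isolation Forest works well on this data, and it would work better in scenarios where labeled data is provided beforehand or in semi-supervised cases. Note that the 100% figure above only confirms that every row labelled -1 was collected into `anomalies`; without ground-truth labels it is not a measure of detection accuracy.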