In [34]:
# libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
from google.colab import drive
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, mean_absolute_error
import xgboost as xgb

In [2]:
# ignore warnings
warnings.filterwarnings('ignore')

In [6]:
# Attach Google Drive to the Colab filesystem so the dataset can be read from it
mount_point = '/content/drive'
drive.mount(mount_point)

Mounted at /content/drive


In [7]:
# Read the training CSV from the mounted Drive and preview the first rows
train_csv_path = '/content/drive/MyDrive/Datasets/Flight_track_train.csv'
df = pd.read_csv(train_csv_path)
df.head()

Unnamed: 0,geo_longitude,geo_latitude,altitude_pressure,climb_rate,gps_altitude,air_speed,flight_track
0,-12.4952,54.9829,-580.9066,-4.2506,3107.9575,221.6929,10.9369
1,53.6539,30.3566,6565.813,16.1493,-920.0656,38.4727,317.1208
2,-80.7304,44.2387,6158.0963,-0.7801,-1287.7323,83.8126,220.1877
3,20.786,17.4654,11321.8167,9.5402,10969.4723,194.1098,216.1874
4,-85.3823,32.2703,4594.6961,0.4988,1710.2622,323.4483,312.527


## Data preprocessing



In [8]:
# shape: (number of rows, number of columns) of the loaded frame
df.shape

(14667, 7)

In [9]:
# Count of missing entries in each column
df.isna().sum()

Unnamed: 0,0
geo_longitude,0
geo_latitude,0
altitude_pressure,0
climb_rate,0
gps_altitude,0
air_speed,0
flight_track,0


In [11]:
# Discard fully duplicated rows (the first occurrence of each is kept),
# then show the resulting shape so any removed rows are visible.
df = df.drop_duplicates(keep = 'first')
df.shape

(14667, 7)

In [12]:
# preview the first rows again after de-duplication
df.head()

Unnamed: 0,geo_longitude,geo_latitude,altitude_pressure,climb_rate,gps_altitude,air_speed,flight_track
0,-12.4952,54.9829,-580.9066,-4.2506,3107.9575,221.6929,10.9369
1,53.6539,30.3566,6565.813,16.1493,-920.0656,38.4727,317.1208
2,-80.7304,44.2387,6158.0963,-0.7801,-1287.7323,83.8126,220.1877
3,20.786,17.4654,11321.8167,9.5402,10969.4723,194.1098,216.1874
4,-85.3823,32.2703,4594.6961,0.4988,1710.2622,323.4483,312.527


In [13]:
# Separate the predictor columns (X) from the regression target (y)
target_column = 'flight_track'
X = df.drop(columns = [target_column])
y = df[target_column]

In [15]:
# preview the feature frame (target column removed)
X.head()

Unnamed: 0,geo_longitude,geo_latitude,altitude_pressure,climb_rate,gps_altitude,air_speed
0,-12.4952,54.9829,-580.9066,-4.2506,3107.9575,221.6929
1,53.6539,30.3566,6565.813,16.1493,-920.0656,38.4727
2,-80.7304,44.2387,6158.0963,-0.7801,-1287.7323,83.8126
3,20.786,17.4654,11321.8167,9.5402,10969.4723,194.1098
4,-85.3823,32.2703,4594.6961,0.4988,1710.2622,323.4483


In [16]:
# preview the target series
y.head()

Unnamed: 0,flight_track
0,10.9369
1,317.1208
2,220.1877
3,216.1874
4,312.527


In [22]:
# Standardise the feature columns and display the scaled matrix.
# NOTE(review): this fit uses every row of X, including rows that a later
# train/test split holds out for evaluation.
scaler = StandardScaler()
scaler.fit(X)
X_scaled = scaler.transform(X)
X_scaled

array([[-0.30399519,  0.97354865, -1.52910154, -0.65269701, -0.67045407,
        -0.21221071],
       [ 0.69986456, -0.30279212,  0.21165027,  2.51546202, -1.60553978,
        -1.46847742],
       [-1.339513  ,  0.41669438,  0.11234127, -0.11371908, -1.6908918 ,
        -1.15760009],
       ...,
       [ 0.69750777, -0.26204466,  1.93026551, -0.72780138,  0.0421972 ,
        -0.27641038],
       [ 1.30150267,  0.06626209, -0.05923812,  0.43505845,  0.64020913,
         0.99214951],
       [-1.23047854,  0.35090864, -0.38751884,  3.41452474, -0.55110207,
        -1.28507036]])

In [23]:
# split first, then scale
# Hold out 20% of the rows BEFORE scaling, so the scaler's mean/variance come
# from the training rows only. Fitting the scaler on all rows would leak
# test-set statistics into preprocessing.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

# Fit on the training rows, then apply that same fitted transform to both parts.
# X_train / X_test are NumPy arrays, as the downstream modelling cells expect.
scaler = StandardScaler().fit(X_train_raw)
X_train = scaler.transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

In [32]:
# base models
# Baseline regressors with default hyper-parameters. Both are seeded so the
# reported metrics are reproducible across kernel restarts (the random forest
# otherwise draws different bootstrap samples / feature subsets on every run).
models = {
    'RandomForest': RandomForestRegressor(random_state = 42),
    'XgBoost': xgb.XGBRegressor(random_state = 42)
}

In [39]:
# Fit every candidate on the training split, score it on the held-out split,
# and collect one summary record (name + rounded errors) per model.
results = []
for name, model in models.items():
    pred = model.fit(X_train, y_train).predict(X_test)

    mse = mean_squared_error(y_test, pred)
    mae = mean_absolute_error(y_test, pred)

    results.append(dict(Model = name, MSE = round(mse, 4), MAE = round(mae, 4)))

In [40]:
# Turn the per-model records into a table with a fixed column order
summary_columns = ['Model', 'MSE', 'MAE']
results_df = pd.DataFrame(data = results, columns = summary_columns)

In [41]:
# show the per-model error table (MSE and MAE on the held-out split)
results_df

Unnamed: 0,Model,MSE,MAE
0,RandomForest,11400.0133,84.9648
1,XgBoost,12344.6629,88.1307
