In [1]:
# Generic inputs for most ML tasks
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge
from sklearn.linear_model import Lasso
from sklearn import tree
import graphviz
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from datetime import datetime
from sklearn.preprocessing import *
from sklearn.metrics import *
#pd.options.display.float_format = '{:,.2f}'.format

# setup interactive notebook mode
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

from IPython.display import display, HTML

In [2]:
dataset = pd.read_csv('../data_source/train_data/final_training_data.csv', parse_dates = ['Date (MM/DD/YYYY)'])
dataset.head()

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),...,Average Temperature(Celcius) /Day,Minimum Temperature(Celcius) /Day,Maximum Temperature(Celcius) /Day,Daily Precipitation(mm) /Day,Maximum Snowfall(mm) /Day,Wind Direction(Degrees) /Day,Wind Speed(km/h) /Day,Peak Wind Gust(km/h) /Day,Atmospheric Pressure(hPa) /Day,Sunshine Duration(m) /Day
0,UA,2022-01-01,1282,N4901U,IAD,23:10,0:01,70,76,51,...,14.4,13.3,16.7,16.5,0.0,165.0,7.6,,1006.7,
1,UA,2023-01-01,604,N814UA,DEN,14:58,14:52,193,177,-6,...,-0.9,-4.9,1.1,1.3,100.0,111.0,10.4,,1010.5,
2,UA,2023-01-01,2488,N38458,EWR,23:14,23:15,75,62,1,...,11.3,7.2,14.4,0.0,0.0,241.0,13.0,,1012.6,
3,UA,2023-01-01,2645,N23721,ORD,23:57,23:47,107,100,-10,...,4.3,2.8,7.2,4.1,0.0,245.0,6.1,,1013.7,
4,UA,2022-01-02,1282,N4901U,IAD,23:10,23:27,70,64,17,...,14.7,5.6,17.8,1.8,0.0,324.0,13.0,,1006.6,


In [3]:
dataset.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)', 'Delay Weather (Minutes)',
       'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)',
       'Delay Late Aircraft Arrival (Minutes)', 'Date UTC /Day',
       'Average Temperature(Celcius) /Day',
       'Minimum Temperature(Celcius) /Day',
       'Maximum Temperature(Celcius) /Day', 'Daily Precipitation(mm) /Day',
       'Maximum Snowfall(mm) /Day', 'Wind Direction(Degrees) /Day',
       'Wind Speed(km/h) /Day', 'Peak Wind Gust(km/h) /Day',
       'Atmospheric Pressure(hPa) /Day', 'Sunshine Duration(m) /Day'],
      dtype='object')

In [4]:
dataset['Carrier Code'].unique()

array(['UA'], dtype=object)

In [5]:
dataset.sort_values(['Date (MM/DD/YYYY)'],ascending=True)

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),...,Average Temperature(Celcius) /Day,Minimum Temperature(Celcius) /Day,Maximum Temperature(Celcius) /Day,Daily Precipitation(mm) /Day,Maximum Snowfall(mm) /Day,Wind Direction(Degrees) /Day,Wind Speed(km/h) /Day,Peak Wind Gust(km/h) /Day,Atmospheric Pressure(hPa) /Day,Sunshine Duration(m) /Day
733,UA,2017-06-08,1500,N464UA,ORD,16:37,17:01,110,144,24,...,20.6,11.7,28.3,1.5,0.0,,6.8,,1014.4,
738,UA,2017-06-09,1500,N819UA,ORD,16:37,16:31,110,107,-6,...,23.5,18.3,30.6,0.0,0.0,250.0,14.4,,1008.4,
744,UA,2017-06-10,849,N492UA,ORD,20:41,21:01,111,117,20,...,24.2,17.8,32.2,0.0,0.0,203.0,22.7,,1009.3,
745,UA,2017-06-10,1500,N806UA,ORD,16:37,16:22,110,100,-15,...,24.2,17.8,32.2,0.0,0.0,203.0,22.7,,1009.3,
751,UA,2017-06-11,1500,N434UA,ORD,16:37,16:26,110,101,-11,...,27.7,22.8,33.9,0.0,0.0,199.0,24.1,,1011.8,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,UA,2023-01-30,1998,N836UA,ORD,21:17,21:21,113,103,4,...,-9.5,-14.9,-7.7,0.5,80.0,313.0,15.5,,1029.7,
164,UA,2023-01-30,2617,N66831,EWR,23:12,23:15,74,74,3,...,8.4,3.9,13.9,0.0,0.0,220.0,9.4,,1020.2,
168,UA,2023-01-31,604,N851UA,DEN,14:59,14:47,193,175,-12,...,-14.9,-21.6,-4.9,0.3,100.0,178.0,13.3,,1026.1,
169,UA,2023-01-31,1998,N808UA,ORD,21:17,20:52,113,97,-25,...,-14.3,-18.2,-11.0,0.0,80.0,269.0,9.7,,1030.7,


In [6]:
def delay_status(x):
    if x < -10:
        return 'Early'
    elif x >= -10 and x <= 10:
        return 'On-Time'
    elif x > 10 and x <= 30:
        return 'Late'
    else:
        return 'Severely Late'

In [7]:
dataset['Status'] = dataset['Arrival Delay (Minutes)'].apply(delay_status)

In [8]:
dataset['Status'].value_counts()

On-Time          939
Early            681
Severely Late    299
Late             230
Name: Status, dtype: int64

In [9]:
dataset['Date (MM/DD/YYYY)'].dt.month

0        1
1        1
2        1
3        1
4        1
        ..
2144    12
2145    12
2146    12
2147    12
2148    12
Name: Date (MM/DD/YYYY), Length: 2149, dtype: int64

In [10]:
test = dataset[dataset['Date (MM/DD/YYYY)'].dt.month.isin([1,2,3,4])]

In [11]:
test['Status'].value_counts()

On-Time          284
Early            162
Severely Late     94
Late              64
Name: Status, dtype: int64

In [12]:
test.sort_values(['Date (MM/DD/YYYY)'],inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return func(*args, **kwargs)


In [13]:
test

Unnamed: 0,Carrier Code,Date (MM/DD/YYYY),Flight Number,Tail Number,Origin Airport,Scheduled Arrival Time,Actual Arrival Time,Scheduled Elapsed Time (Minutes),Actual Elapsed Time (Minutes),Arrival Delay (Minutes),...,Minimum Temperature(Celcius) /Day,Maximum Temperature(Celcius) /Day,Daily Precipitation(mm) /Day,Maximum Snowfall(mm) /Day,Wind Direction(Degrees) /Day,Wind Speed(km/h) /Day,Peak Wind Gust(km/h) /Day,Atmospheric Pressure(hPa) /Day,Sunshine Duration(m) /Day,Status
22,UA,2019-01-07,607,N810UA,ORD,20:49,20:18,109,86,-31,...,4.4,11.7,11.2,0.0,189.0,25.2,,1007.0,,Early
27,UA,2019-01-08,607,N4888U,ORD,20:49,21:31,109,101,42,...,-4.3,7.2,0.0,0.0,263.0,23.4,,1010.2,,Severely Late
32,UA,2019-01-09,607,N816UA,ORD,20:49,20:38,109,106,-11,...,-9.3,-4.3,0.0,0.0,308.0,22.0,,1024.6,,Early
38,UA,2019-01-10,607,N4888U,ORD,20:49,21:14,109,121,25,...,-9.9,-4.3,0.0,0.0,329.0,7.2,,1030.2,,Late
45,UA,2019-01-11,607,N897UA,ORD,20:49,20:44,109,100,-5,...,-8.2,-1.0,0.0,0.0,181.0,7.9,,1028.2,,On-Time
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
163,UA,2023-01-30,1998,N836UA,ORD,21:17,21:21,113,103,4,...,-14.9,-7.7,0.5,80.0,313.0,15.5,,1029.7,,On-Time
164,UA,2023-01-30,2617,N66831,EWR,23:12,23:15,74,74,3,...,3.9,13.9,0.0,0.0,220.0,9.4,,1020.2,,On-Time
169,UA,2023-01-31,1998,N808UA,ORD,21:17,20:52,113,97,-25,...,-18.2,-11.0,0.0,80.0,269.0,9.7,,1030.7,,Early
170,UA,2023-01-31,2617,N68807,EWR,23:12,22:59,74,66,-13,...,0.0,9.4,1.5,0.0,348.0,16.9,,1021.9,,Early


In [14]:
dataset.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)', 'Delay Weather (Minutes)',
       'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)',
       'Delay Late Aircraft Arrival (Minutes)', 'Date UTC /Day',
       'Average Temperature(Celcius) /Day',
       'Minimum Temperature(Celcius) /Day',
       'Maximum Temperature(Celcius) /Day', 'Daily Precipitation(mm) /Day',
       'Maximum Snowfall(mm) /Day', 'Wind Direction(Degrees) /Day',
       'Wind Speed(km/h) /Day', 'Peak Wind Gust(km/h) /Day',
       'Atmospheric Pressure(hPa) /Day', 'Sunshine Duration(m) /Day',
       'Status'],
      dtype='object')

In [15]:
#pre-processing the data
#one hot encode categorical data

from sklearn.preprocessing import OneHotEncoder

def get_ohe(df, col):
    ohe = OneHotEncoder(handle_unknown='error', sparse=False, dtype='int')
    col_name= ohe.fit(df[[col]])
    temp_df = pd.DataFrame(data=ohe.transform(df[[col]]), columns=ohe.get_feature_names_out())
    df.drop(columns=[col], axis=1, inplace=True)
    df = pd.concat([df.reset_index(drop=True), temp_df], axis=1)
    return df,col_name

In [16]:
dataset.columns

Index(['Carrier Code', 'Date (MM/DD/YYYY)', 'Flight Number', 'Tail Number',
       'Origin Airport', 'Scheduled Arrival Time', 'Actual Arrival Time',
       'Scheduled Elapsed Time (Minutes)', 'Actual Elapsed Time (Minutes)',
       'Arrival Delay (Minutes)', 'Wheels-on Time', 'Taxi-In time (Minutes)',
       'Delay Carrier (Minutes)', 'Delay Weather (Minutes)',
       'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)',
       'Delay Late Aircraft Arrival (Minutes)', 'Date UTC /Day',
       'Average Temperature(Celcius) /Day',
       'Minimum Temperature(Celcius) /Day',
       'Maximum Temperature(Celcius) /Day', 'Daily Precipitation(mm) /Day',
       'Maximum Snowfall(mm) /Day', 'Wind Direction(Degrees) /Day',
       'Wind Speed(km/h) /Day', 'Peak Wind Gust(km/h) /Day',
       'Atmospheric Pressure(hPa) /Day', 'Sunshine Duration(m) /Day',
       'Status'],
      dtype='object')

In [17]:
#get ohe for flight

data_flight_no,flight_number_ohe = get_ohe(dataset,'Flight Number')
#tail_no_df, tail_number_ohe = get_ohe(data_flight_no,'Tail Number')
origin_air,origin_airport_ohe = get_ohe(data_flight_no,'Origin Airport')



In [18]:
#drop Carrier code cause its just a single airline

data = origin_air.drop(['Carrier Code','Taxi-In time (Minutes)','Delay Carrier (Minutes)','Delay Weather (Minutes)',
                       'Delay National Aviation System (Minutes)', 'Delay Security (Minutes)',
                        'Delay Late Aircraft Arrival (Minutes)','Date UTC /Day','Wheels-on Time',
                       'Actual Arrival Time','Actual Elapsed Time (Minutes)','Arrival Delay (Minutes)',
                       'Scheduled Elapsed Time (Minutes)','Tail Number','Peak Wind Gust(km/h) /Day',
                       'Sunshine Duration(m) /Day'],axis=1)
data

Unnamed: 0,Date (MM/DD/YYYY),Scheduled Arrival Time,Average Temperature(Celcius) /Day,Minimum Temperature(Celcius) /Day,Maximum Temperature(Celcius) /Day,Daily Precipitation(mm) /Day,Maximum Snowfall(mm) /Day,Wind Direction(Degrees) /Day,Wind Speed(km/h) /Day,Atmospheric Pressure(hPa) /Day,...,Flight Number_2419,Flight Number_2488,Flight Number_2615,Flight Number_2617,Flight Number_2634,Flight Number_2645,Origin Airport_DEN,Origin Airport_EWR,Origin Airport_IAD,Origin Airport_ORD
0,2022-01-01,23:10,14.4,13.3,16.7,16.5,0.0,165.0,7.6,1006.7,...,0,0,0,0,0,0,0,0,1,0
1,2023-01-01,14:58,-0.9,-4.9,1.1,1.3,100.0,111.0,10.4,1010.5,...,0,0,0,0,0,0,1,0,0,0
2,2023-01-01,23:14,11.3,7.2,14.4,0.0,0.0,241.0,13.0,1012.6,...,0,1,0,0,0,0,0,1,0,0
3,2023-01-01,23:57,4.3,2.8,7.2,4.1,0.0,245.0,6.1,1013.7,...,0,0,0,0,0,1,0,0,0,1
4,2022-01-02,23:10,14.7,5.6,17.8,1.8,0.0,324.0,13.0,1006.6,...,0,0,0,0,0,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2144,2019-12-31,18:15,4.6,3.3,7.8,0.0,0.0,294.0,16.9,1004.8,...,0,0,0,0,0,0,0,1,0,0
2145,2021-12-31,18:38,11.9,10.6,16.1,0.5,0.0,201.0,7.9,1012.8,...,0,0,0,0,0,0,0,0,1,0
2146,2022-12-31,14:58,-0.1,-3.2,9.4,0.0,130.0,189.0,14.0,1005.0,...,0,0,0,0,0,0,1,0,0,0
2147,2022-12-31,21:08,0.1,-3.2,3.9,0.0,0.0,23.0,4.3,1013.4,...,0,0,0,0,0,0,0,0,0,1


In [19]:
data.columns[0:20]

Index(['Date (MM/DD/YYYY)', 'Scheduled Arrival Time',
       'Average Temperature(Celcius) /Day',
       'Minimum Temperature(Celcius) /Day',
       'Maximum Temperature(Celcius) /Day', 'Daily Precipitation(mm) /Day',
       'Maximum Snowfall(mm) /Day', 'Wind Direction(Degrees) /Day',
       'Wind Speed(km/h) /Day', 'Atmospheric Pressure(hPa) /Day', 'Status',
       'Flight Number_212', 'Flight Number_279', 'Flight Number_288',
       'Flight Number_308', 'Flight Number_316', 'Flight Number_333',
       'Flight Number_358', 'Flight Number_360', 'Flight Number_376'],
      dtype='object')

In [20]:
data= data.fillna(data.median())

  data= data.fillna(data.median())
  data= data.fillna(data.median())


In [21]:
# Define custom method to convert date column
def convert_date(date_str):
    date_obj = pd.to_datetime(date_str)  # Convert date string to datetime object
    return date_obj.strftime('%m/%d/%Y')  # Format datetime object into desired format

# Apply custom method to date column using apply()
data['Date'] = data['Date (MM/DD/YYYY)'].apply(convert_date)
data.drop(['Date (MM/DD/YYYY)'],axis=1,inplace=True)

In [22]:
# Define custom method to convert date column
def convert_time(date_str):
    date_obj = pd.to_datetime(date_str)  # Convert date string to datetime object
    return date_obj.strftime("%I:%M %p")  

In [23]:
data['Arrival Time']= data['Scheduled Arrival Time'].apply(convert_time)

In [24]:
data.drop(['Scheduled Arrival Time'],axis=1,inplace=True)

In [25]:
data['Arrival Time'] = pd.to_datetime(data['Arrival Time'])
data['Hour'] = data['Arrival Time'].dt.hour
data['Minute'] = data['Arrival Time'].dt.minute

In [26]:
data['Date'] = pd.to_datetime(data['Date'])
data['Day'] = data['Date'].dt.strftime('%A')

In [27]:
data.isna().sum()

Average Temperature(Celcius) /Day    0
Minimum Temperature(Celcius) /Day    0
Maximum Temperature(Celcius) /Day    0
Daily Precipitation(mm) /Day         0
Maximum Snowfall(mm) /Day            0
                                    ..
Date                                 0
Arrival Time                         0
Hour                                 0
Minute                               0
Day                                  0
Length: 110, dtype: int64

In [28]:
data['year'] = data.Date.dt.year
data['month'] = data.Date.dt.month
data['day'] = data.Date.dt.day
data.drop(['Date'],axis=1,inplace=True)

In [29]:
data, week_ohe = get_ohe(data,'Day')

In [30]:
status,status_ohe= get_ohe(data,'Status')

In [31]:
status.columns[115:]

Index(['Day_Tuesday', 'Day_Wednesday', 'Status_Early', 'Status_Late',
       'Status_On-Time', 'Status_Severely Late'],
      dtype='object')

In [32]:
status['day']

0        1
1        1
2        1
3        1
4        2
        ..
2144    31
2145    31
2146    31
2147    31
2148    31
Name: day, Length: 2149, dtype: int64

In [33]:
status.to_csv("Pre-processed_data_20thapril.csv",index=False)

In [34]:
Y = status[['Status_Early', 'Status_Late', 'Status_On-Time',
       'Status_Severely Late']]
Y

Unnamed: 0,Status_Early,Status_Late,Status_On-Time,Status_Severely Late
0,0,0,0,1
1,0,0,1,0
2,0,0,1,0
3,0,0,1,0
4,0,1,0,0
...,...,...,...,...
2144,0,0,1,0
2145,0,0,1,0
2146,1,0,0,0
2147,1,0,0,0


In [35]:
X = status.loc[:,~status.columns.isin(['Status_Early', 'Status_Late', 'Status_On-Time',
       'Status_Severely Late','Arrival Time'])]

In [36]:
X.columns[100:]

Index(['Origin Airport_DEN', 'Origin Airport_EWR', 'Origin Airport_IAD',
       'Origin Airport_ORD', 'Hour', 'Minute', 'year', 'month', 'day',
       'Day_Friday', 'Day_Monday', 'Day_Saturday', 'Day_Sunday',
       'Day_Thursday', 'Day_Tuesday', 'Day_Wednesday'],
      dtype='object')

In [37]:
#split our dataset
X_train, X_test, y_train, y_test = train_test_split(X,Y, test_size=0.20, random_state=42)

In [38]:
#scale our data
sc = StandardScaler().fit(X_train)
X_train = pd.DataFrame(sc.transform(X_train),index=X_train.index,columns = X_train.columns)
X_train

Unnamed: 0,Average Temperature(Celcius) /Day,Minimum Temperature(Celcius) /Day,Maximum Temperature(Celcius) /Day,Daily Precipitation(mm) /Day,Maximum Snowfall(mm) /Day,Wind Direction(Degrees) /Day,Wind Speed(km/h) /Day,Atmospheric Pressure(hPa) /Day,Flight Number_212,Flight Number_279,...,year,month,day,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
1433,0.591358,0.637872,0.565808,-0.368856,-0.233644,1.511669,-1.014789,-0.130630,-0.041812,-0.041812,...,0.173627,0.662570,-0.787255,-0.425008,-0.405751,-0.376077,-0.437342,2.452820,-0.402830,-0.401854
630,-0.101544,-0.161510,-0.072641,-0.368856,-0.233644,-1.538013,0.321852,-0.217965,-0.041812,-0.041812,...,0.761155,-0.560472,-1.015601,-0.425008,-0.405751,2.659027,-0.437342,-0.407694,-0.402830,-0.401854
78,-1.098185,-1.163146,-0.915039,-0.368856,-0.233644,-0.098163,-0.504755,-2.037447,-0.041812,-0.041812,...,1.348683,-1.783515,0.011955,-0.425008,2.464564,-0.376077,-0.437342,-0.407694,-0.402830,-0.401854
366,-0.481217,-0.903106,0.219981,-0.368856,-0.233644,0.471778,0.638425,0.087708,-0.041812,-0.041812,...,0.761155,-1.171993,-0.102218,-0.425008,-0.405751,-0.376077,-0.437342,-0.407694,2.482438,-0.401854
1996,-0.500200,-0.643066,-0.374131,-0.245480,-0.233644,-0.118161,1.764018,-1.658995,-0.041812,-0.041812,...,0.761155,1.274091,1.496203,-0.425008,-0.405751,-0.376077,-0.437342,-0.407694,2.482438,-0.401854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,0.714752,0.637872,0.858431,-0.368856,-0.233644,-0.218151,0.954998,0.640830,-0.041812,-0.041812,...,-2.176486,0.968331,-1.586465,-0.425008,2.464564,-0.376077,-0.437342,-0.407694,-0.402830,-0.401854
1095,1.350703,1.340944,1.301798,0.571892,-0.233644,0.651759,-1.137901,-0.581862,-0.041812,-0.041812,...,0.761155,0.051049,1.382030,-0.425008,-0.405751,-0.376077,-0.437342,2.452820,-0.402830,-0.401854
1130,0.857129,0.801601,0.858431,-0.368856,-0.233644,-1.428025,-1.085139,0.640830,-0.041812,-0.041812,...,-1.001430,0.356810,-1.586465,2.352896,-0.405751,-0.376077,-0.437342,-0.407694,-0.402830,-0.401854
1294,1.018490,1.119428,1.062380,-0.368856,-0.233644,0.111815,-0.944440,-0.217965,-0.041812,-0.041812,...,0.173627,0.356810,0.811166,-0.425008,2.464564,-0.376077,-0.437342,-0.407694,-0.402830,-0.401854


In [39]:
#get sample weights
from sklearn.utils.class_weight import compute_sample_weight

sample_weights = compute_sample_weight("balanced", y_train)

In [40]:
sample_weights

array([0.43756477, 0.2809572 , 0.2809572 , ..., 0.43756477, 0.43756477,
       0.2809572 ])

In [47]:
X_train

Unnamed: 0,Average Temperature(Celcius) /Day,Minimum Temperature(Celcius) /Day,Maximum Temperature(Celcius) /Day,Daily Precipitation(mm) /Day,Maximum Snowfall(mm) /Day,Wind Direction(Degrees) /Day,Wind Speed(km/h) /Day,Atmospheric Pressure(hPa) /Day,Flight Number_212,Flight Number_279,...,year,month,day,Day_Friday,Day_Monday,Day_Saturday,Day_Sunday,Day_Thursday,Day_Tuesday,Day_Wednesday
1433,0.591358,0.637872,0.565808,-0.368856,-0.233644,1.511669,-1.014789,-0.130630,-0.041812,-0.041812,...,0.173627,0.662570,-0.787255,-0.425008,-0.405751,-0.376077,-0.437342,2.452820,-0.402830,-0.401854
630,-0.101544,-0.161510,-0.072641,-0.368856,-0.233644,-1.538013,0.321852,-0.217965,-0.041812,-0.041812,...,0.761155,-0.560472,-1.015601,-0.425008,-0.405751,2.659027,-0.437342,-0.407694,-0.402830,-0.401854
78,-1.098185,-1.163146,-0.915039,-0.368856,-0.233644,-0.098163,-0.504755,-2.037447,-0.041812,-0.041812,...,1.348683,-1.783515,0.011955,-0.425008,2.464564,-0.376077,-0.437342,-0.407694,-0.402830,-0.401854
366,-0.481217,-0.903106,0.219981,-0.368856,-0.233644,0.471778,0.638425,0.087708,-0.041812,-0.041812,...,0.761155,-1.171993,-0.102218,-0.425008,-0.405751,-0.376077,-0.437342,-0.407694,2.482438,-0.401854
1996,-0.500200,-0.643066,-0.374131,-0.245480,-0.233644,-0.118161,1.764018,-1.658995,-0.041812,-0.041812,...,0.761155,1.274091,1.496203,-0.425008,-0.405751,-0.376077,-0.437342,-0.407694,2.482438,-0.401854
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1638,0.714752,0.637872,0.858431,-0.368856,-0.233644,-0.218151,0.954998,0.640830,-0.041812,-0.041812,...,-2.176486,0.968331,-1.586465,-0.425008,2.464564,-0.376077,-0.437342,-0.407694,-0.402830,-0.401854
1095,1.350703,1.340944,1.301798,0.571892,-0.233644,0.651759,-1.137901,-0.581862,-0.041812,-0.041812,...,0.761155,0.051049,1.382030,-0.425008,-0.405751,-0.376077,-0.437342,2.452820,-0.402830,-0.401854
1130,0.857129,0.801601,0.858431,-0.368856,-0.233644,-1.428025,-1.085139,0.640830,-0.041812,-0.041812,...,-1.001430,0.356810,-1.586465,2.352896,-0.405751,-0.376077,-0.437342,-0.407694,-0.402830,-0.401854
1294,1.018490,1.119428,1.062380,-0.368856,-0.233644,0.111815,-0.944440,-0.217965,-0.041812,-0.041812,...,0.173627,0.356810,0.811166,-0.425008,2.464564,-0.376077,-0.437342,-0.407694,-0.402830,-0.401854


In [46]:
!pip install xgboost
from xgboost import XGBClassifier
from sklearn.ensemble import *

model = XGBClassifier(n_estimators=100, learning_rate=1.0,max_depth=4, random_state=0)
model = model.fit(X_train,y_train,sample_weight=sample_weights)

model.score(X_train,y_train)



ValueError: y should be a 1d array, got an array of shape (1719, 4) instead.

In [None]:
y_train

In [None]:
print(classification_report(model.predict(X_train),y_train,target_names = list(y_train.columns)))

In [None]:
model.score(sc.transform(X_test),y_test)

In [None]:
print(classification_report(model.predict(sc.transform(X_test)),y_test,target_names = list(y_train.columns)))