## Using XGBRegressor() on data scaled by MinMaxScaler(), instead of StandardScaler()

## 10000 rows of data

In [14]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error

In [10]:
X = pd.read_csv('X_test.csv')
y = pd.read_csv('y_test.csv')
y = y['yFT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred= xgboost.predict(X_test)

print('Training score: ', xgboost.score(X_train, y_train))
print('RMSE: ', mean_squared_error(y_test, y_pred))
print('R-squared score: ', r2_score(y_test, y_pred))

## 100000 rows of data

In [18]:
X = pd.read_csv('X_test100000.csv')
ydf = pd.read_csv('y_test100000.csv')

y = ydf['yFT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

In [19]:
print('Training R-Square', xgboost.score(X_train, y_train))
print('RMSE: ', mean_squared_error(y_test, y_pred))
print('R-squared score: ', r2_score(y_test, y_pred))

Training R-Square 0.16326952350050172
RMSE:  0.9684616081650654
R-squared score:  0.0340012678853745


# No dummy (categorical) variables
- MinMaxScaler for all variables including feature variable
- 100000 rows
- Month and unique carrier ID columns dropped
- Origin and dest columns replaced with avg. arrival delay (unscaled)

In [11]:
def replace_origin_dest(df):
    # df: X features dataframe without one-hot encoding

    # Find the average delay times by origin location, and store the values in a dictionary
    origin = pd.read_csv('origin_arr_delay.txt', delimiter = '\t', names = ['origin', 'avg_delay'])
    origin = pd.Series(origin.avg_delay.values, index = origin.origin).to_dict()
    
    # Find the average delay times by destination location, and store the values in a dictionary
    dest = pd.read_csv('dest_arr_delay.txt', delimiter = '\t', names = ['dest', 'avg_delay'])
    dest = pd.Series(dest.avg_delay.values, index = dest.dest).to_dict()
    
    # Replace the values in the "origin" and "dest" columns with the average arrival delay time
    df['origin'] = df['origin'].replace(origin)
    df['dest'] = df['dest'].replace(dest)
    
    return df

In [15]:
X = pd.read_csv('X_test100000noDum.csv', compression = 'gzip')
y = pd.read_csv('y_test100000noDum.csv', compression = 'gzip')
X = X.drop(['op_unique_carrier', 'month'], axis = 1)
X = replace_origin_dest(X)
y = y['yFT']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 1)

xgboost = XGBRegressor()
xgboost.fit(X_train, y_train)
y_pred = xgboost.predict(X_test)

print('Training R-Square', xgboost.score(X_train, y_train))
print('RMSE: ', mean_squared_error(y_test, y_pred))
print('R-squared score: ', r2_score(y_test, y_pred))

Training R-Square 0.17528500134304825
RMSE:  0.02781708895978555
R-squared score:  0.016622917851691343
