In [2]:
import pandas as pd
import os
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score, mean_squared_error

In [3]:
os.chdir('dataset')

In [4]:
df = pd.read_csv('FLIGHT100k.csv')
df.head()

Unnamed: 0,DEPARTURE_TIME,WHEELS_OFF,AIR_TIME,DISTANCE,ARRIVAL_TIME
0,545,556,65,413,705
1,1618,1634,73,554,1752
2,1519,1529,147,1024,1702
3,1258,1315,51,331,1410
4,638,645,127,889,758


In [5]:
X = df.drop('ARRIVAL_TIME',axis=1)
X.shape

(100000, 4)

In [6]:
y=df['ARRIVAL_TIME']

In [7]:
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [8]:
model=LinearRegression()
model.fit(X_train,y_train)
y_pred=model.predict(X_test)

In [20]:
print("Mean Squared Error:",mean_squared_error(y_pred,y_test))
print("R-Squared: ",r2_score(y_pred,y_test))

Mean Squared Error: 153236.01322080626
R-Squared:  -0.2716026660716957


In [21]:
from sklearn.linear_model import SGDRegressor

sgd_reg = SGDRegressor(max_iter=1000, tol=1e-3, penalty=None, eta0=0.1)
sgd_reg.fit(X, y.ravel())

In [22]:
sgd_reg.intercept_, sgd_reg.coef_

(array([-5.97111803e+11]),
 array([-1.26602415e+12, -1.96841586e+12, -3.86050675e+11, -1.90826788e+12]))

### As the linear Model is performing very poorly and the data has multicollinearity so lets try Rigid and Lesso Regression which handles multicollinearity and avoid overfitting

In [24]:
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

In [27]:
ridge_model = make_pipeline(StandardScaler(), Ridge(alpha=1.0))
ridge_model.fit(X_train, y_train)
ridge_r2 = ridge_model.score(X_test, y_test)

print("Ridge Regression R-Squared:", ridge_r2)

Ridge Regression R-Squared: 0.44823605677126266


In [28]:
lasso_model = make_pipeline(StandardScaler(), Lasso(alpha=0.1))
lasso_model.fit(X_train, y_train)
lasso_r2 = lasso_model.score(X_test, y_test)

print("Lasso Regression R-Squared:", lasso_r2)

Lasso Regression R-Squared: 0.4482098229234073


In [29]:
elastic_net_model = make_pipeline(StandardScaler(), ElasticNet(alpha=0.1, l1_ratio=0.5))
elastic_net_model.fit(X_train, y_train)
elastic_net_r2 = elastic_net_model.score(X_test, y_test)

print("Elastic Net Regression R-Squared:", elastic_net_r2)

Elastic Net Regression R-Squared: 0.44361454055513416


hyperpremeter changing for the rigid regression for better r2 score

In [30]:
from sklearn.model_selection import GridSearchCV

param_grid = {'ridge__alpha': [0.1, 1.0, 10.0, 100.0]}
ridge_cv = GridSearchCV(ridge_model, param_grid, cv=5)
ridge_cv.fit(X_train, y_train)
best_ridge_r2 = ridge_cv.score(X_test, y_test)

print("Best Ridge Regression R-Squared:", best_ridge_r2)
print("Best Ridge Alpha:", ridge_cv.best_params_)

Best Ridge Regression R-Squared: 0.44823119572574577
Best Ridge Alpha: {'ridge__alpha': 10.0}


## As the linear regression models are performing poorly, lets try non-linear Model for prediction

In [31]:
from sklearn.ensemble import RandomForestRegressor

rf_model = RandomForestRegressor(n_estimators=100)
rf_model.fit(X_train, y_train)
rf_r2 = rf_model.score(X_test, y_test)

print("Random Forest R-Squared:", rf_r2)

Random Forest R-Squared: 0.8178779006486758


much better then linear models

In [32]:
rf_model.predict([[1245,1245,100,602]])



array([1403.46])

In [37]:
def PredictArrival_Time():
    DEPARTURE_TIME=int(input("Enter the DEPARTURE TIME "))
    WHEELS_OFF=int(input("Enter the WHEELS OFF TIME "))
    AIR_TIME=int(input("Enter the AIR TIME "))
    DISTANCE=int(input("Enter the DISTANCE in km "))
    
    features=[[DEPARTURE_TIME,WHEELS_OFF,AIR_TIME,DISTANCE]]
    predict=rf_model.predict(features)
    print("ARRIVAL TIME:",predict[0])
    print("ELASPED TIME:",predict[0]-DEPARTURE_TIME)

In [38]:
PredictArrival_Time()

Enter the DEPARTURE TIME 553
Enter the WHEELS OFF TIME 556
Enter the AIR TIME 65
Enter the DISTANCE in km 413
ARRIVAL TIME: 698.86
ELASPED TIME: 145.86


