In [17]:
#Importă librăriile și datele
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.ensemble import RandomForestRegressor
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV

# Load the cleaned flight dataset (path is relative to the notebook's working
# directory) and preview its first rows
csv_path = "../data/flight_dataset_cleaned.csv"
df = pd.read_csv(csv_path)
df.head()


Unnamed: 0,Airline,AirlineID,Source,Destination,Total_Stops,DurationMinutes,DayOfWeek,IsWeekend,Price,DepartureHour,DepartureDay,DepartureMonth,DepartureWeekday
0,3,3,0,3,0,0.034111,6,1,3897,0.956522,24,3,6
1,1,1,2,0,2,0.132855,2,0,7662,0.217391,1,5,2
2,4,4,3,2,2,0.382406,6,1,13882,0.391304,9,6,6
3,3,3,2,0,1,0.089767,6,1,6218,0.782609,12,5,6
4,3,3,0,3,1,0.075404,4,0,13302,0.695652,1,3,4


In [19]:
# Select features and target
# Target variable: the price column the models learn to predict
y = df['Price']

# Remove columns that must not be used as predictors:
#   - 'Price' is the target itself
#   - 'Unnamed: 0' is the row index that was written into the CSV (it shows up
#     as 0, 1, 2, ... in df.head()); it says nothing about a flight and would
#     otherwise become a required input of every model trained on X.
# errors='ignore' keeps this cell working if the CSV is later re-exported
# without that index column.
# NOTE(review): in the preview rows 'Airline' == 'AirlineID' and
# 'DayOfWeek' == 'DepartureWeekday' — these look like duplicated features;
# confirm on the full dataset before dropping one of each pair.
X = df.drop(columns=['Price']).drop(columns=['Unnamed: 0'], errors='ignore')

X.head()


Unnamed: 0,Airline,AirlineID,Source,Destination,Total_Stops,DurationMinutes,DayOfWeek,IsWeekend,DepartureHour,DepartureDay,DepartureMonth,DepartureWeekday
0,3,3,0,3,0,0.034111,6,1,0.956522,24,3,6
1,1,1,2,0,2,0.132855,2,0,0.217391,1,5,2
2,4,4,3,2,2,0.382406,6,1,0.391304,9,6,6
3,3,3,2,0,1,0.089767,6,1,0.782609,12,5,6
4,3,3,0,3,1,0.075404,4,0,0.695652,1,3,4


In [21]:
# Split the data into train and test sets: 20% of the rows are held out for
# testing, and the fixed random_state makes the split reproducible
split_parts = train_test_split(X, y, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# Report the size of each partition
print("Train shape:", X_train.shape)
print("Test shape:", X_test.shape)


Train shape: (8546, 12)
Test shape: (2137, 12)


In [23]:
# Train a Linear Regression model and score it on the test set
lr_model = LinearRegression().fit(X_train, y_train)  # fit() returns the fitted estimator
y_pred_lr = lr_model.predict(X_test)

# Test-set metrics, reused later when the models are compared
r2_lr = r2_score(y_test, y_pred_lr)
mse_lr = mean_squared_error(y_test, y_pred_lr)

In [25]:
# 2️⃣ Random Forest Regressor: 200 trees, fixed seed for reproducible results
rf_model = RandomForestRegressor(n_estimators=200, random_state=42).fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

# Test-set metrics, reused later when the models are compared
r2_rf = r2_score(y_test, y_pred_rf)
mse_rf = mean_squared_error(y_test, y_pred_rf)

In [None]:
# Compare the test-set metrics of both models, one line per model
lr_summary = ("Linear Regression --> MSE:", mse_lr, ", R^2:", r2_lr)
rf_summary = ("Random Forest --> MSE:", mse_rf, ", R^2:", r2_rf)
for summary in (lr_summary, rf_summary):
    # Unpacking passes the same four arguments print() received before
    print(*summary)


In [None]:
# Tune a Random Forest with a grid search, evaluate the tuned model on the
# held-out test set, and save it to disk
rf = RandomForestRegressor(random_state=42)

# Hyperparameter grid: 3 * 4 * 3 * 3 * 3 = 324 combinations
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, 30, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# 3-fold cross-validation scored by R^2; n_jobs=-1 uses all available cores
grid_search = GridSearchCV(
    estimator=rf,
    param_grid=param_grid,
    cv=3,
    n_jobs=-1,
    verbose=1,
    scoring='r2'
)

# Run the search on the training data only (324 combinations x 3 folds each)
grid_search.fit(X_train, y_train)

# Estimator with the best cross-validated score found by the search
best_rf = grid_search.best_estimator_
print("Best hyperparameters:", grid_search.best_params_)

# Predict on the held-out test set
y_pred = best_rf.predict(X_test)

# Evaluate the tuned model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

print(f"Random Forest --> MSE: {mse:.2f}, R²: {r2:.2f}")

# Save into a "models" folder under the current working directory.
# exist_ok=True creates the folder when it is missing and does nothing when it
# already exists, so no separate existence check is needed.
models_folder = os.path.join(os.getcwd(), 'models')
os.makedirs(models_folder, exist_ok=True)

model_path = os.path.join(models_folder, 'random_forest_flight_price_model.pkl')

joblib.dump(best_rf, model_path)
print(f"Model saved at {model_path}")

# List the folder contents to confirm the file was written
print("Files in models folder:", os.listdir(models_folder))
