In [65]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor

In [66]:
data = pd.read_csv('./data/train_data.csv', index_col=0)
data.head()

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712


In [67]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=24)

In [68]:
train_x = train_set.drop('price', axis=1)
y = train_set['price'].copy()

In [69]:
ohot_cats = ['airline', 'departure_time', 'flight', 'arrival_time', 'source_city', 'destination_city']
ordinal_cats = ['stops', 'class']
nums = ['duration', 'days_left']

In [70]:
print(train_set.stops.unique())
print(train_set['class'].unique())

['zero' 'one' 'two_or_more']
['Economy' 'Business']


In [71]:
stops_enc = OrdinalEncoder(categories=[['zero', 'one', 'two_or_more']])
class_enc = OrdinalEncoder(categories=[['Economy', 'Business']])

Ba'zi ustunlarda kategoriyalarimiz soni ko'pligi sabab piplinedan X lardagi ustunlar soni train va test setlarda xar xil bo'lib qoladi.(train_set dagi barcha kategoriyalar test_set da mavjud bo'lmagani uchun). Buni tuzatis uchun OneHotEncoderni individual ravishda va datasetdagi barcha mavjud kategoriyalarni ko'rsatish orqali hal qilamiz.

In [72]:
def onehotencoder_generator(cat):
    return OneHotEncoder(categories=[data[cat].unique()])

In [73]:
oht_list = [(cat, onehotencoder_generator(cat), [cat,]) for cat in ohot_cats]
pipline = ColumnTransformer([
    ('num', StandardScaler(), nums),
    ('stops', stops_enc, ['stops',]),
    ('class', class_enc, ['class',])
] + oht_list,
sparse_threshold=0)

In [74]:
X = pipline.fit_transform(train_x)

In [75]:
X.shape

(16000, 1344)

In [76]:
model = RandomForestRegressor()
model.fit(X, y)

In [77]:
test_X = pipline.fit_transform(test_set.drop('price', axis=1))
y_predict = model.predict(test_X)
y_test = test_set['price'].copy()

In [78]:
from sklearn.metrics import mean_absolute_error, mean_squared_error

MAE = mean_absolute_error(y_test, y_predict)
RMSE = np.sqrt(mean_squared_error(y_test, y_predict))
print(f"{MAE=}")
print(f"{RMSE=}")

MAE=1660.3317915
RMSE=3403.0194848935557
