In [52]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestRegressor
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.svm import SVR
from sklearn.linear_model import ElasticNet, Ridge, Lars, LassoLars, LogisticRegression, TweedieRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score

In [2]:
data = pd.read_csv('./data/train_data.csv', index_col=0)
data.head()

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left,price
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,Vistara,UK-810,Bangalore,Early_Morning,one,Night,Mumbai,Economy,14.25,21,7212
2,SpiceJet,SG-5094,Hyderabad,Evening,zero,Night,Kolkata,Economy,1.75,7,5292
3,Vistara,UK-846,Bangalore,Morning,one,Evening,Delhi,Business,9.58,5,60553
4,Vistara,UK-706,Kolkata,Morning,one,Evening,Hyderabad,Economy,6.75,28,5760
5,Indigo,6E-5394,Chennai,Early_Morning,zero,Morning,Mumbai,Economy,2.0,4,10712


In [3]:
ffull = data.flight.unique()
print(ffull)
print(ffull.shape) # flight kategoriyalar soni
fchars = pd.Series(ffull).apply(lambda x :x.split('-')[0]).unique()
print(fchars)
print(len(fchars)) # flight boshidagi harflar uchun kategoriyalar soni

['UK-810' 'SG-5094' 'UK-846' ... 'AI-489' 'G8-213' '6E-7201']
(1310,)
['UK' 'SG' '6E' 'AI' 'G8' 'I5']
6


### Wikipediadan o'qib qilingan xulosa shu bo'ldikiy, bizdagi flight ning raqamli qismi uchish narxiga ta'sirsiz. Lekin xarflar havo yo'lini bildirib narxga ta'sir qilishi mumkin. Shu sabab flight ustunidan raqamlarni olib tashlaymiz.

In [5]:
from typing import Union

class NumCutter(BaseEstimator, TransformerMixin):
    # transformer qayta foydalanishga moslanib cols da kesib olinadigan ustunlar ro'yxatini va split_char da kesiladigan belgini qabul qiladi
    def __init__(self, cols: Union[list, np.array, pd.Index], split_char='-') -> None:
        self.cols = cols
        self.split_char = split_char
    def fit(self, X, y=None):
        return self
    def transform(self, X: pd.DataFrame):
        for col in self.cols:
            X[col] = X[col].apply(lambda item: item.split(self.split_char)[0])
        return X

In [6]:
train_set, test_set = train_test_split(data, test_size=0.2, random_state=24)

In [7]:
train_x = train_set.drop('price', axis=1)
y = train_set['price'].copy()

In [8]:
ohot_cats = ['airline', 'flight', 'departure_time', 'arrival_time', 'source_city', 'destination_city']
ordinal_cats = ['stops', 'class']
nums = ['duration', 'days_left']

In [9]:
print(train_set.stops.unique())
print(train_set['class'].unique())

['zero' 'one' 'two_or_more']
['Economy' 'Business']


In [10]:
# oldindan yaratib olishdan maqsad tartibiga ko'ra 0 1 va 2 raqamlarini oladi
stops_enc = OrdinalEncoder(categories=[['zero', 'one', 'two_or_more']])
class_enc = OrdinalEncoder(categories=[['Economy', 'Business']])

In [11]:
main_tf = ColumnTransformer([
    ('num', StandardScaler(), nums),
    ('stops', stops_enc, ['stops',]),
    ('class', class_enc, ['class',]),
    ('ohots', OneHotEncoder(), ohot_cats)
],# + oht_list,
sparse_threshold=0)

pipline = Pipeline([
    ('splitter_tf', NumCutter(['flight'])),
    ('main_tf', main_tf),
])

In [12]:
X = pipline.fit_transform(train_x)

In [13]:
X.shape

(16000, 40)

In [17]:
test_X = pipline.fit_transform(test_set.drop('price', axis=1))
y_test = test_set['price'].copy()

In [None]:
# model = RandomForestRegressor()
# model.fit(X, y)
# predict = model.predict(test_X)
# MAE = mean_absolute_error(y_test, predict)
# RMSE = np.sqrt(mean_squared_error(y_test, predict))
# print(f"{MAE=}")
# print(f"{RMSE=}")

In [22]:
RF_model = RandomForestRegressor()
RF_model.fit(X, y)
RF_predict = RF_model.predict(test_X)
RF_MAE = mean_absolute_error(y_test, RF_predict)
RF_RMSE = np.sqrt(mean_squared_error(y_test, RF_predict))
print(f"{RF_MAE=}")
print(f"{RF_RMSE=}")

RF_MAE=2093.0162648333335
RF_RMSE=3904.4180583584457


In [25]:
SVM_model = SVR()
SVM_model.fit(X, y)
SVM_predict = SVM_model.predict(test_X)
SVM_MAE = mean_absolute_error(y_test, SVM_predict)
SVM_RMSE = np.sqrt(mean_squared_error(y_test, SVM_predict))
print(f"{SVM_MAE=}")
print(f"{SVM_RMSE=}")

SVM_MAE=15895.734773245631
SVM_RMSE=26250.463798357876


In [37]:
EL_model = ElasticNet()
EL_model.fit(X, y)
EL_predict = EL_model.predict(test_X)
EL_MAE = mean_absolute_error(y_test, EL_predict)
EL_RMSE = np.sqrt(mean_squared_error(y_test, EL_predict))
print(f"{EL_MAE=}")
print(f"{EL_RMSE=}")

EL_MAE=12958.869213502567
EL_RMSE=15687.371768934929


In [38]:
RD_model = Ridge()
RD_model.fit(X, y)
RD_predict = RD_model.predict(test_X)
RD_MAE = mean_absolute_error(y_test, RD_predict)
RD_RMSE = np.sqrt(mean_squared_error(y_test, RD_predict))
print(f"{RD_MAE=}")
print(f"{RD_RMSE=}")

RD_MAE=4503.473147579058
RD_RMSE=6755.79765811301


In [39]:
LRS_model = Lars()
LRS_model.fit(X, y)
LRS_predict = LRS_model.predict(test_X)
LRS_MAE = mean_absolute_error(y_test, LRS_predict)
LRS_RMSE = np.sqrt(mean_squared_error(y_test, LRS_predict))
print(f"{LRS_MAE=}")
print(f"{LRS_RMSE=}")

LRS_MAE=4505.17375
LRS_RMSE=6756.970719912141




In [40]:
LLRS_model = LassoLars()
LLRS_model.fit(X, y)
LLRS_predict = LLRS_model.predict(test_X)
LLRS_MAE = mean_absolute_error(y_test, LLRS_predict)
LLRS_RMSE = np.sqrt(mean_squared_error(y_test, LLRS_predict))
print(f"{LLRS_MAE=}")
print(f"{LLRS_RMSE=}")

LLRS_MAE=4501.713374682694
LLRS_RMSE=6755.515246387858


In [41]:
LOG_model = LogisticRegression()
LOG_model.fit(X, y)
LOG_predict = LOG_model.predict(test_X)
LOG_MAE = mean_absolute_error(y_test, LOG_predict)
LOG_RMSE = np.sqrt(mean_squared_error(y_test, LOG_predict))
print(f"{LOG_MAE=}")
print(f"{LOG_RMSE=}")

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LOG_MAE=3328.4195
LOG_RMSE=8032.70612209858


In [42]:
TR_model = TweedieRegressor()
TR_model.fit(X, y)
TR_predict = TR_model.predict(test_X)
TR_MAE = mean_absolute_error(y_test, TR_predict)
TR_RMSE = np.sqrt(mean_squared_error(y_test, TR_predict))
print(f"{TR_MAE=}")
print(f"{TR_RMSE=}")

TR_MAE=15379.452365262765
TR_RMSE=18093.28455544584


In [44]:
NN_model = MLPRegressor()
NN_model.fit(X, y)
NN_predict = NN_model.predict(test_X)
NN_MAE = mean_absolute_error(y_test, NN_predict)
NN_RMSE = np.sqrt(mean_squared_error(y_test, NN_predict))
print(f"{NN_MAE=}")
print(f"{NN_RMSE=}")

NN_MAE=9938.254807761208
NN_RMSE=12608.933900303156




In [51]:
DT_model = DecisionTreeRegressor()
DT_model.fit(X, y)
DT_predict = DT_model.predict(test_X)
DT_MAE = mean_absolute_error(y_test, DT_predict)
DT_RMSE = np.sqrt(mean_squared_error(y_test, DT_predict))
print(f"{DT_MAE=}")
print(f"{DT_RMSE=}")

DT_MAE=2363.174
DT_RMSE=5070.578932318775


### Natijalarga ko'ra RandomForest modelida to'xtalinildi

# Test prediction results

In [45]:
test_data = pd.read_csv('./data/test_data.csv', index_col=0)
test_data.head()

Unnamed: 0_level_0,airline,flight,source_city,departure_time,stops,arrival_time,destination_city,class,duration,days_left
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
1,Air_India,AI-765,Kolkata,Evening,one,Night,Delhi,Business,28.25,2
2,Vistara,UK-747,Delhi,Early_Morning,one,Night,Mumbai,Business,13.83,34
3,Air_India,AI-570,Mumbai,Early_Morning,zero,Early_Morning,Chennai,Business,2.0,30
4,AirAsia,I5-974,Hyderabad,Night,one,Late_Night,Delhi,Economy,5.17,26
5,Air_India,AI-770,Kolkata,Night,one,Afternoon,Mumbai,Economy,16.33,35


In [48]:
ss_df = pd.read_csv('./data/sample_solution.csv', index_col=0)
ss_df.shape

(5000, 1)

In [59]:
X_prepered = pipline.fit_transform(test_data)
ss_df['price'] = RF_model.predict(X_prepered)

In [60]:
ss_df.head()

Unnamed: 0_level_0,price
id,Unnamed: 1_level_1
1,53568.82
2,55497.15
3,23102.34
4,3380.99
5,5728.02


In [61]:
ss_df.to_csv('mysolution.csv')