In [2]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn import linear_model

import statsmodels.api as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.stats.diagnostic import het_white

from scipy import stats

In [294]:
scaler = StandardScaler()

In [3]:
# df = pd.read_json("df.json")
df = pd.read_json("df.json")
df.head()

Unnamed: 0,airline,number_of_changes,origin_city_code,destination_country_code,search_weekday,depart_weekday,diff_days,distance,price,number_of_changes_category,airline_category,destination_category
0,Turkish Airlines,2,MOW,TR,5,3,12,1608.63,24159,2,Turkish Airlines,tourism
1,S7 Airlines,1,MOW,RU,5,1,10,3379.79,11160,1,S7 Airlines,domestic
2,Turkish Airlines,2,MOW,TR,5,0,9,2086.37,24260,2,Turkish Airlines,tourism
3,Pobeda,0,MOW,RU,5,0,9,1270.49,4649,0,Pobeda,domestic
4,Aeroflot,1,MOW,RU,5,2,4,7042.45,20435,1,Aeroflot,domestic


In [224]:
categorical_columns = ["search_weekday", "depart_weekday", "number_of_changes_category", "airline_category", "destination_category"]
numerical_columns = ["diff_days", "distance"]
target_columns = ["price"]

In [225]:
dummies = pd.get_dummies(data=df[categorical_columns], drop_first=True, dtype=int)
numerical_values = df[numerical_columns]
y = df[target_columns]

X = pd.concat([dummies, numerical_values], axis=1)
X = sm.add_constant(X)

In [226]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=11)

In [227]:
vif_data = pd.DataFrame()
vif_data['feature'] = X_train.columns
vif_data['VIF'] = [variance_inflation_factor(X_train.values, i) for i in range(X_train.shape[1])]
vif_data

Unnamed: 0,feature,VIF
0,const,30.767966
1,search_weekday,1.001149
2,depart_weekday,1.007606
3,number_of_changes_category_1,1.83452
4,number_of_changes_category_2,2.512865
5,number_of_changes_category_3+,1.642167
6,airline_category_Azimut,1.991553
7,airline_category_Nordwind Airlines,1.520187
8,airline_category_Other,1.956152
9,airline_category_Pegasus Airlines,1.946393


In [228]:
vif_data[vif_data["VIF"] >= 3.0]

Unnamed: 0,feature,VIF
0,const,30.767966
14,destination_category_culture,3.607605
15,destination_category_domestic,5.377524
16,destination_category_tourism,3.591784


In [229]:
model = sm.OLS(y_train, X_train)
res = model.fit()

In [230]:
res.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.726
Model:,OLS,Adj. R-squared:,0.726
Method:,Least Squares,F-statistic:,3986.0
Date:,"Sat, 15 Jun 2024",Prob (F-statistic):,0.0
Time:,23:10:28,Log-Likelihood:,-278070.0
No. Observations:,27036,AIC:,556200.0
Df Residuals:,27017,BIC:,556300.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,5264.4747,239.200,22.009,0.000,4795.630,5733.319
search_weekday,101.2438,21.843,4.635,0.000,58.431,144.057
depart_weekday,28.1279,22.251,1.264,0.206,-15.485,71.741
number_of_changes_category_1,3009.7024,117.281,25.662,0.000,2779.826,3239.579
number_of_changes_category_2,8901.4094,161.672,55.058,0.000,8584.524,9218.295
number_of_changes_category_3+,1.362e+04,220.654,61.730,0.000,1.32e+04,1.41e+04
airline_category_Azimut,-3233.7873,171.740,-18.830,0.000,-3570.407,-2897.168
airline_category_Nordwind Airlines,-3226.0158,200.360,-16.101,0.000,-3618.732,-2833.299
airline_category_Other,-3340.3025,232.267,-14.381,0.000,-3795.557,-2885.048

0,1,2,3
Omnibus:,8265.151,Durbin-Watson:,1.982
Prob(Omnibus):,0.0,Jarque-Bera (JB):,55079.41
Skew:,1.304,Prob(JB):,0.0
Kurtosis:,9.488,Cond. No.,34200.0


In [231]:
white_test = het_white(res.resid, res.model.exog)
labels = ['LM', 'LM p-value', 'F', 'F p-value']
print(dict(zip(labels, white_test)))

{'LM': 8115.730211970274, 'LM p-value': 0.0, 'F': 84.21698681381822, 'F p-value': 0.0}


In [232]:
y_pred = res.predict(X_test)

In [233]:
print("Среднеквадратическая погрешность:", metrics.root_mean_squared_error(y_test, y_pred))
print("Абсолютная погрешность:", metrics.mean_absolute_error(y_test, y_pred))

Среднеквадратическая погрешность: 7233.762758692457
Абсолютная погрешность: 4720.07502141427


In [234]:
df["distance_domestic"] = 0.0
df["distance_abroad"] = 0.0

domestic_flights = df["destination_category"] == "domestic"
df.loc[domestic_flights, "distance_domestic"] = df[domestic_flights]["distance"]

abroad_flights = df["destination_category"] != "domestic"
df.loc[abroad_flights, "distance_abroad"] = df[abroad_flights]["distance"]

In [235]:
categorical_columns = ["search_weekday", "depart_weekday", "number_of_changes_category", "airline_category", "destination_category"]
numerical_columns = ["diff_days", "distance_domestic", "distance_abroad"]
target_columns = ["price"]

In [236]:
dummies = pd.get_dummies(data=df[categorical_columns], drop_first=True, dtype=int)
numerical_values = df[numerical_columns]
y = df[target_columns]

X = pd.concat([dummies, numerical_values], axis=1)
X = sm.add_constant(X)

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=11)

In [238]:
model = sm.OLS(y_train, X_train)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.735
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,3935.0
Date:,"Sat, 15 Jun 2024",Prob (F-statistic):,0.0
Time:,23:10:29,Log-Likelihood:,-277660.0
No. Observations:,27036,AIC:,555400.0
Df Residuals:,27016,BIC:,555500.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,4140.5784,238.818,17.338,0.000,3672.482,4608.675
search_weekday,101.8628,21.515,4.735,0.000,59.692,144.033
depart_weekday,31.9216,21.918,1.456,0.145,-11.038,74.881
number_of_changes_category_1,3641.4932,117.584,30.969,0.000,3411.023,3871.964
number_of_changes_category_2,9729.4991,161.819,60.126,0.000,9412.326,1e+04
number_of_changes_category_3+,1.444e+04,219.197,65.880,0.000,1.4e+04,1.49e+04
airline_category_Azimut,-4003.5369,171.260,-23.377,0.000,-4339.215,-3667.858
airline_category_Nordwind Airlines,-3043.5149,197.456,-15.414,0.000,-3430.539,-2656.491
airline_category_Other,-3477.6534,228.832,-15.197,0.000,-3926.176,-3029.131

0,1,2,3
Omnibus:,8294.045,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,62922.1
Skew:,1.269,Prob(JB):,0.0
Kurtosis:,10.03,Cond. No.,25800.0


In [239]:
y_pred = res.predict(X_test)

In [240]:
print("Среднеквадратическая погрешность:", metrics.root_mean_squared_error(y_test, y_pred))
print("Абсолютная погрешность:", metrics.mean_absolute_error(y_test, y_pred))

Среднеквадратическая погрешность: 7147.924645267195
Абсолютная погрешность: 4616.146082712166


In [241]:
df["diff_days_squared"] = df["diff_days"]**2

In [242]:
categorical_columns = ["search_weekday", "depart_weekday", "number_of_changes_category", "airline_category", "destination_category"]
numerical_columns = ["diff_days", "diff_days_squared", "distance"]
target_columns = ["price"]

In [243]:
dummies = pd.get_dummies(data=df[categorical_columns], drop_first=True, dtype=int)
numerical_values = df[numerical_columns]
y = df[target_columns]

X = pd.concat([dummies, numerical_values], axis=1)

In [244]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=11)

In [245]:
model = sm.OLS(y_train, X_train)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,price,R-squared (uncentered):,0.889
Model:,OLS,Adj. R-squared (uncentered):,0.888
Method:,Least Squares,F-statistic:,11340.0
Date:,"Sat, 15 Jun 2024",Prob (F-statistic):,0.0
Time:,23:10:29,Log-Likelihood:,-278310.0
No. Observations:,27036,AIC:,556700.0
Df Residuals:,27017,BIC:,556800.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
search_weekday,220.0848,21.398,10.285,0.000,178.144,262.025
depart_weekday,157.8196,21.669,7.283,0.000,115.346,200.293
number_of_changes_category_1,3795.7085,112.949,33.606,0.000,3574.323,4017.094
number_of_changes_category_2,9759.5615,158.301,61.652,0.000,9449.284,1.01e+04
number_of_changes_category_3+,1.464e+04,217.974,67.150,0.000,1.42e+04,1.51e+04
airline_category_Azimut,-2170.9427,169.152,-12.834,0.000,-2502.490,-1839.396
airline_category_Nordwind Airlines,-1944.6853,195.984,-9.923,0.000,-2328.825,-1560.546
airline_category_Other,-1951.6580,228.052,-8.558,0.000,-2398.651,-1504.665
airline_category_Pegasus Airlines,2674.6510,222.483,12.022,0.000,2238.573,3110.729

0,1,2,3
Omnibus:,7807.821,Durbin-Watson:,1.981
Prob(Omnibus):,0.0,Jarque-Bera (JB):,49097.627
Skew:,1.24,Prob(JB):,0.0
Kurtosis:,9.119,Cond. No.,195000.0


In [297]:
categorical_columns = ["search_weekday", "depart_weekday", "number_of_changes_category", "airline_category", "destination_category"]
numerical_columns = ["diff_days", "diff_days_squared", "distance_domestic", "distance_abroad"]
target_columns = ["price"]

In [298]:
dummies = pd.get_dummies(data=df[categorical_columns], drop_first=True, dtype=int)
numerical_values = df[numerical_columns]
y = df[target_columns]

X = pd.concat([dummies, numerical_values], axis=1)

In [299]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=11)

In [300]:
X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns, index=X_test.index)

In [301]:
X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
X_train_scaled = sm.add_constant(X_train_scaled)
X_test_scaled = sm.add_constant(X_test_scaled)

In [302]:
model = sm.OLS(y_train, X_train_scaled)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.735
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,3739.0
Date:,"Sat, 15 Jun 2024",Prob (F-statistic):,0.0
Time:,23:39:17,Log-Likelihood:,-277660.0
No. Observations:,27036,AIC:,555400.0
Df Residuals:,27015,BIC:,555500.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.659e+04,42.474,390.584,0.000,1.65e+04,1.67e+04
search_weekday,202.6952,42.506,4.769,0.000,119.381,286.009
depart_weekday,61.7454,42.637,1.448,0.148,-21.825,145.316
number_of_changes_category_1,1812.8512,58.557,30.959,0.000,1698.076,1927.627
number_of_changes_category_2,4105.2986,68.572,59.869,0.000,3970.894,4239.703
number_of_changes_category_3+,3606.3994,55.161,65.379,0.000,3498.281,3714.518
airline_category_Azimut,-1402.5745,61.289,-22.884,0.000,-1522.705,-1282.444
airline_category_Nordwind Airlines,-797.5008,52.676,-15.140,0.000,-900.748,-694.253
airline_category_Other,-892.0239,59.710,-14.939,0.000,-1009.058,-774.989

0,1,2,3
Omnibus:,8273.464,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,62617.757
Skew:,1.266,Prob(JB):,0.0
Kurtosis:,10.013,Cond. No.,9.45


In [303]:
y_pred = res.predict(X_test_scaled)

In [304]:
print("Среднеквадратическая погрешность:", metrics.root_mean_squared_error(y_test, y_pred))
print("Абсолютная погрешность:", metrics.mean_absolute_error(y_test, y_pred))

Среднеквадратическая погрешность: 7148.60579709232
Абсолютная погрешность: 4632.082629478821


In [295]:
categorical_columns = ["search_weekday", "depart_weekday", "number_of_changes_category", "airline_category", "destination_category"]
numerical_columns = ["diff_days", "diff_days_squared", "distance_domestic", "distance_abroad"]
target_columns = ["price"]

dummies = pd.get_dummies(data=df[categorical_columns], drop_first=True, dtype=int)
numerical_values = df[numerical_columns]
y = df[target_columns]

X = pd.concat([dummies, numerical_values], axis=1)
X.drop(columns=["airline_category_Uzbekistan Airways"], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=11)

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns, index=X_test.index)

X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
X_train_scaled = sm.add_constant(X_train_scaled)
X_test_scaled = sm.add_constant(X_test_scaled)

model = sm.OLS(y_train, X_train_scaled)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.735
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,3936.0
Date:,"Sat, 15 Jun 2024",Prob (F-statistic):,0.0
Time:,23:28:54,Log-Likelihood:,-277660.0
No. Observations:,27036,AIC:,555400.0
Df Residuals:,27016,BIC:,555500.0
Df Model:,19,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.659e+04,42.474,390.591,0.000,1.65e+04,1.67e+04
search_weekday,202.7125,42.505,4.769,0.000,119.400,286.025
depart_weekday,61.7368,42.636,1.448,0.148,-21.832,145.306
number_of_changes_category_1,1813.0248,58.549,30.966,0.000,1698.265,1927.784
number_of_changes_category_2,4105.8522,68.509,59.931,0.000,3971.570,4240.134
number_of_changes_category_3+,3606.8915,55.100,65.461,0.000,3498.893,3714.890
airline_category_Azimut,-1406.2167,58.246,-24.143,0.000,-1520.381,-1292.052
airline_category_Nordwind Airlines,-800.2678,50.643,-15.802,0.000,-899.531,-701.005
airline_category_Other,-896.2631,55.430,-16.169,0.000,-1004.908,-787.618

0,1,2,3
Omnibus:,8274.355,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,62663.061
Skew:,1.266,Prob(JB):,0.0
Kurtosis:,10.015,Cond. No.,9.25


In [296]:
categorical_columns = ["search_weekday", "depart_weekday", "number_of_changes_category", "airline_category", "destination_category"]
numerical_columns = ["diff_days", "diff_days_squared", "distance_domestic", "distance_abroad"]
target_columns = ["price"]

dummies = pd.get_dummies(data=df[categorical_columns], drop_first=True, dtype=int)
numerical_values = df[numerical_columns]
y = df[target_columns]

X = pd.concat([dummies, numerical_values], axis=1)
X.drop(columns=["airline_category_Uzbekistan Airways", "depart_weekday"], inplace=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=11)

X_train_scaled = pd.DataFrame(scaler.fit_transform(X_train), columns=X_train.columns, index=X_train.index)
X_test_scaled = pd.DataFrame(scaler.fit_transform(X_test), columns=X_test.columns, index=X_test.index)

X_train = sm.add_constant(X_train)
X_test = sm.add_constant(X_test)
X_train_scaled = sm.add_constant(X_train_scaled)
X_test_scaled = sm.add_constant(X_test_scaled)

model = sm.OLS(y_train, X_train_scaled)
res = model.fit()
res.summary()

0,1,2,3
Dep. Variable:,price,R-squared:,0.735
Model:,OLS,Adj. R-squared:,0.734
Method:,Least Squares,F-statistic:,4154.0
Date:,"Sat, 15 Jun 2024",Prob (F-statistic):,0.0
Time:,23:32:14,Log-Likelihood:,-277660.0
No. Observations:,27036,AIC:,555400.0
Df Residuals:,27017,BIC:,555500.0
Df Model:,18,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,1.659e+04,42.474,390.583,0.000,1.65e+04,1.67e+04
search_weekday,203.0160,42.506,4.776,0.000,119.703,286.329
number_of_changes_category_1,1814.3498,58.543,30.992,0.000,1699.602,1929.098
number_of_changes_category_2,4107.5200,68.501,59.963,0.000,3973.254,4241.786
number_of_changes_category_3+,3607.0564,55.101,65.463,0.000,3499.056,3715.057
airline_category_Azimut,-1406.7644,58.246,-24.152,0.000,-1520.929,-1292.600
airline_category_Nordwind Airlines,-800.2243,50.644,-15.801,0.000,-899.490,-700.959
airline_category_Other,-898.1883,55.415,-16.208,0.000,-1006.804,-789.572
airline_category_Pegasus Airlines,318.1511,55.628,5.719,0.000,209.117,427.186

0,1,2,3
Omnibus:,8275.985,Durbin-Watson:,1.983
Prob(Omnibus):,0.0,Jarque-Bera (JB):,62664.542
Skew:,1.266,Prob(JB):,0.0
Kurtosis:,10.015,Cond. No.,9.25
