In [17]:
import warnings
warnings.filterwarnings('ignore')

# 1. Подготовка данных


## 1.1 Загрузка и просмотр


In [18]:
import pandas as pd

futuristic_city = pd.read_csv("futuristic_city_traffic.csv")

futuristic_city.head()

Unnamed: 0,City,Vehicle Type,Weather,Economic Condition,Day Of Week,Hour Of Day,Speed,Is Peak Hour,Random Event Occurred,Energy Consumption,Traffic Density
0,SolarisVille,Drone,Snowy,Stable,Sunday,20,29.4268,0,0,14.7134,0.5241
1,AquaCity,Flying Car,Solar Flare,Recession,Wednesday,2,118.8,0,0,143.5682,0.3208
2,Neuroburg,Autonomous Vehicle,Solar Flare,Recession,Wednesday,16,100.3904,0,0,91.264,0.0415
3,Ecoopolis,Drone,Clear,Booming,Thursday,8,76.8,1,0,46.0753,0.1811
4,AquaCity,Autonomous Vehicle,Solar Flare,Stable,Saturday,16,45.2176,0,0,40.1934,0.4544


In [19]:
futuristic_city.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1219567 entries, 0 to 1219566
Data columns (total 11 columns):
 #   Column                 Non-Null Count    Dtype  
---  ------                 --------------    -----  
 0   City                   1219567 non-null  object 
 1   Vehicle Type           1219567 non-null  object 
 2   Weather                1219567 non-null  object 
 3   Economic Condition     1219567 non-null  object 
 4   Day Of Week            1219567 non-null  object 
 5   Hour Of Day            1219567 non-null  int64  
 6   Speed                  1219567 non-null  float64
 7   Is Peak Hour           1219567 non-null  int64  
 8   Random Event Occurred  1219567 non-null  int64  
 9   Energy Consumption     1219567 non-null  float64
 10  Traffic Density        1219567 non-null  float64
dtypes: float64(3), int64(3), object(5)
memory usage: 102.4+ MB


## 1.2 Удаление выбросов


In [20]:
def delete_outlier(df: pd.DataFrame):
    numeric_columns = df.describe().columns

    outlier = df[numeric_columns]

    Q1 = outlier.quantile(0.25)

    Q3 = outlier.quantile(0.75)

    IQR = Q3 - Q1

    filtered_data = outlier[
        ~((outlier < (Q1 - 1.5 * IQR)) |
          (outlier > (Q3 + 1.5 * IQR))
          ).any(axis=1)]

    index_list = list(filtered_data.index.values)

    print(
        f'Удалено: {int(100 - len(filtered_data) / len(df) * 100)}% данных ({len(df) - len(filtered_data):_})')

    return df[df.index.isin(index_list)]

In [21]:
futuristic_city = delete_outlier(futuristic_city)

futuristic_city.info()

Удалено: 21% данных (267_195)
<class 'pandas.core.frame.DataFrame'>
Index: 952372 entries, 0 to 1219565
Data columns (total 11 columns):
 #   Column                 Non-Null Count   Dtype  
---  ------                 --------------   -----  
 0   City                   952372 non-null  object 
 1   Vehicle Type           952372 non-null  object 
 2   Weather                952372 non-null  object 
 3   Economic Condition     952372 non-null  object 
 4   Day Of Week            952372 non-null  object 
 5   Hour Of Day            952372 non-null  int64  
 6   Speed                  952372 non-null  float64
 7   Is Peak Hour           952372 non-null  int64  
 8   Random Event Occurred  952372 non-null  int64  
 9   Energy Consumption     952372 non-null  float64
 10  Traffic Density        952372 non-null  float64
dtypes: float64(3), int64(3), object(5)
memory usage: 87.2+ MB


## 1.3 Создание датасетов регрессии и классификации


In [22]:
x_regr = futuristic_city.drop("Speed", axis=1)
y_regr = futuristic_city["Speed"]

x_clsf = futuristic_city.drop("Economic Condition", axis=1)
y_clsf = futuristic_city["Economic Condition"]

## 1.4 Замена категориальных данных


In [23]:
x_regr = pd.get_dummies(x_regr, dtype=float)

y_regr = y_regr.astype("float16")

x_clsf = pd.get_dummies(x_clsf, dtype=float)

y_clsf = y_clsf.replace({"Stable": 1, "Booming": 2, "Recession": 0})

## 1.5 Стандартизация данных


In [24]:
from sklearn.preprocessing import MinMaxScaler
import numpy as np

scaler = MinMaxScaler()

x_regr = scaler.fit_transform(x_regr)

y_regr = scaler.fit_transform(np.array(y_regr).reshape(-1, 1)).ravel()

x_clsf = scaler.fit_transform(x_clsf)

## 1.6 Уменьшение размерности


In [25]:
from sklearn.decomposition import PCA

pca_regr = PCA(n_components=10)
x_regr_pca = pd.DataFrame(pca_regr.fit_transform(
    x_regr), columns=pca_regr.get_feature_names_out())

pca_clsf = PCA(n_components=10)
x_clsf_pca = pd.DataFrame(pca_clsf.fit_transform(
    x_clsf), columns=pca_clsf.get_feature_names_out())

In [26]:
from sklearn.feature_selection import SelectKBest

skb_regr = SelectKBest(k=10)
x_regr_skb = pd.DataFrame(skb_regr.fit_transform(
    x_regr, y_regr), columns=skb_regr.get_feature_names_out())

skb_clsf = SelectKBest(k=10)
x_clsf_skb = pd.DataFrame(skb_clsf.fit_transform(
    x_clsf, y_clsf), columns=skb_clsf.get_feature_names_out())

## 1.7 Уменьшение потребления памяти


In [27]:
import numpy as np


def reduce_mem_usage(df: pd.DataFrame):
    start_mem = df.memory_usage().sum() / 1024**2
    for col in df.columns:
        col_type = df[col].dtypes
        if str(col_type)[:5] == 'float' and (df[col] % 1 != 0).any():
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.finfo('f2').min and c_max < np.finfo('f2').max:
                df[col] = df[col].astype(np.float16)
            elif c_min > np.finfo('f4').min and c_max < np.finfo('f4').max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
        elif str(col_type)[:3] == 'int' or (df[col] % 1 == 0).all():
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min > np.iinfo('i1').min and c_max < np.iinfo('i1').max:
                df[col] = df[col].astype(np.int8)
            elif c_min > np.iinfo('i2').min and c_max < np.iinfo('i2').max:
                df[col] = df[col].astype(np.int16)
            elif c_min > np.iinfo('i4').min and c_max < np.iinfo('i4').max:
                df[col] = df[col].astype(np.int32)
            elif c_min > np.iinfo('i8').min and c_max < np.iinfo('i8').max:
                df[col] = df[col].astype(np.int64)
        elif col == 'timestamp':
            df[col] = pd.to_datetime(df[col])
        elif str(col_type)[:8] != 'datetime':
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    print('Потребление памяти сократилось на', round(start_mem - end_mem, 2),
          'Мб (минус', round(100 * (start_mem - end_mem) / start_mem, 1), '%)')
    return df

In [28]:
for df in [x_regr_pca, x_regr_skb, x_clsf_pca, x_clsf_skb]:
    reduce_mem_usage(df)

Потребление памяти сократилось на 54.5 Мб (минус 75.0 %)
Потребление памяти сократилось на 61.76 Мб (минус 85.0 %)
Потребление памяти сократилось на 54.5 Мб (минус 75.0 %)
Потребление памяти сократилось на 61.76 Мб (минус 85.0 %)


# 2. Создание обучающих наборов


In [29]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error as mse, r2_score as r2, classification_report


def test_model(x, y, model, classification=False):
    x_tr, x_test, y_tr, y_test = train_test_split(
        x, y, test_size=0.2, random_state=42
    )

    model = model.fit(x_tr, y_tr)

    y_pred = model.predict(x_test)

    if classification:
        print(classification_report(y_test, y_pred))
    else:
        print(f"* mse: {mse(y_test, y_pred).round(3)}\n* r2 : {r2(y_test, y_pred).round(3)}")

# 3. Создание модели


In [30]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import Pipeline

pipeline = Pipeline([
    ('polynomial', PolynomialFeatures(degree=4)),
    ('model', LinearRegression())
])

In [31]:
from sklearn.linear_model import PassiveAggressiveClassifier

model = PassiveAggressiveClassifier()

# 4. Обучение модели и оценка производительности


In [32]:
test_model(x_regr_pca, y_regr, pipeline)

* mse: 0.0010004043579101562
* r2 : 0.967


In [33]:
test_model(x_regr_skb, y_regr, pipeline)

* mse: 0.0240020751953125
* r2 : 0.321


In [35]:
test_model(x_clsf_pca, y_clsf, model, classification=True)

              precision    recall  f1-score   support

           0       0.36      0.16      0.22     64698
           1       0.38      0.17      0.23     62850
           2       0.32      0.68      0.44     62927

    accuracy                           0.34    190475
   macro avg       0.35      0.34      0.30    190475
weighted avg       0.35      0.34      0.30    190475



In [36]:
test_model(x_clsf_skb, y_clsf, model, classification=True)

              precision    recall  f1-score   support

           0       0.62      0.95      0.75     64698
           1       0.47      0.57      0.52     62850
           2       0.50      0.12      0.19     62927

    accuracy                           0.55    190475
   macro avg       0.53      0.55      0.48    190475
weighted avg       0.53      0.55      0.49    190475

