In [27]:
import pandas as pd
import tensorflow as tf
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import os
from sklearn.model_selection import train_test_split
from imblearn.under_sampling import RandomUnderSampler

from sklearn.metrics import mean_squared_error, mean_absolute_error
from sklearn.metrics import confusion_matrix, classification_report

## Решить задачу классификации

In [28]:

df = pd.read_csv(os.path.join('..', '..', 'data', 'ForUbung', 'neo_task.csv'))

x = lambda x: x.mean() if x.notna().any() else 0
group = df.groupby('miss_distance')['relative_velocity'].transform('mean').iat[0]
df['relative_velocity'].fillna(group, inplace=True)

x = lambda x: x.median() if x.notna().any() else 0
group = df.groupby('est_diameter_min')['absolute_magnitude'].transform(x)
df['absolute_magnitude'].fillna(group, inplace=True)

table = df['est_diameter_max'] - df['est_diameter_min']
df['est_diameter_max'].fillna(df['est_diameter_min'] + table.mean(), inplace=True)

df['name'].fillna('0 unknow', inplace=True)

df['id'].fillna(df['id'].mode()[1], inplace=True)

df.loc[df['absolute_magnitude'] == 0]
df = df.drop(index=df.loc[df['absolute_magnitude'] == 0].index)

dict_patch = {
    False: 0,
    True: 1
}
df['hazardous'] = df['hazardous'].map(dict_patch)

X = df.drop(['name', 'hazardous', 'est_diameter_max'], axis=1)
Y = df['hazardous']
rus = RandomUnderSampler()

X_resampled, y_resampled = rus.fit_resample(X, Y)

X_train, X_test, y_train, y_test = train_test_split(X_resampled, y_resampled, train_size=0.2, random_state=42, stratify=y_resampled)

In [29]:
X_train.shape

(3536, 5)

In [30]:
model_classification_1 = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(1, activation="sigmoid", input_shape=(5, )),
    ]
)
w0 = 1 / y_train[y_train==0].shape[0]
w1 = 1 / y_train[y_train==1].shape[0]

model_classification_1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.001), loss="binary_crossentropy")

model_classification_1.fit(X_train, y_train, epochs=50, verbose=None, class_weight={0: w0, 1: w1})

y_pred = np.around(model_classification_1.predict(X_test, verbose=None))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00      7072
           1       0.50      1.00      0.67      7072

    accuracy                           0.50     14144
   macro avg       0.25      0.50      0.33     14144
weighted avg       0.25      0.50      0.33     14144

[[   0 7072]
 [   0 7072]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [31]:
model_classification_1 = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation="relu", input_shape=(5,)),
        tf.keras.layers.Dense(128, activation="relu"),
        tf.keras.layers.Dropout(0.05),
        tf.keras.layers.Dense(64, activation="relu"),
        tf.keras.layers.Dense(32, activation="relu"),
        tf.keras.layers.Dense(16, activation="relu"),
        # используем 1 нейрон и sigmoid
        tf.keras.layers.Dense(1, activation="sigmoid"),
    ]
)
model_classification_1.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), loss="binary_crossentropy")
model_classification_1.fit(X_train, y_train, epochs=25, verbose=None,
                           class_weight={0: w0, 1: w1})
y_pred = np.around(model_classification_1.predict(X_test, verbose=None))
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.78      0.27      0.40      7072
           1       0.56      0.92      0.70      7072

    accuracy                           0.60     14144
   macro avg       0.67      0.60      0.55     14144
weighted avg       0.67      0.60      0.55     14144

[[1922 5150]
 [ 533 6539]]


## Решить задачу регрессии

In [32]:
df = pd.read_csv(os.path.join('..', '..', 'data', 'ForUbung', 'trip_duration_task.csv'))

df['vendor_id'] = df['vendor_id'].fillna(1)
df['vendor_id'] = df['vendor_id'].astype(int)

df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime']).map(pd.Timestamp.timestamp)
df['dropoff_datetime'] = pd.to_datetime(df['dropoff_datetime']).map(pd.Timestamp.timestamp)

group = df.groupby('dropoff_latitude')['pickup_latitude'].transform('median').median()
df['pickup_latitude'] = df['pickup_latitude'].fillna(group)

null_passeger = df.loc[df['passenger_count'] == 0]

df = df.drop(index=null_passeger.index)

del_latidute = (np.pi / 180) * (df['dropoff_latitude'] - df['pickup_latitude'])
mean_latidude = (df['dropoff_latitude'] + df['pickup_latitude']) / 2
del_longitude = (np.pi / 180) * (df['dropoff_longitude'] - df['pickup_longitude'])

distance = pd.DataFrame(6371 * np.sqrt(np.power(del_latidute, 2) + np.power(np.cos(mean_latidude) * del_longitude, 2)))
df['distance'] = distance

df['id'] = df['id'].transform(lambda x: x.str[2:]).astype(int)


X = df.drop('trip_duration', axis=1)
Y = df[['trip_duration']]

X_train, X_test, y_train, y_test = train_test_split(X, Y, train_size=0.5, random_state=42)

In [33]:
model_regression = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(1, activation="linear"),
    ]
)
model_regression.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), loss="mse")

model_regression.fit(X_train, y_train)

print(mean_absolute_error(y_test, model_regression.predict(X_test)))
print(mean_squared_error(y_test, model_regression.predict(X_test)))

10938.52220197673
147815076.99745306


In [34]:
X_train.shape

(364644, 10)

In [35]:
model_regression = tf.keras.Sequential(
    [
        tf.keras.layers.Dense(64, activation="relu", input_shape=(10,)),
        tf.keras.layers.Dense(32, activation="linear"),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(16, activation="relu"),
        tf.keras.layers.Dropout(0.1),
        tf.keras.layers.Dense(1, activation="linear"),
    ]
)

model_regression.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=0.005), loss="mse")


948.1718471035038
10617664.17287037


In [37]:
model_regression.fit(X_train, y_train, epochs=10, verbose=None)

print(mean_absolute_error(y_test, model_regression.predict(X_test)))
print(mean_squared_error(y_test, model_regression.predict(X_test)))

555.3080880271689
9799636.745708058
