<a href="https://colab.research.google.com/github/OfficalOffical/BasicTranslator/blob/master/latestScores.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [4]:
import numpy as np
import pandas as pd
from keras.models import Sequential
from sklearn.svm import SVR
from sklearn.multioutput import MultiOutputRegressor
from keras.models import Sequential
from keras.layers import Dense, Conv1D, Flatten, MaxPooling1D, Dropout, BatchNormalization
from keras.optimizers import Adam
from keras.src.regularizers import l1_l2, l2
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from keras.layers import Conv1D, BatchNormalization, Activation, MaxPooling1D, Flatten, Dense, Dropout
from keras.callbacks import EarlyStopping, ModelCheckpoint
from keras.src.saving.saving_api import load_model

In [9]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from scipy.stats import zscore
import re


def work_on_outliers(input_df,inf_threshold,sup_threshold, feature_number,visualize):
    # WORKING ON OUTLIERS

    # Working on negpmax feature (the only one that is showing outliers)
    features = [f for f in input_df.columns[feature_number::4]]
    if visualize:
        fig, axes = plt.subplots(4, 3, figsize=(15, 10))  # Adjust the figsize as needed

        # Flatten the 2D array of subplots
        axes = axes.flatten()

    output_df=input_df.copy()

    for i, feature in enumerate(features):
        Q1 = input_df[feature].quantile(0.25)
        Q3 = input_df[feature].quantile(0.75)
        IQR = Q3 - Q1
        lower_bound = Q1 - inf_threshold * IQR

        if feature_number==3:
            upper_bound=0
        else:
            upper_bound=Q3 + sup_threshold * IQR




        # removing too  small values and positive values for negpmax
        #removing too large values for rms

        outliers = output_df[(output_df[feature] < lower_bound) | (output_df[feature] > upper_bound)]



        output_df = output_df[(output_df[feature] > lower_bound) & (output_df[feature] < upper_bound)]
        if visualize:
            axes[i].scatter(outliers["x"], outliers["y"], c=outliers[feature], cmap='viridis')  # Adjust the colormap as needed
            axes[i].set_title(f'REMOVED OUTLIERS: {feature}')
            axes[i].set_xlabel("X-axis")
            axes[i].set_ylabel("Y-axis")

    if visualize:
        plt.tight_layout()
        plt.show()


    if visualize:

        # let's see if the representation is working better after removing the outliers '
        random_10_percent = output_df.sample(frac=0.1, random_state=42)

        fig, axes = plt.subplots(4, 3, figsize=(15, 10))  # Adjust the figsize as needed
        fig.suptitle("RESULT AFTER REMOVING OUTLIERS (ON A 10% SAMPLE) ", fontsize=16)
        # Flatten the 2D array of subplots
        axes = axes.flatten()

        for i, feature in enumerate(features):
            axes[i].scatter(random_10_percent["x"], random_10_percent["y"], c=random_10_percent[feature],
                            cmap='viridis',label=feature)  # Adjust the colormap as needed
            axes[i].set_title(f'scatter plot: {feature}')
            axes[i].set_xlabel("X-axis")
            axes[i].set_ylabel("Y-axis")
            axes[i].legend(loc='best')  # Add legend

        # Adjust layout to prevent overlap
        plt.tight_layout()
        plt.show()


    return output_df


#MAIN CODE
def pre_process(df, normalize=False, see_graphs=False):
    #REMOVING NOISE COLUMNS

    #working on tmax features, which seem to be the most meaningful for removing the noise sensors
    z_scores = pd.DataFrame(zscore(df), columns=df.columns[5::5])
    outlier_threshold = 3
    outliers = (z_scores.abs() > outlier_threshold)

    #noise sensors have a random distribution of tmax, almost all  the values are contained
    #in the value of 3*dev_std. We order by increasing order of outliers and remove the columns
    #that have less outliers (random distribution)

    percentage_outliers = (outliers.sum() / len(df)) * 100
    sorted_columns = percentage_outliers.sort_values()
    numbers_to_remove = [re.search(r'\[(\d+)\]', column).group(1) for column in sorted_columns[:6].index]
    columns_to_drop = [column for column in df.columns if
                       any(re.search(rf'\[{number}\]', column) for number in numbers_to_remove)]

    df_clean = df.drop(columns=columns_to_drop)

    # the feature describing the area and the feature describing the pmax are highly correlated,
    # we can remove the area feature.

    features_area = [f for f in df_clean.columns[4::5]]

    df_clean = df_clean.drop(columns=features_area)

    #OUTLIERS

    # df_clean_out_negpmax=work_on_outliers(df_clean,25,0,3,see_graphs) #3 means negpmax

    #we can work also on the rms

    # df_clean_out_rms = work_on_outliers(df_clean, 1.5,1.5, 5,see_graphs)  # 5 means rms

    # SPLITTING AND NORMALIZING IF REQUIRED
    df_final=df_clean.copy()


    if normalize:
        scaler = StandardScaler()

        X_scaled = scaler.fit_transform(X)
    else:
        X_scaled= X.copy()



    return df_final













In [5]:
from tensorflow.keras.losses import Loss
import tensorflow.keras.backend as K
import pandas as pd
from keras import backend as K
import keras.saving
from keras.saving import load_model


@keras.saving.register_keras_serializable()
def euclidean_distance_loss(y_true, y_pred):
    return K.sqrt(K.sum(K.square(y_pred - y_true), axis=-1))

def euclidean_distance_error(y_true, y_pred):
    """
    Calculate the mean Euclidean distance error between true and predicted 2D points.
    :param y_true: Array of true values with shape (n_samples, 2).
    :param y_pred: Array of predicted values with shape (n_samples, 2).
    :return: Mean Euclidean distance error.
    """
    return np.mean(np.sqrt(np.sum(np.square(y_true - y_pred), axis=1)))



In [6]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [10]:
df = pd.read_csv('/content/drive/MyDrive/development.csv')


'''columns_to_remove = ["pmax[0]", "negpmax[0]", "tmax[0]", "area[0]", "rms[0]",
                         "pmax[7]", "negpmax[7]", "tmax[7]", "area[7]", "rms[7]",
                         "pmax[12]", "negpmax[12]", "tmax[12]", "area[12]", "rms[12]",
                         "pmax[15]", "negpmax[15]", "tmax[15]", "area[15]", "rms[15]",
                         "pmax[16]", "negpmax[16]", "tmax[16]", "area[16]", "rms[16]",
                         "pmax[17]", "negpmax[17]", "tmax[17]", "area[17]", "rms[17]"]



df = df.drop(columns=columns_to_remove)

features_area = [f for f in df.columns[3::5]]

df = df.drop(columns=features_area)

print(df.shape)
df = df.dropna()
print(df.shape)'''

df = pre_process(df)



# Prepare the dataset
X = df.iloc[:, 2:].values
Y = df.iloc[:, :2].values

In [7]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)




# Further split training set to create a validation set
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42) # Adjust the test_size as needed

# Reshape the data
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))

# Define the model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dense(64, activation='relu'))
model.add(Dense(32, activation='relu'))
model.add(Dense(2, activation='linear'))

# make the step of adam optimizer more smaller


# Compile the model
model.compile(optimizer='adam', loss=euclidean_distance_loss)

model.summary()


early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_checkpoint = ModelCheckpoint('best_model.keras', save_best_only=True)

model.fit(X_train, Y_train, epochs=500, batch_size=32, validation_data=(X_val, Y_val), callbacks=[early_stopping,model_checkpoint], verbose=1)

# Evaluate the model
predictions = model.predict(X_test)

print(predictions)

mse = mean_squared_error(Y_test, predictions)
print(f"Mean Squared Error: {mse}")
rmse = sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")
euclidian= euclidean_distance_error(Y_test, predictions)
print(f"Euclidian distance Error: {euclidian}")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               6272      
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dense_2 (Dense)             (None, 32)                2080      
                                                                 
 dense_3 (Dense)             (None, 2)                 66        
                                                                 
Total params: 16674 (65.13 KB)
Trainable params: 16674 (65.13 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
Epoch 1/500
Epoch 2/500
Epoch 3/500

KeyboardInterrupt: 

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)




# Further split training set to create a validation set
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42) # Adjust the test_size as needed

# Reshape the data
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))

model = load_model('/content/best_model100.keras')

# Evaluate the model
predictions = model.predict(X_test)

print(predictions)

mse = mean_squared_error(Y_test, predictions)
print(f"Mean Squared Error: {mse}")
rmse = sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")
euclidian= euclidean_distance_error(Y_test, predictions)
print(f"Euclidian distance Error: {euclidian}")

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Further split training set to create a validation set
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42) # Adjust the test_size as needed

# Reshape the data
X_train = X_train.reshape((X_train.shape[0], X_train.shape[1], 1))
X_test = X_test.reshape((X_test.shape[0], X_test.shape[1], 1))
X_val = X_val.reshape((X_val.shape[0], X_val.shape[1], 1))

In [None]:
model = Sequential([
    Conv1D(128, 3, input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    Activation('relu'),
    MaxPooling1D(3),
    Dropout(0.2),


    Conv1D(64, 3),
    BatchNormalization(),
    Activation('relu'),
    MaxPooling1D(3),
    Dropout(0.2),




    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='linear')
])




model.compile(optimizer='adam', loss=euclidean_distance_loss)

model.summary()# assuming custom loss function

early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_checkpoint = ModelCheckpoint('best_model1010.keras', save_best_only=True)



model.fit(X_train, Y_train, epochs=500, batch_size=32, validation_data=(X_val, Y_val), callbacks=[early_stopping,model_checkpoint], verbose=1)
# Evaluate the model
predictions = model.predict(X_test)
mse = mean_squared_error(Y_test, predictions)
print(f"Mean Squared Error: {mse}")
rmse = sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")
euclidian= euclidean_distance_error(Y_test, predictions)
print(f"Euclidian distance Error: {euclidian}")









In [None]:
model = Sequential([





    Conv1D(64, 3, input_shape=(X_train.shape[1], 1)),
    BatchNormalization(),
    Activation('relu'),
    MaxPooling1D(3),







    Flatten(),
    Dense(128, activation='relu'),
    Dense(64, activation='relu'),
    Dense(32, activation='relu'),
    Dense(2, activation='linear')
])




model.compile(optimizer=Adam(learning_rate=0.001), loss=euclidean_distance_loss)

model.summary()# assuming custom loss function

early_stopping = EarlyStopping(monitor='val_loss', patience=10)
model_checkpoint = ModelCheckpoint('best_model1002.keras', save_best_only=True)



model.fit(X_train, Y_train, epochs=550, batch_size=32, validation_data=(X_val, Y_val), callbacks=[early_stopping,model_checkpoint], verbose=1)
# Evaluate the model
predictions = model.predict(X_test)
mse = mean_squared_error(Y_test, predictions)
print(f"Mean Squared Error: {mse}")
rmse = sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")
euclidian= euclidean_distance_error(Y_test, predictions)
print(f"Euclidian distance Error: {euclidian}")









In [None]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt
from sklearn.externals import joblib

# Prepare the data for XGBoost
df = pd.read_csv('development.csv')
df = pre_process(df)
df = df.sample(n=50000, random_state=42)

X = df.iloc[:, 2:].values
Y = df.iloc[:, :2].values

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)


# Define the XGBoost model
model = xgb.XGBRegressor(objective=, colsample_bytree = 0.3, learning_rate = 0.1,
                max_depth = 5, alpha = 10, n_estimators = 10)

# Fit the model
model.fit(X_train, Y_train, early_stopping_rounds=10, eval_set=[(X_val, Y_val)], verbose=True)

# Save the model
joblib.dump(model, "best_model.xgb")

# Evaluate the model
predictions = model.predict(X_test)
mse = mean_squared_error(Y_test, predictions)
print(f"Mean Squared Error: {mse}")
rmse = sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")
euclidian= euclidean_distance_error(Y_test, predictions)
print(f"Euclidian distance Error: {euclidian}")

In [15]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt
import joblib  # Import joblib

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)
X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=42)



# Define the model
model = xgb.XGBRegressor(
    objective='reg:squarederror',
    colsample_bytree=0.5,  # Adjusted
    learning_rate=0.05,    # Lowered
    max_depth=6,           # Adjusted
    alpha=10,
    n_estimators=2000,     # Increased
    subsample=0.8,
    min_child_weight=1,    # New parameter
    gamma=0.1              # New parameter
)


# Fit the model
model.fit(X_train, Y_train, early_stopping_rounds=10, eval_set=[(X_val, Y_val)], verbose=True)

# Save the model
joblib.dump(model, "best_model.xgb")

# Evaluate the model
predictions = model.predict(X_test)
mse = mean_squared_error(Y_test, predictions)
print(f"Mean Squared Error: {mse}")
rmse = sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")
euclidian= euclidean_distance_error(Y_test, predictions)
print(f"Euclidian distance Error: {euclidian}")

[0]	validation_0-rmse:111.59383




[1]	validation_0-rmse:106.16916
[2]	validation_0-rmse:101.06038
[3]	validation_0-rmse:96.16871
[4]	validation_0-rmse:91.52126
[5]	validation_0-rmse:87.07277
[6]	validation_0-rmse:82.87988
[7]	validation_0-rmse:78.90285
[8]	validation_0-rmse:75.12546
[9]	validation_0-rmse:71.51963
[10]	validation_0-rmse:68.10857
[11]	validation_0-rmse:64.85715
[12]	validation_0-rmse:61.82016
[13]	validation_0-rmse:58.88669
[14]	validation_0-rmse:56.10601
[15]	validation_0-rmse:53.43904
[16]	validation_0-rmse:50.91345
[17]	validation_0-rmse:48.50905
[18]	validation_0-rmse:46.23949
[19]	validation_0-rmse:44.08711
[20]	validation_0-rmse:42.07191
[21]	validation_0-rmse:40.10545
[22]	validation_0-rmse:38.23457
[23]	validation_0-rmse:36.48973
[24]	validation_0-rmse:34.81007
[25]	validation_0-rmse:33.22357
[26]	validation_0-rmse:31.70963
[27]	validation_0-rmse:30.27954
[28]	validation_0-rmse:28.94536
[29]	validation_0-rmse:27.65257
[30]	validation_0-rmse:26.42255
[31]	validation_0-rmse:25.26048
[32]	validation

In [12]:
import xgboost as xgb
import numpy as np
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from math import sqrt
import joblib  # Import joblib
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from math import sqrt
from joblib import dump

X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, random_state=42)


# Model initialization and training with verbose output
model = RandomForestRegressor(n_estimators=100, random_state=42, verbose=3)
model.fit(X_train, Y_train)

dump(model, 'random_forest_model.joblib')

# Predicting the targets
predictions = model.predict(X_test)
mse = mean_squared_error(Y_test, predictions)
print(f"Mean Squared Error: {mse}")
rmse = sqrt(mse)
print(f"Root Mean Squared Error: {rmse}")
euclidian= euclidean_distance_error(Y_test, predictions)
print(f"Euclidian distance Error: {euclidian}")

building tree 1 of 100
building tree 2 of 100
building tree 3 of 100
building tree 4 of 100
building tree 5 of 100
building tree 6 of 100
building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100
building tree 11 of 100
building tree 12 of 100
building tree 13 of 100
building tree 14 of 100
building tree 15 of 100
building tree 16 of 100
building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100
building tree 23 of 100
building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100


[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed: 11.2min


building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100
building tree 38 of 100
building tree 39 of 100
building tree 40 of 100
building tree 41 of 100
building tree 42 of 100
building tree 43 of 100
building tree 44 of 100
building tree 45 of 100
building tree 46 of 100
building tree 47 of 100
building tree 48 of 100
building tree 49 of 100
building tree 50 of 100
building tree 51 of 100
building tree 52 of 100
building tree 53 of 100
building tree 54 of 100
building tree 55 of 100
building tree 56 of 100
building tree 57 of 100
building tree 58 of 100
building tree 59 of 100
building tree 60 of 100
building tree 61 of 100
building tree 62 of 100
building tree 63 of 100
building tree 64 of 100
building tree 65 of 100
building tree 66 of 100
building tree 67 of 100
building tree 68 of 100
building tree 69 of 100
building tree 70 of 100
building tree 71 of 100
building tree 72 of 100
building tree 73

[Parallel(n_jobs=1)]: Done  31 tasks      | elapsed:    1.2s


Mean Squared Error: 15.578667301242266
Root Mean Squared Error: 3.9469820497745194
Euclidian distance Error: 4.504138469275587
