In [None]:
!pip install pandas
!pip install joblib
!pip install catboost
!pip install scikit-learn
!pip install matplotlib
print('Done')

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
import joblib
from catboost import CatBoostRegressor, Pool
import groupdfcleaner as gdfc
import numpy as np

# --- Load and clean the raw training data ---------------------------------
# gdfc.tijd / gdfc.wind_direction are project helpers; each receives the frame
# and its return value is the frame used from then on.
df = pd.read_csv('TRAININGSSET_REMOVED.csv')
df = gdfc.tijd(df)
df = gdfc.wind_direction(df)

df.drop(columns=['REMOVED'], inplace=True)

# `ship_data` is a second name for the same DataFrame object (not a copy).
ship_data = df
ship_data.to_csv('REMOVED.csv', index=False)

# Turn the '#' placeholder in this column into a missing value.
ship_data['REMOVED'] = ship_data['REMOVED'].replace('#', np.nan)

categorical_features = ['REMOVED']

# --- Features / target -----------------------------------------------------
X = ship_data.drop(columns=['REMOVED'])
y = ship_data['REMOVED']

# Split BEFORE building the training Pool. The pool used for fitting must hold
# only the training rows; fitting on a pool built from all of X/y would let the
# model see the hold-out rows and make every test metric meaningless.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=42)

train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features)

# NOTE(review): no eval_set is passed to fit(), so `early_stopping_rounds`
# presumably has nothing to monitor and all 1500 iterations run — confirm, and
# add a separate validation split (not the test set) if early stopping is wanted.
model = CatBoostRegressor(iterations=1500, early_stopping_rounds=100, learning_rate=0.03, depth=10, loss_function='MAE', verbose=100, l2_leaf_reg=9)
print('silly cat is now running')
model.fit(train_pool)

# --- Hold-out predictions --------------------------------------------------
y_pred = model.predict(X_test)

# Write predictions and ground truth to disk via a copy so that X_test keeps
# exactly the model's feature columns for any later predict() call.
test_export = X_test.copy()
test_export['REMOVED'] = y_pred
test_export['REMOVED'] = y_test.values
test_export.to_csv('REMOVED.csv', index=False)

# Persist the fitted model for the inference cell.
joblib.dump(model, 'REMOVED.pkl')

def _report_regression_metrics(split_name, y_true, y_hat):
    """Print and return MAE, R2 and MSE for one data split.

    Args:
        split_name: Word inserted into the printed labels ("test", "training").
        y_true: Ground-truth target values.
        y_hat: Model predictions aligned with ``y_true``.

    Returns:
        Tuple ``(mae, r2, mse)``.
    """
    mae = mean_absolute_error(y_true, y_hat)
    print(f"MAE on {split_name} set:", mae)
    r2 = r2_score(y_true, y_hat)
    print(f"R2 score on {split_name} set:", r2)
    mse = mean_squared_error(y_true, y_hat)
    print(f"Mean Squared Error on {split_name} set:", mse)
    return mae, r2, mse


# Reuse the hold-out predictions computed right after fitting instead of
# calling predict() on X_test a second time: by this point X_test may carry
# extra non-feature columns (prediction / ground truth) added for the export.
y_pred_test = y_pred
mae_test, r2_test, mse_test = _report_regression_metrics("test", y_test, y_pred_test)

# In-sample metrics, for comparison against the hold-out numbers above.
y_pred_train = model.predict(X_train)
mae_train, r2_train, mse_train = _report_regression_metrics("training", y_train, y_pred_train)

def _show_importances(heading, scores_by_feature):
    """Print ``heading`` followed by one ``feature: score`` line per entry."""
    print(heading)
    for name in scores_by_feature:
        print(f"{name}: {scores_by_feature[name]}")


# Importance with CatBoost's default type, keyed by feature column name.
feature_importances = model.get_feature_importance(train_pool)
feature_importance_dict = {
    column: score for column, score in zip(X.columns, feature_importances)
}
_show_importances("\nFeature Importance (Weight):", feature_importance_dict)

# Importance measured as the change in the loss function.
feature_importances_loss_change = model.get_feature_importance(
    train_pool, type='LossFunctionChange'
)
feature_importance_loss_change_dict = {
    column: score for column, score in zip(X.columns, feature_importances_loss_change)
}
_show_importances(
    "\nFeature Importance (Loss Function Change):",
    feature_importance_loss_change_dict,
)

# NOTE(review): the type requested here is 'PredictionValuesChange', which is
# presumably what the default call above already resolves to for this
# regressor, so this table likely repeats the "Weight" one and is not a true
# permutation importance — confirm against the CatBoost documentation.
feature_importances_permutation = model.get_feature_importance(
    train_pool, type='PredictionValuesChange'
)
feature_importance_permutation_dict = {
    column: score for column, score in zip(X.columns, feature_importances_permutation)
}
_show_importances(
    "\nFeature Importance (Permutation):",
    feature_importance_permutation_dict,
)

In [None]:
import matplotlib.pyplot as plt

def plot_bar(dictionary, title):
    """Draw a horizontal bar chart of a ``{label: value}`` mapping.

    Bars are ordered by ascending value, so the largest value ends up at the
    top of the chart. The figure is shown immediately.

    Args:
        dictionary: Mapping from bar label to numeric value.
        title: Title placed above the chart.
    """
    # Sorting the keys by their value is stable, so ties keep insertion order.
    labels = sorted(dictionary, key=dictionary.get)
    scores = [dictionary[label] for label in labels]

    fig, ax = plt.subplots(figsize=(8, 7))
    ax.barh(labels, scores)
    ax.set_xlabel('Importance')
    ax.set_ylabel('Features')
    ax.set_title(title)
    fig.tight_layout()
    plt.show()

# One chart per importance table, in the same order they were printed.
for _scores, _chart_title in (
    (feature_importance_dict, 'Feature Importance (Weight)'),
    (feature_importance_loss_change_dict, 'Feature Importance (Loss Change)'),
    (feature_importance_permutation_dict, 'Feature Importance (Permutation)'),
):
    plot_bar(_scores, _chart_title)


In [42]:
# --- Score a separate CSV with the persisted model --------------------------
# joblib.load unpickles the file; only load model files from a trusted source.
loaded_model = joblib.load('REMOVED.pkl')

# Run the CSV through the same two gdfc helpers used on the training data.
testset = gdfc.wind_direction(gdfc.tijd(pd.read_csv('REMOVED.csv')))
testset = testset.drop(columns=['REMOVED'])

# NOTE(review): here the '#' placeholder becomes the string '0', whereas the
# training cell replaces '#' with np.nan — confirm whether these target the
# same column and, if so, which treatment is intended.
placeholder_free = testset['REMOVED'].replace('#', '0')
testset['REMOVED'] = placeholder_free

categorical_features = ['REMOVED']

# X_test is the same object as testset (no copy is made).
X_test = testset
test_pool = Pool(data=X_test, cat_features=categorical_features)

y_pred_test = loaded_model.predict(test_pool)

# Attach the predictions, then write the full frame and a header-less file
# containing only the selected column.
testset['REMOVED'] = y_pred_test
testset.to_csv('REMOVED.csv', index=False)
testset.loc[:, ['REMOVED']].to_csv('REMOVED.csv', index=False, header=None)