In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import BaggingRegressor, RandomForestRegressor,AdaBoostRegressor, GradientBoostingRegressor

from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error


# Semicolon-separated safety dataset, read straight from the project's GitHub repository.
url = 'https://raw.githubusercontent.com/TheLazyCactus/ML_Project/refs/heads/main/ML_Project_safety.csv'

# low_memory=False: parse the file in one pass rather than in chunks,
# which avoids mixed-type inference within a column.
df = pd.read_csv(url, low_memory=False, sep=";")

Sort the rows by year in ascending order so that the oldest values come first; the lag features created later rely on this ordering.

In [None]:
# Order rows chronologically; the lag features built later shift values
# row-by-row within each company and therefore depend on this ordering.
df = df.sort_values(by="Year", ascending=True)

# Drop the rate columns that are not used downstream. Both spellings of the
# FAR column are listed, and errors="ignore" skips any column that is absent
# so the cell does not fail with a KeyError on a slightly different export.
cols_to_remove = [
    "FAR",
    "FAR total",
    "TRIR total",
    "TRIR company only",
    "TRIR contractor only",
    "LTIR company only",
    "LTIR contractor only",
]
df = df.drop(columns=cols_to_remove, errors="ignore")

# Columns that must be numeric. They are required by the later cells, so fail
# here with an explicit message instead of an opaque KeyError further down.
cols = ["LTIR total"]
missing_cols = [col for col in cols if col not in df.columns]
if missing_cols:
    raise KeyError(f"Column(s) {missing_cols} not found. Available columns: {list(df.columns)}")
cols_to_process = list(cols)

# Replace decimal commas with dots and convert to float (single conversion;
# a column that is already numeric passes through unchanged).
df[cols_to_process] = df[cols_to_process].replace(',', '.', regex=True).astype(float)

# Remaining missing values are replaced by 0.
# NOTE(review): for LTIR this treats "not reported" as "no incidents" -- confirm this is intended.
df = df.fillna(0)

EDA

In [None]:
# Structural overview of the cleaned frame, printed in this order:
# dimensions, column labels, dtypes, distinct-value counts, remaining NaN counts.
for summary in (df.shape, df.columns, df.dtypes, df.nunique(), df.isna().sum()):
    print(summary)

Filter to keep only the last 4 years (2020–2023): the most recent year plus the three preceding years used to build the lag features

In [None]:
from sklearn.preprocessing import OneHotEncoder

# Ensure the "Year" column exists, then keep only the four years 2020-2023
# (the latest year plus the three earlier years needed for the lag features).
year_column = "Year"  # Adjust this if your column name is different
if year_column not in df.columns:
    raise KeyError(f"Column '{year_column}' not found. Available columns: {df.columns}")

# .copy() gives df_filtered its own data: a boolean-mask selection is otherwise
# treated as a slice of df, and adding columns to it later triggers
# SettingWithCopyWarning.
df_filtered = df[df[year_column].isin([2020, 2021, 2022, 2023])].copy()

# Display filtered data
print(df_filtered.head())

Lag creation

In [None]:
# Create lag features: for every row, the LTIR of the same company 1, 2 and 3
# rows earlier. assign() builds a new frame instead of writing columns into a
# filtered slice, which avoids SettingWithCopyWarning.
# NOTE(review): shift() works on row order, so this assumes rows are sorted by
# year and that each company has exactly one row per year -- a company with a
# missing year would get misaligned lags. Verify against the data.
ltir_by_company = df_filtered.groupby('Company code')['LTIR total']
df_filtered = df_filtered.assign(
    **{f'LTIR_{lag}': ltir_by_company.shift(lag) for lag in (1, 2, 3)}
)

# Label the columns by year. After the dropna below, only rows with all three
# lags remain, i.e. the last of the four filtered years for each company.
df_filtered = df_filtered.rename(
    columns={
        "LTIR total": "LTIR 2023",
        "LTIR_1": "LTIR 2022",
        "LTIR_2": "LTIR 2021",
        "LTIR_3": "LTIR 2020",
    }
)

# Drop rows with NaN values introduced by shifting (the first 3 rows of each
# company) and renumber the index. Computed once and shared by both names.
data = df_filtered.dropna().reset_index(drop=True)
print(data)

# Later cells keep working with df_filtered; give it an independent copy so
# that changes to one frame never leak into the other.
df_filtered = data.copy()

One hot encoding

In [None]:
from sklearn.preprocessing import OneHotEncoder


# Fail early, with the list of available columns, if the column to encode is absent.
company_column = "Company code"
if company_column not in df_filtered.columns:
    raise KeyError(f"Column '{company_column}' not found. Available columns: {df_filtered.columns}")

# Learn the company categories, then produce the dense 0/1 indicator matrix.
encoder = OneHotEncoder(sparse_output=False).fit(df_filtered[[company_column]])
encoded_company = encoder.transform(df_filtered[[company_column]])

# One output column name per learned category.
company_columns = encoder.get_feature_names_out([company_column])

# Wrap the indicator matrix in a DataFrame (default RangeIndex).
df_encoded = pd.DataFrame(encoded_company, columns=company_columns)

# Put the indicator columns next to the existing features; the index is reset
# so both frames line up row by row. The original text column is then removed.
df_final = pd.concat(
    [df_filtered.reset_index(drop=True), df_encoded],
    axis=1,
).drop(columns=[company_column], errors='ignore')

# Persist the fitted encoder so the same category mapping can be reused later.
import pickle
with open('encoder.pkl', 'wb') as encoder_file:
    pickle.dump(encoder, encoder_file)


In [None]:
# Remove the "Year" column; the year is already carried by the "LTIR <year>"
# column names. Rebinding with errors="ignore" (instead of an in-place drop)
# keeps the cell idempotent: running it a second time no longer raises KeyError.
df_final = df_final.drop(columns=["Year"], errors="ignore")

**Model training**

In [None]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression


# Training set: one row per company. The three earlier years are the
# predictors and the latest observed year is the target, so the model learns
# "next year's LTIR from the three preceding years".
feature_cols = ['LTIR 2020', 'LTIR 2021', 'LTIR 2022']
X = df_final[feature_cols]  # Predictors
y = df_final['LTIR 2023']  # Target variable

# Train Linear Regression model
model = LinearRegression()
linear_reg = model.fit(X, y)

# Recursive forecast for the next three years (2024, 2025, 2026).
# The input window must be the three MOST RECENT years. It therefore starts at
# (2021, 2022, 2023); starting at (2020, 2021, 2022) would merely reproduce the
# fitted 2023 value and shift every forecast one year too early.
# The whole column is predicted at once instead of looping row by row.
future_cols = ['LTIR_4', 'LTIR_5', 'LTIR_6']  # forecasts for 2024, 2025, 2026
window = df_final[['LTIR 2021', 'LTIR 2022', 'LTIR 2023']].to_numpy(dtype=float)

step_predictions = []
for _ in future_cols:
    # The model was fitted on a DataFrame, so pass the window under the same
    # feature names (position matters: oldest year first).
    step_input = pd.DataFrame(window, columns=feature_cols)
    predicted = linear_reg.predict(step_input)
    step_predictions.append(predicted)
    # Slide the window: drop the oldest year, append the fresh forecast.
    window = np.column_stack([window[:, 1:], predicted])

# Shape (n_companies, 3): one column per forecast year.
future_data = np.column_stack(step_predictions)

# Convert predictions into a DataFrame aligned on df_final's index
future_df = pd.DataFrame(future_data, columns=future_cols, index=df_final.index)

# Concatenate predictions with original dataset. Forecast columns left over
# from a previous run of this cell are dropped first so re-running the cell
# does not create duplicate columns.
df_final = pd.concat(
    [df_final.drop(columns=future_cols, errors='ignore'), future_df],
    axis=1,
)

# Display the updated DataFrame
print("Updated DataFrame with future LTIR values:")
print(df_final)


In [None]:
# Write the final table to a CSV file in the working directory, without the index column.
output_path = "final_LTIR_predictions.csv"
df_final.to_csv(output_path, index=False)