In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.preprocessing import LabelEncoder

In [2]:
# Read the transformed yearly-record CSV and, in the same step, discard every
# row that has a missing value in any column.
df = pd.read_csv('dataset/yearRecord.csv').dropna()

In [3]:
# One LabelEncoder per categorical column; both fitted encoders stay bound to
# their names after this cell.
asset_type_encoder = LabelEncoder()
work_description_encoder = LabelEncoder()

# Fit each encoder on its source column and store the integer codes in a new
# '<Column>Encoded' column of df.
df['AssetTypeEncoded'] = asset_type_encoder.fit_transform(df['AssetType'])
df['WorkDescriptionEncoded'] = work_description_encoder.fit_transform(df['WorkDescription'])

In [4]:
# Classification inputs: encoded asset type plus years since installation.
X_classify = df.loc[:, ['AssetTypeEncoded', 'YearsAfterInstallation']]
# Classification target: the encoded work description.
y_classify = df.loc[:, 'WorkDescriptionEncoded']

In [5]:
# Hold out 20% of the rows as the classification test set; the fixed
# random_state makes the shuffle repeatable across runs.
(
    X_train_classify,
    X_test_classify,
    y_train_classify,
    y_test_classify,
) = train_test_split(
    X_classify,
    y_classify,
    test_size=0.2,
    random_state=42,
)

In [6]:
# Build a 50-tree random forest classifier with a fixed seed, then fit it on
# the training split.
classifier = RandomForestClassifier(
    n_estimators=50,
    random_state=42,
)
classifier.fit(X=X_train_classify, y=y_train_classify)

In [7]:
# Predict encoded work-description labels for the held-out classification rows.
y_pred_classify = classifier.predict(X=X_test_classify)


In [8]:
# Score the classifier: fraction of held-out rows whose predicted label
# matches the true label, reported to two decimals.
accuracy = accuracy_score(
    y_true=y_test_classify,
    y_pred=y_pred_classify,
)
print(f'Classification Accuracy: {accuracy:.2f}')

Classification Accuracy: 0.09


In [11]:
# Display the classification feature matrix (the two columns selected from df)
# as the cell output.
# NOTE(review): this renders the whole frame; `.head()` would give a more
# compact preview if only a sanity check is needed.
X_classify

Unnamed: 0,AssetTypeEncoded,YearsAfterInstallation
0,64,25.0
1,7,4.0
2,7,4.0
3,64,20.0
4,64,19.0
...,...,...
10826,143,12.0
10827,143,12.0
10828,143,13.0
10829,143,12.0


In [12]:
# Regression inputs: the two classification features plus the encoded work
# description.
X_regress = df.loc[:, ['AssetTypeEncoded', 'YearsAfterInstallation', 'WorkDescriptionEncoded']]
# Regression target: the Spend_ExGST column.
y_regress = df.loc[:, 'Spend_ExGST']

In [13]:
# Hold out 20% of the rows as the regression test set, using the same fixed
# seed as the classification split.
(
    X_train_regress,
    X_test_regress,
    y_train_regress,
    y_test_regress,
) = train_test_split(
    X_regress,
    y_regress,
    test_size=0.2,
    random_state=42,
)

In [14]:
# Build a 10-tree random forest regressor with a fixed seed, then fit it on
# the regression training split.
regressor = RandomForestRegressor(
    n_estimators=10,
    random_state=42,
)
regressor.fit(X=X_train_regress, y=y_train_regress)

In [15]:
# Predict Spend_ExGST for the held-out regression rows.
y_pred_regress = regressor.predict(X=X_test_regress)

In [16]:
# Score the regressor: mean squared error between actual and predicted spend
# on the held-out rows (in squared units of Spend_ExGST), to two decimals.
mse = mean_squared_error(
    y_true=y_test_regress,
    y_pred=y_pred_regress,
)
print(f'Regression Mean Squared Error: {mse:.2f}')

Regression Mean Squared Error: 2305541.07
