In [1]:
pip install --upgrade scikit-learn==1.3.1

Note: you may need to restart the kernel to use updated packages.


In [14]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score

In [15]:
# Load the dataset
dataset = 'Asset.xlsx'
df = pd.read_excel(dataset)


In [40]:
X = df.iloc[:, 2:]
y = df['AssetType']

In [41]:
X.head()

Unnamed: 0,Manufacturer,ModelNbr
0,,
1,,
2,,
3,,O
4,,FRENGER


In [42]:
y.head()

0    CHILLED BEAMS - PASSIVE
1    CHILLED BEAMS - PASSIVE
2    CHILLED BEAMS - PASSIVE
3    CHILLED BEAMS - PASSIVE
4    CHILLED BEAMS - PASSIVE
Name: AssetType, dtype: object

In [43]:
# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

In [44]:
X = X_imputed

In [45]:
X

Unnamed: 0,Manufacturer,ModelNbr
0,DAIKIN,TBA
1,DAIKIN,TBA
2,DAIKIN,TBA
3,DAIKIN,O
4,DAIKIN,FRENGER
...,...,...
289048,ABB,ACH550
289049,ABB,ACH550
289050,ABB,ACH550
289051,ABB,ACH580


In [46]:
special_values = [1234, 'UNKOWN', 'Unknown', 'NA', 'NULL', 'UNKNOWN', 'TBA', 'N/A', 'NOT VISIBLE', '123TEST', 'UNABLE TO LOCATE', 'NO ID', 'NO ACCESS', 'UNKOWN', 'NaN', 'na', 'AS PER PICS']

# Create a copy of the DataFrame to avoid SettingWithCopyWarning
X_copy = X.copy()

# Replace values using .loc to avoid SettingWithCopyWarning
X_copy.loc[:, 'Manufacturer'] = X_copy['Manufacturer'].replace(special_values, pd.NA)
X_copy.loc[:, 'ModelNbr'] = X_copy['ModelNbr'].replace(special_values, pd.NA)

# Now X_copy contains the modified DataFrame
X = X_copy

In [47]:
X

Unnamed: 0,Manufacturer,ModelNbr
0,DAIKIN,
1,DAIKIN,
2,DAIKIN,
3,DAIKIN,O
4,DAIKIN,FRENGER
...,...,...
289048,ABB,ACH550
289049,ABB,ACH550
289050,ABB,ACH550
289051,ABB,ACH580


In [48]:
LE_Asset = LabelEncoder()
LE_Manufact = LabelEncoder()
LE_Model = LabelEncoder()

In [49]:
X['Manufacturer'] = LE_Manufact.fit_transform(X['Manufacturer'].astype(str))
X['ModelNbr'] = LE_Model.fit_transform(X['ModelNbr'].astype(str))

In [50]:
X

Unnamed: 0,Manufacturer,ModelNbr
0,286,7364
1,286,7364
2,286,7364
3,286,37751
4,286,25761
...,...,...
289048,5,8438
289049,5,8438
289050,5,8438
289051,5,8973


In [51]:
y = LE_Asset.fit_transform(y.astype(str))

In [53]:
# Initial training subset
initial_train_size = 150000
X_train_initial, _, y_train_initial, _ = train_test_split(X, y, train_size=initial_train_size, random_state=42)

In [54]:
# Train the RandomForestClassifier on the initial subset
rf_classifier = RandomForestClassifier(n_estimators=10, random_state=63)
rf_classifier.fit(X_train_initial, y_train_initial)

In [55]:
chunk_size = 10000
X_test_chunk, _, y_test_chunk, _ = train_test_split(X, y, test_size=chunk_size, random_state=35)
y_pred_encoded = rf_classifier.predict(X_test_chunk)

In [56]:
accuracy = accuracy_score(y_test_chunk, y_pred_encoded)
print(f"Accuracy on the chunk: {accuracy}")

Accuracy on the chunk: 0.6422005855518486


In [57]:
import joblib

In [58]:
modelName = 'model/rf_final5'
joblib.dump(rf_classifier, modelName)

['model/rf_final5']

In [59]:
joblib.dump(LE_Asset,'LE_ASSET')

['LE_ASSET']

In [60]:
joblib.dump(LE_Manufact,'LE_MANUFACT')

['LE_MANUFACT']

In [61]:
joblib.dump(LE_Model,'LE_MODEL')

['LE_MODEL']