In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.impute import SimpleImputer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

In [2]:
# Load the dataset
dataset = 'Asset.xlsx'
df = pd.read_excel(dataset)
X = df.iloc[:, 1:]
y = df['AssetType']

In [3]:
X.head()

Unnamed: 0,SerialNo,Manufacturer,ModelNbr
0,,,
1,,,
2,,,
3,0.0,,O
4,,,FRENGER


In [4]:
y.head()

0    CHILLED BEAMS - PASSIVE
1    CHILLED BEAMS - PASSIVE
2    CHILLED BEAMS - PASSIVE
3    CHILLED BEAMS - PASSIVE
4    CHILLED BEAMS - PASSIVE
Name: AssetType, dtype: object

In [8]:
# Handle special values
special_values = [1234,'UNKOWN','Unknown','NA', 'NULL', 'UNKNOWN','TBA','N/A','NOT VISIBLE','123TEST', 'UNABLE TO LOCATE', 'NO ID', 'NO ACCESS', 'UNKOWN', 'NaN', 'na','AS PER PICS']
for column in X.columns:
    X.loc[:, column] = X[column].replace(special_values, pd.NA)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  X.loc[:, column] = X[column].replace(special_values, pd.NA)


In [10]:
label_encoder = LabelEncoder()
for column in X.select_dtypes(include=['object']).columns:
    X.loc[:, column] = label_encoder.fit_transform(X[column].astype(str))

y = label_encoder.fit_transform(y.astype(str))

In [11]:
# Impute missing values
imputer = SimpleImputer(strategy='most_frequent')
X_imputed = pd.DataFrame(imputer.fit_transform(X), columns=X.columns)

  mode = stats.mode(array)


In [12]:
# Initial training subset
initial_train_size = 20000
X_train_initial, _, y_train_initial, _ = train_test_split(X_imputed, y, train_size=initial_train_size, random_state=42)

In [13]:
# Train the RandomForestClassifier on the initial subset
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=63)
rf_classifier.fit(X_train_initial, y_train_initial)

RandomForestClassifier(random_state=63)

In [14]:
chunk_size = 10000
X_test_chunk, _, y_test_chunk, _ = train_test_split(X_imputed, y, test_size=chunk_size, random_state=42)
y_pred_encoded = rf_classifier.predict(X_test_chunk)

In [15]:
accuracy = accuracy_score(y_test_chunk, y_pred_encoded)
print(f"Accuracy on the chunk: {accuracy}")

Accuracy on the chunk: 0.5095161134264817


In [16]:
chunk_size = 5000
X_test_chunk1, _, y_test_chunk1, _ = train_test_split(X_imputed, y, test_size=chunk_size, random_state=33)
y_pred_encoded1 = rf_classifier.predict(X_test_chunk1)

In [17]:
accuracy1 = accuracy_score(y_test_chunk1, y_pred_encoded1)
print(f"Accuracy on the chunk: {accuracy1}")

Accuracy on the chunk: 0.5087536480868007


In [18]:
import joblib

In [19]:
modelName = 'rf_63_whole'
joblib.dump(rf_classifier, modelName)

['rf_63_whole']

In [20]:
encoder_name = 'label_encoder.joblib'
joblib.dump(label_encoder,encoder_name)

['label_encoder.joblib']

In [28]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

In [29]:
# Example for RandomForestClassifier
num_estimators_list = [10, 50, 100, 150, 200]
scores = []

In [21]:
import xgboost as xgb

In [22]:
# Train XGBoost on the same initial subset
dtrain = xgb.DMatrix(X_train_initial, label=y_train_initial)
param = {'max_depth': 6, 'objective': 'multi:softmax', 'num_class': len(label_encoder.classes_)}

In [23]:
num_round = 100
xgb_classifier = xgb.train(param, dtrain, num_round)

In [24]:
# Test XGBoost on the same chunk
dtest = xgb.DMatrix(X_test_chunk)
y_pred_xgb_encoded = xgb_classifier.predict(dtest).astype(int)

In [25]:
accuracy_xgb = accuracy_score(y_test_chunk, y_pred_xgb_encoded)
print(f"Accuracy on the chunk (XGBoost): {accuracy_xgb}")

Accuracy on the chunk (XGBoost): 0.33703633359971047


In [26]:
# Test XGBoost on the same chunk 5000
dtest = xgb.DMatrix(X_test_chunk1)
y_pred_xgb_encoded1 = xgb_classifier.predict(dtest).astype(int)

In [27]:
accuracy_xgb1 = accuracy_score(y_test_chunk1, y_pred_xgb_encoded1)
print(f"Accuracy on the chunk (XGBoost): {accuracy_xgb1}")

Accuracy on the chunk (XGBoost): 0.33688783431261066
