In [61]:
import pandas as pd
import numpy as np

from sklearn.impute import KNNImputer
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import xgboost as xgb
import lightgbm as lgb

In [62]:
# Import cleaned data
df=pd.read_csv('output/0. cleaned_data/df_task2.csv')

In [63]:
df=df.drop(['description_sector','Quarter'],axis=1)
df=pd.get_dummies(df)
df = df.astype(float)

In [64]:
# Change the code_sector to work with certain ml functions below
df['code_sector'] = (df['code_sector'] - 1).astype(int)

In [65]:
X_cols=df[['code_sector']].columns
Y_cols=df.drop(['code_sector'],axis=1).columns

In [66]:
# Split dataset into train-test
X = df.drop(columns=['code_sector'])
y = df['code_sector']

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [67]:
# Apply outlier and missing no inputation to train and test separately to avoid leakage

def remove_outliers_iqr(df,threshold):
    """
    Remove outliers from each column of a DataFrame using the Interquartile Range (IQR) method.
    Turns the outliers with None
    """
    df_no_outliers = df.copy()  # Create a copy of the DataFrame to avoid modifying the original
    
    # Select columns with float dtype
    float_cols = df.select_dtypes(include=['float']).columns

    # Iterate over each float column
    for column in df[float_cols].columns:
        # Calculate the first quartile (Q1) and third quartile (Q3) for the column
        Q1 = df[column].quantile(0.25)
        Q3 = df[column].quantile(0.75)
        
        # Calculate the IQR for the column
        IQR = Q3 - Q1
        
        # Define the lower and upper bounds for outliers for the column
        lower_bound = Q1 - threshold * IQR
        upper_bound = Q3 + threshold * IQR
        
        # Replace outliers with None for the column
        df_no_outliers[column] = df[column].mask((df[column] < lower_bound) | (df[column] > upper_bound), other=None)
    
    return df_no_outliers

X_train = remove_outliers_iqr(X_train,4)
X_test = remove_outliers_iqr(X_test,4)

In [68]:
# Inpute missing data with interpolation as it is time series data
X_train = X_train.interpolate(method='linear')
X_test = X_test.interpolate(method='linear')

In [69]:
# Inpute the remaining missing values (due to missing from the start or from the end of df)
imputer = KNNImputer(n_neighbors=5)
X_train = imputer.fit_transform(X_train)
X_test = imputer.fit_transform(X_test)

In [70]:
# For more comprehensible output
labels_list=['Transportation and warehousing','Wholesale trade','Manufacturing','Retail trade','Energy','Construction']

In [71]:
# Initialize and train the Random Forest Classifier
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Classification report
print(classification_report(y_test, y_pred,target_names=labels_list))


Accuracy: 0.74
                                precision    recall  f1-score   support

Transportation and warehousing       0.81      0.41      0.54        32
               Wholesale trade       0.73      0.56      0.63       187
                 Manufacturing       0.71      0.96      0.82       570
                  Retail trade       0.85      0.40      0.54       118
                        Energy       1.00      0.37      0.54        38
                  Construction       0.80      0.62      0.70       211

                      accuracy                           0.74      1156
                     macro avg       0.82      0.55      0.63      1156
                  weighted avg       0.76      0.74      0.72      1156



In [73]:
# Initialize and train the XGBoost Classifier
clf = xgb.XGBClassifier(objective='multi:softmax', num_class=6, random_state=42)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Classification report
print(classification_report(y_test, y_pred,target_names=labels_list))


Accuracy: 0.79
                                precision    recall  f1-score   support

Transportation and warehousing       0.68      0.47      0.56        32
               Wholesale trade       0.78      0.61      0.69       187
                 Manufacturing       0.80      0.94      0.87       570
                  Retail trade       0.78      0.59      0.67       118
                        Energy       0.92      0.61      0.73        38
                  Construction       0.75      0.73      0.74       211

                      accuracy                           0.79      1156
                     macro avg       0.79      0.66      0.71      1156
                  weighted avg       0.79      0.79      0.78      1156



In [75]:
# Initialize and train the LightGBM Classifier
clf = lgb.LGBMClassifier(num_class=6, random_state=42,force_col_wise=True)
clf.fit(X_train, y_train)

# Predict on the test set
y_pred = clf.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy:.2f}')

# Classification report
print(classification_report(y_test, y_pred,target_names=labels_list))


[LightGBM] [Info] Total Bins 14500
[LightGBM] [Info] Number of data points in the train set: 4623, number of used features: 83
[LightGBM] [Info] Start training from score -3.257016
[LightGBM] [Info] Start training from score -1.913769
[LightGBM] [Info] Start training from score -0.671535
[LightGBM] [Info] Start training from score -2.150941
[LightGBM] [Info] Start training from score -3.526144
[LightGBM] [Info] Start training from score -1.849873
Accuracy: 0.79
                                precision    recall  f1-score   support

Transportation and warehousing       0.72      0.41      0.52        32
               Wholesale trade       0.76      0.65      0.70       187
                 Manufacturing       0.79      0.93      0.86       570
                  Retail trade       0.84      0.53      0.65       118
                        Energy       0.96      0.71      0.82        38
                  Construction       0.75      0.73      0.74       211

                      accura

In [None]:
# We dont export anything to output/ , we will export the best model with the production process.