In [1]:
import pandas as pd
import numpy as np
import plotly.express as px
import pickle
from imblearn.over_sampling import SMOTEN
from imblearn.combine import SMOTEENN
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder 
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.metrics import classification_report

LOADING DATA

In [2]:
model_df = pd.read_csv(r'Copper.csv')

QUERYING WON AND LOST

In [3]:
# Win/Lost
query_df = model_df.query("status == 'Won' or status == 'Lost'")
print(query_df['status'].value_counts())

status
Won     59278
Lost     8741
Name: count, dtype: int64


In [4]:
fig = px.bar(x = query_df['status'].value_counts().values)
fig.show()

IMBALANCED DATA

In [5]:
new_df = pd.read_csv(r'C:\Users\Acer\Desktop\copper\Copper_log_transformed.csv')

In [6]:
query_df = new_df.drop(['application','Unnamed: 0','item_date'],axis = 1)
query_df.sample()

Unnamed: 0,country,status,item type,thickness,width,product_ref,delivery date,selling_price,quantity_tons
9012,32.0,Won,W,0.470004,6.102559,1671876026,2021,6.981006,3.931826


MODEL BUILDING

In [7]:
x = query_df.drop(['status'],axis = 1)
y = query_df['status']

In [8]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(y)

In [9]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.25,random_state=42)

In [10]:
x_train.shape,x_test.shape
x_train.sample()

Unnamed: 0,country,item type,thickness,width,product_ref,delivery date,selling_price,quantity_tons
58049,32.0,W,0.405465,7.130899,1670798778,2020,6.413459,1.386294


In [11]:
trans1 = ColumnTransformer([
    ('trans1',OneHotEncoder(sparse_output = False,handle_unknown = 'ignore'),[0,1,4,5])
],remainder = 'passthrough')

In [12]:
x_train_trans = trans1.fit_transform(x_train)
x_train_trans.shape

(50698, 52)

In [13]:
x_test_trans = trans1.transform(x_test)
x_test_trans.shape

(16900, 52)

SAMPLING

In [14]:
# SMOTEENN - Combination

sampling = SMOTEENN()

x_res,y_res = sampling.fit_resample(x_train_trans,y_train)

In [15]:
# SMOTEN - Over sampling

sampling = SMOTEN()

x_res,y_res = sampling.fit_resample(x_train_trans,y_train)

TRAINING THE MODEL

In [16]:
model = RandomForestClassifier(random_state=42)

model_1 = XGBClassifier()

model_2 = HistGradientBoostingClassifier(random_state=42)

model_3 = ExtraTreesClassifier()

In [17]:
result = model.fit(x_res,y_res)
result = model_1.fit(x_res,y_res)
result = model_2.fit(x_res,y_res)
result = model_3.fit(x_res,y_res)

PREDICTIONS

In [18]:
# Random forest regressor
y_pred = model.predict(x_test_trans)
y_pred_1 = model_1.predict(x_test_trans)
y_pred_2 = model_2.predict(x_test_trans)
y_pred_3 = model_3.predict(x_test_trans)

CLASSIFICATION REPORT

In [19]:
# SMOTEENN - Default data
report = classification_report(y_test,y_pred)
print(report)

report = classification_report(y_test,y_pred_1)
print(report)

report = classification_report(y_test,y_pred_2)
print(report)

report = classification_report(y_test,y_pred_3)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69      2143
           1       0.95      0.96      0.96     14757

    accuracy                           0.93     16900
   macro avg       0.84      0.81      0.82     16900
weighted avg       0.92      0.93      0.92     16900

              precision    recall  f1-score   support

           0       0.52      0.60      0.56      2143
           1       0.94      0.92      0.93     14757

    accuracy                           0.88     16900
   macro avg       0.73      0.76      0.74     16900
weighted avg       0.89      0.88      0.88     16900

              precision    recall  f1-score   support

           0       0.45      0.56      0.50      2143
           1       0.93      0.90      0.92     14757

    accuracy                           0.86     16900
   macro avg       0.69      0.73      0.71     16900
weighted avg       0.87      0.86      0.86     16900

              preci

In [20]:
# SMOTEENN - Processed Data
report = classification_report(y_test,y_pred)
print(report)

report = classification_report(y_test,y_pred_1)
print(report)

report = classification_report(y_test,y_pred_2)
print(report)

report = classification_report(y_test,y_pred_3)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69      2143
           1       0.95      0.96      0.96     14757

    accuracy                           0.93     16900
   macro avg       0.84      0.81      0.82     16900
weighted avg       0.92      0.93      0.92     16900

              precision    recall  f1-score   support

           0       0.52      0.60      0.56      2143
           1       0.94      0.92      0.93     14757

    accuracy                           0.88     16900
   macro avg       0.73      0.76      0.74     16900
weighted avg       0.89      0.88      0.88     16900

              precision    recall  f1-score   support

           0       0.45      0.56      0.50      2143
           1       0.93      0.90      0.92     14757

    accuracy                           0.86     16900
   macro avg       0.69      0.73      0.71     16900
weighted avg       0.87      0.86      0.86     16900

              preci

In [21]:
# SMOTEN - Default data

report = classification_report(y_test,y_pred)
print(report)

report = classification_report(y_test,y_pred_1)
print(report)

report = classification_report(y_test,y_pred_2)
print(report)

report = classification_report(y_test,y_pred_3)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69      2143
           1       0.95      0.96      0.96     14757

    accuracy                           0.93     16900
   macro avg       0.84      0.81      0.82     16900
weighted avg       0.92      0.93      0.92     16900

              precision    recall  f1-score   support

           0       0.52      0.60      0.56      2143
           1       0.94      0.92      0.93     14757

    accuracy                           0.88     16900
   macro avg       0.73      0.76      0.74     16900
weighted avg       0.89      0.88      0.88     16900

              precision    recall  f1-score   support

           0       0.45      0.56      0.50      2143
           1       0.93      0.90      0.92     14757

    accuracy                           0.86     16900
   macro avg       0.69      0.73      0.71     16900
weighted avg       0.87      0.86      0.86     16900

              preci

In [22]:
# SMOTEN - Processed Data
report = classification_report(y_test,y_pred)
print(report)

report = classification_report(y_test,y_pred_1)
print(report)

report = classification_report(y_test,y_pred_2)
print(report)

report = classification_report(y_test,y_pred_3)
print(report)

              precision    recall  f1-score   support

           0       0.73      0.65      0.69      2143
           1       0.95      0.96      0.96     14757

    accuracy                           0.93     16900
   macro avg       0.84      0.81      0.82     16900
weighted avg       0.92      0.93      0.92     16900

              precision    recall  f1-score   support

           0       0.52      0.60      0.56      2143
           1       0.94      0.92      0.93     14757

    accuracy                           0.88     16900
   macro avg       0.73      0.76      0.74     16900
weighted avg       0.89      0.88      0.88     16900

              precision    recall  f1-score   support

           0       0.45      0.56      0.50      2143
           1       0.93      0.90      0.92     14757

    accuracy                           0.86     16900
   macro avg       0.69      0.73      0.71     16900
weighted avg       0.87      0.86      0.86     16900

              preci

USER PREDICTION

In [23]:
x_train.sample()

Unnamed: 0,country,item type,thickness,width,product_ref,delivery date,selling_price,quantity_tons
59897,78.0,S,1.386294,7.130899,1668701718,2020,6.575076,3.89182


In [24]:
# Real world Prediction

country = 32
item_type = 'W'
product_ref = 1670798778
delivery_year = 2021

thickness = 0.75
log_thickness = np.log(thickness)

width = 1000
log_width = np.log(width)

quantity_tons = 20
log_quantity = np.log(quantity_tons)

selling_price = 20
log_selling_price = np.log(selling_price)

In [25]:
pred_df = pd.DataFrame([[country,item_type,log_thickness,log_width,product_ref,delivery_year,log_selling_price,log_quantity]],
                       columns = ['country','item type','thickness','width','product_ref','delivery date','selling_price','quantity_tons'])

pred_df

Unnamed: 0,country,item type,thickness,width,product_ref,delivery date,selling_price,quantity_tons
0,32,W,-0.287682,6.907755,1670798778,2021,2.995732,2.995732


In [26]:
pref_df_trans = trans1.transform(pred_df)

In [27]:
y_pred = model.predict(pref_df_trans)

if y_pred == 1:
    print('Status : Won')
else:
    print('Status : Lost')

Status : Lost


SAVING THE ENCODER AND THE MODEL

In [28]:
pickle.dump(model,open('Status_Prediction.pkl','wb'))

In [29]:
pickle.dump(trans1,open('encoder.pkl','wb'))