In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [22]:
df=pd.read_csv("cleaned_data.csv") # for regression 
df_c=df.query('status == 0 or status == 1') # for classification - considering only Won and Loss

In [23]:
df.head()

Unnamed: 0,quantity tons,customer,country,application,thickness,width,product_ref,selling_price,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,item type,status
0,3.991779,30156308.0,28.0,10.0,0.693147,1500.0,1670798778,6.749931,1.0,4.0,2021.0,1.0,4.0,2021.0,0.0,1.0
1,5.020288,30202938.0,25.0,41.0,-0.223144,1210.0,1668701718,6.953684,1.0,4.0,2021.0,1.0,4.0,2021.0,0.0,1.0
2,5.020288,30153963.0,30.0,28.0,-0.967584,952.0,628377,6.468211,1.0,4.0,2021.0,1.0,4.0,2021.0,1.0,1.0
3,5.020288,30349574.0,32.0,59.0,0.832909,1317.0,1668701718,6.64379,1.0,4.0,2021.0,1.0,4.0,2021.0,2.0,1.0
4,5.020288,30211560.0,28.0,10.0,1.386294,1980.0,640665,6.357842,1.0,4.0,2021.0,1.0,4.0,2021.0,0.0,1.0


In [24]:
df.shape

(181636, 16)

In [6]:
df_c.head()

Unnamed: 0,quantity tons,customer,country,application,thickness,width,product_ref,selling_price,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,item type,status
0,3.991779,30156308.0,28.0,10.0,0.693147,1500.0,1670798778,6.749931,1.0,4.0,2021.0,1.0,4.0,2021.0,0.0,1.0
1,6.643822,30202938.0,25.0,41.0,-0.223144,1210.0,1668701718,6.953684,1.0,4.0,2021.0,1.0,4.0,2021.0,0.0,1.0
2,5.956169,30153963.0,30.0,28.0,-0.967584,952.0,628377,6.468211,1.0,4.0,2021.0,1.0,4.0,2021.0,1.0,1.0
3,5.310301,30349574.0,32.0,59.0,0.832909,1317.0,1668701718,6.64379,1.0,4.0,2021.0,1.0,4.0,2021.0,2.0,1.0
4,6.666354,30211560.0,28.0,10.0,1.386294,1980.0,640665,6.357842,1.0,4.0,2021.0,1.0,4.0,2021.0,0.0,1.0


In [7]:
df_c.shape

(150438, 16)

## Model building

#### Regression model - to predict selling price

In [25]:
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.tree import ExtraTreeRegressor
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error , r2_score
import pickle

In [26]:
# To split the train and test data

X=df.drop("selling_price",axis=1)
y=df['selling_price']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.2,random_state=42)

In [27]:
# choosing the best algorithm

models=[LinearRegression,DecisionTreeRegressor,RandomForestRegressor,GradientBoostingRegressor,ExtraTreeRegressor,XGBRegressor]

for i in models :
    model=i().fit(X_train,y_train)
    y_pred=model.predict(X_test)
    mse=mean_squared_error(y_test,y_pred)
    r2=r2_score(y_test,y_pred)

    print("-------------------------------")
    print(i.__name__,":")
    print("mean_squared_error :",mse)
    print("r2_score :",r2)


-------------------------------
LinearRegression :
mean_squared_error : 0.047071009286811114
r2_score : 0.41363200879215833
-------------------------------
DecisionTreeRegressor :
mean_squared_error : 0.00533522837865582
r2_score : 0.9335385581395631
-------------------------------
RandomForestRegressor :
mean_squared_error : 0.002996626302013862
r2_score : 0.9626707442280236
-------------------------------
GradientBoostingRegressor :
mean_squared_error : 0.00811453184075381
r2_score : 0.8989165134305276
-------------------------------
ExtraTreeRegressor :
mean_squared_error : 0.009438155237324317
r2_score : 0.8824280122506685
-------------------------------
XGBRegressor :
mean_squared_error : 0.0037756148797920496
r2_score : 0.9529668102260468


Considering the MSE and r2 score - **Random Forest** performs well . Proceeding with Random Forest for regression

#### Hyper parameter tuning using GridSeachCV

In [None]:
param_grid_r = {
    'max_depth': [None, 2, 5, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': [None, 'sqrt', 'log2']
}

grid_search_r=GridSearchCV(estimator=RandomForestRegressor() , param_grid=param_grid_r, cv=5, n_jobs=-1,verbose=2)
grid_search_r.fit(X_train,y_train)
print('Best Parameters:',grid_search_r.best_params_)
print("-------------------------------")
print('Best score:',grid_search_r.best_score_)

In [28]:
# predicting the selling price with hypertuning parameters and calculating the accuracy using metrics

model=RandomForestRegressor(max_depth= 20, max_features= None, min_samples_leaf= 1, min_samples_split= 2)
model.fit(X_train,y_train)
y_pred=model.predict(X_test)
mse=mean_squared_error(y_test,y_pred)
r2=r2_score(y_test,y_pred)

print("mean_squared_error :",mse)
print("r2_score :",r2)

mean_squared_error : 0.003005465298520558
r2_score : 0.9625606360169515


In [29]:
# save the regression model by using pickle

with open('regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)

In [30]:
# load the pickle model to predict selling price

with open('regression_model.pkl', 'rb') as f:
    model = pickle.load(f)


In [31]:
# testing the model for accuracy with a random record from the cleaned dataframe before transforming

sp = [np.log(202.411065),30349574.0,32.0,59.0,np.log(2.3),np.log(1317.0),1668701718,1.0,4.0,2021.0,1.0,4.0,2021.0,2.0,1.0]
y_pred = model.predict([sp])
np.exp(y_pred[0])



901.0470759927149

#### Classification model - to predict status

In [32]:
from sklearn.model_selection import train_test_split , GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.tree import ExtraTreeClassifier
from xgboost import XGBClassifier
from imblearn.combine import SMOTETomek
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import StandardScaler,MinMaxScaler
from sklearn.metrics import accuracy_score , f1_score , roc_auc_score , confusion_matrix , classification_report,ConfusionMatrixDisplay

In [33]:
df_c['status'].value_counts()

status
1.0    116007
0.0     34431
Name: count, dtype: int64

As we can see that there is an imbalance in the target variable and hence we should perform oversampling .

In [34]:
# To split the train and test data

X=df_c.drop("status",axis=1)
y=df_c['status']
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.3,random_state=42)

In [35]:
# Using SMOTETomek

smoteTomek = SMOTETomek(random_state=42)
X_resampled, y_resampled = smoteTomek.fit_resample(X_train, y_train)

In [36]:
# choosing the best algorithm - using SMOTETomek

models=[LogisticRegression,DecisionTreeClassifier,RandomForestClassifier,GradientBoostingClassifier,ExtraTreeClassifier,XGBClassifier]

for i in models :
    model=i().fit(X_resampled,y_resampled)
    y_pred=model.predict(X_test)
    accuracy=accuracy_score(y_test,y_pred)
    f1=f1_score(y_test,y_pred)
    roc_auc=roc_auc_score(y_test,y_pred)

    print("-------------------------------")
    print(i.__name__,":")
    print("accuracy_score :",accuracy)
    print("f1_score:",f1)
    print("roc_auc_score :",roc_auc)


-------------------------------
LogisticRegression :
accuracy_score : 0.6515775946113622
f1_score: 0.7661953402619802
roc_auc_score : 0.5456295969231362
-------------------------------
DecisionTreeClassifier :
accuracy_score : 0.9411504032615439
f1_score: 0.9614860357878252
roc_auc_score : 0.9238313003456529
-------------------------------
RandomForestClassifier :
accuracy_score : 0.9633741026322786
f1_score: 0.9760715681591176
roc_auc_score : 0.9530548704313796
-------------------------------
GradientBoostingClassifier :
accuracy_score : 0.7852078347957103
f1_score: 0.8509395085647508
roc_auc_score : 0.7704164664038722
-------------------------------
ExtraTreeClassifier :
accuracy_score : 0.916777452805105
f1_score: 0.9453402409919087
roc_auc_score : 0.89370995677771
-------------------------------
XGBClassifier :
accuracy_score : 0.9177745280510502
f1_score: 0.9455585711142082
roc_auc_score : 0.9043106238139795


Considering the accuracy and f1 score - **Random Forest** performs well . Proceeding with Random Forest for classification

#### Hyper parameter tuning using GridSeachCV

In [None]:
param_grid_r = {'n_estimators': [50, 100, 150],
                'max_depth': [None, 10, 20],
                'min_samples_split': [2, 5, 10],
                'min_samples_leaf': [1, 2, 4],
                'max_features': ['sqrt', 'log2']}


grid_search_r=GridSearchCV(estimator=RandomForestClassifier() , param_grid=param_grid_r, cv=5, n_jobs=-2)
grid_search_r.fit(X_resampled,y_resampled)
print('Best Parameters:',grid_search_r.best_params_)
print("-------------------------------")
print('Best score:',grid_search_r.best_score_)

In [37]:
# predicting the status with hypertuning parameters and calculating the accuracy using metrics

model=RandomForestClassifier(n_estimators=100,max_depth=30, max_features= 'sqrt', min_samples_leaf=1, min_samples_split= 2)
model.fit(X_resampled,y_resampled)
y_pred=model.predict(X_test)
accuracy=accuracy_score(y_test,y_pred)
f1=f1_score(y_test,y_pred)
roc_auc=roc_auc_score(y_test,y_pred)
print("-------------------------------")
print("Random Forest Classifier:")
print("accuracy_score :",accuracy)
print("f1_score:",f1)
print("roc_auc_score :",roc_auc)

-------------------------------
Random Forest Classifier:
accuracy_score : 0.963219002038465
f1_score: 0.9759510908932866
roc_auc_score : 0.9537554191612039


In [38]:
# saving as a pickle file
with open("classification_model.pkl",'wb') as f:
    pickle.dump(model,f)

In [39]:
# load the pickle model to predict the status

with open("classification_model.pkl",'rb') as f:
    model=pickle.load(f)

In [40]:
sp=[np.log(768.024839),30202938.0,25.0,41.0,np.log(0.80),np.log(1210.0),1668701718,1047.00,1.0,4.0,2021.0,1.0,4.0,2021.0,0.0]

y_pred=model.predict([sp])
y_pred[0]



1.0