In [1]:
#Required lib
import sklearn
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn import metrics
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, auc , accuracy_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import matplotlib.pyplot as plt
import pickle
import pandas as pd
     

# Data collection

In [2]:
df = pd.read_csv('copper_final.csv')

In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181640 entries, 0 to 181639
Data columns (total 18 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   quantity_tons        181640 non-null  float64
 1   customer             181640 non-null  int64  
 2   country              181640 non-null  float64
 3   status               181640 non-null  int64  
 4   item_type            181640 non-null  float64
 5   application          181640 non-null  float64
 6   thickness            181640 non-null  float64
 7   width                181640 non-null  float64
 8   product_ref          181640 non-null  int64  
 9   selling_price        181640 non-null  float64
 10  item_date            181640 non-null  object 
 11  delivery_date_pred   181640 non-null  object 
 12  item_date_day        181640 non-null  int64  
 13  item_date_month      181640 non-null  int64  
 14  item_date_year       181640 non-null  int64  
 15  delivery_date_day

In [4]:
df['status'].value_counts()

1    116009
0     34433
5     19568
7      4276
3      4169
2      3121
8        53
4        10
6         1
Name: status, dtype: int64

In [5]:
df.columns     

Index(['quantity_tons', 'customer', 'country', 'status', 'item_type',
       'application', 'thickness', 'width', 'product_ref', 'selling_price',
       'item_date', 'delivery_date_pred', 'item_date_day', 'item_date_month',
       'item_date_year', 'delivery_date_day', 'delivery_date_month',
       'delivery_date_year'],
      dtype='object')

In [6]:
df = df[['quantity_tons', 'customer', 'country',  'item_type',
       'application', 'thickness', 'width', 'product_ref',
       'item_date_day', 'item_date_month',
       'item_date_year', 'delivery_date_day', 'delivery_date_month',
       'delivery_date_year','selling_price','status']]

In [7]:
df = df.query('status==1 or status ==0')
df

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,selling_price,status
0,3.991779,30156308,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,6.749931,1
2,4.235147,30341428,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,7.217443,1
4,3.314642,30165529,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,7.001246,1
5,3.473063,30202362,27.0,3.0,2.302585,-0.510826,6.917706,628377,1,4,2021,1,7,2021,6.880384,1
7,3.035295,30211222,30.0,5.0,2.302585,-0.798508,6.802395,611993,1,4,2021,1,7,2021,7.181736,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181635,4.629691,30200854,25.0,5.0,3.713572,-0.040822,7.106606,164141591,2,7,2020,1,8,2020,6.381816,1
181636,5.337954,30200854,25.0,5.0,3.713572,-0.051293,7.313220,164141591,2,7,2020,1,8,2020,6.378426,1
181637,1.443523,30200854,25.0,5.0,3.713572,-0.342490,7.130899,164141591,2,7,2020,1,8,2020,6.428105,1
181638,-0.323075,30200854,25.0,5.0,3.713572,-0.162519,7.130899,164141591,2,7,2020,1,8,2020,6.398595,1


Unbalanced data

In [8]:
x = df.drop('status',axis=1)
y = df['status']

In [9]:
x

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,selling_price
0,3.991779,30156308,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,6.749931
2,4.235147,30341428,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,7.217443
4,3.314642,30165529,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,7.001246
5,3.473063,30202362,27.0,3.0,2.302585,-0.510826,6.917706,628377,1,4,2021,1,7,2021,6.880384
7,3.035295,30211222,30.0,5.0,2.302585,-0.798508,6.802395,611993,1,4,2021,1,7,2021,7.181736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181635,4.629691,30200854,25.0,5.0,3.713572,-0.040822,7.106606,164141591,2,7,2020,1,8,2020,6.381816
181636,5.337954,30200854,25.0,5.0,3.713572,-0.051293,7.313220,164141591,2,7,2020,1,8,2020,6.378426
181637,1.443523,30200854,25.0,5.0,3.713572,-0.342490,7.130899,164141591,2,7,2020,1,8,2020,6.428105
181638,-0.323075,30200854,25.0,5.0,3.713572,-0.162519,7.130899,164141591,2,7,2020,1,8,2020,6.398595


In [10]:
y

0         1
2         1
4         1
5         1
7         1
         ..
181635    1
181636    1
181637    1
181638    1
181639    1
Name: status, Length: 150442, dtype: int64

Oversampling

In [11]:
from imblearn.combine import SMOTETomek
model = SMOTETomek()
x_new , y_new = model.fit_resample(x,y)
x_new.shape,y_new.shape

((231034, 15), (231034,))

In [12]:
y_new.value_counts(normalize=True)

1    0.5
0    0.5
Name: status, dtype: float64

In [13]:
x_new

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,selling_price
0,3.991779,30156308,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,6.749931
1,4.235147,30341428,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,7.217443
2,3.314642,30165529,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,7.001246
3,3.473063,30202362,27.0,3.0,2.302585,-0.510826,6.917706,628377,1,4,2021,1,7,2021,6.880384
4,3.035295,30211222,30.0,5.0,2.302585,-0.798508,6.802395,611993,1,4,2021,1,7,2021,7.181736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231029,2.506856,30197559,25.0,5.0,2.708050,2.269611,7.313220,640405,3,12,2020,1,4,2021,6.554185
231030,2.325993,30205658,32.0,5.0,2.302585,0.405465,7.175376,611993,24,11,2020,1,4,2021,6.784359
231031,1.888511,30198074,26.0,3.0,2.708050,0.579570,7.241240,164337175,11,1,2021,1,6,2021,6.873054
231032,2.186356,30201223,26.0,5.0,2.708050,2.477565,7.126891,640405,11,1,2021,1,5,2021,6.726337


In [14]:
y_new

0         1
1         1
2         1
3         1
4         1
         ..
231029    0
231030    0
231031    0
231032    0
231033    0
Name: status, Length: 231034, dtype: int64

In [15]:
x_new['status'] = y_new
x_new.to_csv('classification_dataset.csv',index=False)

In [16]:
x_new

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,selling_price,status
0,3.991779,30156308,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,6.749931,1
1,4.235147,30341428,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,7.217443,1
2,3.314642,30165529,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,7.001246,1
3,3.473063,30202362,27.0,3.0,2.302585,-0.510826,6.917706,628377,1,4,2021,1,7,2021,6.880384,1
4,3.035295,30211222,30.0,5.0,2.302585,-0.798508,6.802395,611993,1,4,2021,1,7,2021,7.181736,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231029,2.506856,30197559,25.0,5.0,2.708050,2.269611,7.313220,640405,3,12,2020,1,4,2021,6.554185,0
231030,2.325993,30205658,32.0,5.0,2.302585,0.405465,7.175376,611993,24,11,2020,1,4,2021,6.784359,0
231031,1.888511,30198074,26.0,3.0,2.708050,0.579570,7.241240,164337175,11,1,2021,1,6,2021,6.873054,0
231032,2.186356,30201223,26.0,5.0,2.708050,2.477565,7.126891,640405,11,1,2021,1,5,2021,6.726337,0


In [17]:
df = pd.read_csv('classification_dataset.csv')

In [18]:
df

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,selling_price,status
0,3.991779,30156308,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,6.749931,1
1,4.235147,30341428,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,7.217443,1
2,3.314642,30165529,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,7.001246,1
3,3.473063,30202362,27.0,3.0,2.302585,-0.510826,6.917706,628377,1,4,2021,1,7,2021,6.880384,1
4,3.035295,30211222,30.0,5.0,2.302585,-0.798508,6.802395,611993,1,4,2021,1,7,2021,7.181736,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231029,2.506856,30197559,25.0,5.0,2.708050,2.269611,7.313220,640405,3,12,2020,1,4,2021,6.554185,0
231030,2.325993,30205658,32.0,5.0,2.302585,0.405465,7.175376,611993,24,11,2020,1,4,2021,6.784359,0
231031,1.888511,30198074,26.0,3.0,2.708050,0.579570,7.241240,164337175,11,1,2021,1,6,2021,6.873054,0
231032,2.186356,30201223,26.0,5.0,2.708050,2.477565,7.126891,640405,11,1,2021,1,5,2021,6.726337,0


In [19]:
x = df.drop('status',axis=True)
y = df['status']
x

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,selling_price
0,3.991779,30156308,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,6.749931
1,4.235147,30341428,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,7.217443
2,3.314642,30165529,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,7.001246
3,3.473063,30202362,27.0,3.0,2.302585,-0.510826,6.917706,628377,1,4,2021,1,7,2021,6.880384
4,3.035295,30211222,30.0,5.0,2.302585,-0.798508,6.802395,611993,1,4,2021,1,7,2021,7.181736
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
231029,2.506856,30197559,25.0,5.0,2.708050,2.269611,7.313220,640405,3,12,2020,1,4,2021,6.554185
231030,2.325993,30205658,32.0,5.0,2.302585,0.405465,7.175376,611993,24,11,2020,1,4,2021,6.784359
231031,1.888511,30198074,26.0,3.0,2.708050,0.579570,7.241240,164337175,11,1,2021,1,6,2021,6.873054
231032,2.186356,30201223,26.0,5.0,2.708050,2.477565,7.126891,640405,11,1,2021,1,5,2021,6.726337


In [20]:
y

0         1
1         1
2         1
3         1
4         1
         ..
231029    0
231030    0
231031    0
231032    0
231033    0
Name: status, Length: 231034, dtype: int64

In [21]:
def model_classification(x,y,algorithm):
    for i in algorithm:
        # Split data
        xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.3)

        # Model
        model = i().fit(xtrain,ytrain)

        # Predict for train and test accuracy

        y_train_pred = model.predict(xtrain)
        y_test_pred  = model.predict(xtest)

        # Accuracy score

        training = accuracy_score(ytrain,y_train_pred)
        testing = accuracy_score(ytest,y_test_pred)
        data = {'Algorithm':i.__name__, 'Training Acuuracy':training,'Testing Accuracy':testing}

        print(data)


In [22]:
model_classification(x,y,[DecisionTreeClassifier,ExtraTreesClassifier,RandomForestClassifier,XGBClassifier])

{'Algorithm': 'DecisionTreeClassifier', 'Training Acuuracy': 1.0, 'Testing Accuracy': 0.9619252355325995}
{'Algorithm': 'ExtraTreesClassifier', 'Training Acuuracy': 1.0, 'Testing Accuracy': 0.9850817330582448}
{'Algorithm': 'RandomForestClassifier', 'Training Acuuracy': 1.0, 'Testing Accuracy': 0.9806812771421564}
{'Algorithm': 'XGBClassifier', 'Training Acuuracy': 0.9523382573907236, 'Testing Accuracy': 0.9454199189161894}


Note ExtraTreesClassifier has given highest accuracy 

# Hyper parameter tuning

ExtraTressClassifier Hyper-parameter Tuning

In [23]:
param_grid = {
    'n_estimators': [50, 100, 150],           # Number of trees in the forest
    'criterion': ['gini', 'entropy'],       # Split criterion
    'max_depth': [None, 10, 20, 30],        # Maximum depth of trees
    'min_samples_split': [2, 5, 10],       # Min samples to split an internal node
    'min_samples_leaf': [1, 2, 4],         # Min samples required at a leaf node
    'max_features': ['auto', 'sqrt', 'log2'],  # Number of features to consider
    'class_weight': [None, 'balanced'],    # Class weight
    'random_state': [42],                  # Random seed for reproducibility
}

In [24]:
#extra_tree = ExtraTreesClassifier()
#grid_search = GridSearchCV(extra_tree, param_grid, cv=5, n_jobs=-1)
#grid_search.fit(x_new, y_new)
#print("Best Hyperparameters:", grid_search.best_params_)
     


In [25]:
#grid_search.best_score_

In [26]:
# Model building with Best Parameters

xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)

# Model
model = ExtraTreesClassifier().fit(xtrain, ytrain)

# predicition
y_pred_train = model.predict(xtrain)
y_pred_test = model.predict(xtest)
accuracy_train = metrics.accuracy_score(ytrain, y_pred_train)
accuracy_test = metrics.accuracy_score(ytest, y_pred_test)
('Training accuracy :',accuracy_train),( 'Testing Accuracy :', accuracy_test)

(('Training accuracy :', 1.0), ('Testing Accuracy :', 0.9870149544441318))

In [27]:
df.columns

Index(['quantity_tons', 'customer', 'country', 'item_type', 'application',
       'thickness', 'width', 'product_ref', 'item_date_day', 'item_date_month',
       'item_date_year', 'delivery_date_day', 'delivery_date_month',
       'delivery_date_year', 'selling_price', 'status'],
      dtype='object')

In [28]:
quantity_tons = float(input('Enter the Quantity Ton value :'))

customer = float(input('Enter the customer code :'))

country = float(input('Enter the country code :'))

item_type = float(input('Enter the item type value :'))

application = float(input('Enter the application value :'))

thickness = float(input('Enter the thickness value :'))

width = float(input('Enter the width value :'))

product_ref = float(input('Enter the product ref value :'))

item_date_day = float(input('Enter the item day :'))

item_date_month = float(input('Enter the item month:'))

item_date_year = float(input('Enter the item year :'))

delivery_date_day = float(input('Enter the delivery day :'))

delivery_date_month = float(input('Enter the delivery month :'))

delivery_date_year = float(input('Enter the delivery year :'))

selling_price = float(input('Enter the selling price :'))

Enter the Quantity Ton value :5.2336561533
Enter the customer code :286585
Enter the country code :20
Enter the item type value :4
Enter the application value :2.369955858
Enter the thickness value :1.026562965
Enter the width value :6.56296258626
Enter the product ref value :16549586
Enter the item day :25
Enter the item month:4
Enter the item year :2021
Enter the delivery day :1
Enter the delivery month :10
Enter the delivery year :2021
Enter the selling price :6.24564625632


In [29]:
x = df.loc[5000]

x.values.tolist()

[3.9010053982822,
 30205825.0,
 25.0,
 5.0,
 2.302585092994046,
 -0.5108256237659907,
 7.313220387090301,
 1332077137.0,
 24.0,
 3.0,
 2021.0,
 1.0,
 8.0,
 2021.0,
 6.985641817639208,
 1.0]

In [30]:
x

quantity_tons          3.901005e+00
customer               3.020582e+07
country                2.500000e+01
item_type              5.000000e+00
application            2.302585e+00
thickness             -5.108256e-01
width                  7.313220e+00
product_ref            1.332077e+09
item_date_day          2.400000e+01
item_date_month        3.000000e+00
item_date_year         2.021000e+03
delivery_date_day      1.000000e+00
delivery_date_month    8.000000e+00
delivery_date_year     2.021000e+03
selling_price          6.985642e+00
status                 1.000000e+00
Name: 5000, dtype: float64

In [31]:
# Provided data for prediction

# Lost

# p = [5.0550630558573335,27,3,2.302585092994046,1.0986122886681098,7.313220387090301,1670798778.0,28.0,3,2021,1,7,2021,6.891625897052253,
# ]

# WON

p = [quantity_tons, customer ,country, item_type, application,
       thickness, width, product_ref, item_date_day, item_date_month,
       item_date_year, delivery_date_day, delivery_date_month,
       delivery_date_year,
       selling_price]  # 0

# prediction

In [32]:
#model predection
x = model.predict([[3.824247867319961,
 30205825.0,
 25.0,
 5.0,
 2.302585092994046,
 0.6931471805599453,
 7.098375638590786,
 1332077137.0,
 24.0,
 3.0,
 2021.0,
 1.0,
 8.0,
 2021.0,
 6.907755278982137]])

if x[0]==1:
    print()
    print('WON')
    print()
else :
    print()
    print('LOST')
    print()


WON





In [33]:
import pickle

# Download the Pickle Model:

In [34]:
with open('classification_dataset.pkl','wb') as f:
      pickle.dump(model,f)

In [35]:
with open('classification_dataset.pkl', 'rb') as f:
    model = pickle.load(f)


In [36]:
model

In [None]:
# quantity_tons = float(input('Enter the Quantity Ton value :'))

# country = float(input('Enter the country code :'))

# item_type = float(input('Enter the item type value :'))

# application = float(input('Enter the application value :'))

# thickness = float(input('Enter the thickness value :'))

# width = float(input('Enter the width value :'))

# product_ref = float(input('Enter the product ref value :'))

# item_date_day = float(input('Enter the item day :'))

# item_date_month = float(input('Enter the item month:'))

# item_date_year = float(input('Enter the item year :'))

# delivery_date_day = float(input('Enter the delivery day :'))

# delivery_date_month = float(input('Enter the delivery month :'))

# delivery_date_year = float(input('Enter the delivery year :'))

# selling_price = float(input('Enter the selling price :'))

x = model.predict([p])

if x[0]==1:
    print()
    print('WON')
    print()
else :
    print()
    print('LOST')
    print()

LOST

# Regression model

Aim to predict selling price

In [40]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
import pickle

In [41]:
import pandas as pd

df = pd.read_csv('copper_final.csv')
df

Unnamed: 0,quantity_tons,customer,country,status,item_type,application,thickness,width,product_ref,selling_price,item_date,delivery_date_pred,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year
0,3.991779,30156308,28.0,1,5.0,2.302585,0.693147,7.313220,1670798778,6.749931,2021-04-01,2021-07-01,1,4,2021,1,7,2021
1,1.259203,30209509,30.0,2,5.0,3.713572,-0.967584,7.025538,611993,5.975038,2021-04-01,2021-07-01,1,4,2021,1,7,2021
2,4.235147,30341428,38.0,1,3.0,2.302585,-0.510826,7.150701,1668701376,7.217443,2021-04-01,2021-07-01,1,4,2021,1,7,2021
3,2.216566,30209509,30.0,2,5.0,3.713572,-0.967584,7.025538,611993,5.975038,2021-04-01,2021-07-01,1,4,2021,1,7,2021
4,3.314642,30165529,78.0,1,5.0,2.302585,-0.287682,7.130899,164141591,7.001246,2021-04-01,2021-07-01,1,4,2021,1,7,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181635,4.629691,30200854,25.0,1,5.0,3.713572,-0.040822,7.106606,164141591,6.381816,2020-07-02,2020-08-01,2,7,2020,1,8,2020
181636,5.337954,30200854,25.0,1,5.0,3.713572,-0.051293,7.313220,164141591,6.378426,2020-07-02,2020-08-01,2,7,2020,1,8,2020
181637,1.443523,30200854,25.0,1,5.0,3.713572,-0.342490,7.130899,164141591,6.428105,2020-07-02,2020-08-01,2,7,2020,1,8,2020
181638,-0.323075,30200854,25.0,1,5.0,3.713572,-0.162519,7.130899,164141591,6.398595,2020-07-02,2020-08-01,2,7,2020,1,8,2020


In [42]:
df.columns

Index(['quantity_tons', 'customer', 'country', 'status', 'item_type',
       'application', 'thickness', 'width', 'product_ref', 'selling_price',
       'item_date', 'delivery_date_pred', 'item_date_day', 'item_date_month',
       'item_date_year', 'delivery_date_day', 'delivery_date_month',
       'delivery_date_year'],
      dtype='object')

# Split data

In [43]:
df = df[['quantity_tons', 'customer', 'country', 'item_type',
       'application', 'thickness', 'width', 'product_ref',
        'item_date_day', 'item_date_month',
       'item_date_year', 'delivery_date_day', 'delivery_date_month',
       'delivery_date_year', 'status','selling_price']]
     

In [44]:
x = df.drop('selling_price',axis=1)
y = df['selling_price']
x

Unnamed: 0,quantity_tons,customer,country,item_type,application,thickness,width,product_ref,item_date_day,item_date_month,item_date_year,delivery_date_day,delivery_date_month,delivery_date_year,status
0,3.991779,30156308,28.0,5.0,2.302585,0.693147,7.313220,1670798778,1,4,2021,1,7,2021,1
1,1.259203,30209509,30.0,5.0,3.713572,-0.967584,7.025538,611993,1,4,2021,1,7,2021,2
2,4.235147,30341428,38.0,3.0,2.302585,-0.510826,7.150701,1668701376,1,4,2021,1,7,2021,1
3,2.216566,30209509,30.0,5.0,3.713572,-0.967584,7.025538,611993,1,4,2021,1,7,2021,2
4,3.314642,30165529,78.0,5.0,2.302585,-0.287682,7.130899,164141591,1,4,2021,1,7,2021,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
181635,4.629691,30200854,25.0,5.0,3.713572,-0.040822,7.106606,164141591,2,7,2020,1,8,2020,1
181636,5.337954,30200854,25.0,5.0,3.713572,-0.051293,7.313220,164141591,2,7,2020,1,8,2020,1
181637,1.443523,30200854,25.0,5.0,3.713572,-0.342490,7.130899,164141591,2,7,2020,1,8,2020,1
181638,-0.323075,30200854,25.0,5.0,3.713572,-0.162519,7.130899,164141591,2,7,2020,1,8,2020,1


In [45]:
y

0         6.749931
1         5.975038
2         7.217443
3         5.975038
4         7.001246
            ...   
181635    6.381816
181636    6.378426
181637    6.428105
181638    6.398595
181639    6.408529
Name: selling_price, Length: 181640, dtype: float64

In [48]:
def model_regression(x,y,algorithm):
    for i in algorithm:
        xtrain,xtest,ytrain,ytest = train_test_split(x,y,test_size=0.2)
        model = i().fit(xtrain,ytrain)
        # predict for train and test accuracy
        y_train_pred = model.predict(xtrain)
        y_test_pred  = model.predict(xtest)

       # Accuracy score
        training = r2_score(ytrain,y_train_pred)
        testing = r2_score(ytest,y_test_pred)
        data = {'Algorithm':i.__name__, 'Training R2 Score':training,'Testing R2 Score':testing}
        print(data)

In [49]:
model_regression(x,y,[DecisionTreeRegressor,ExtraTreesRegressor,RandomForestRegressor,XGBRegressor])
     

{'Algorithm': 'DecisionTreeRegressor', 'Training R2 Score': 0.9999360501669607, 'Testing R2 Score': 0.9184726252087668}
{'Algorithm': 'ExtraTreesRegressor', 'Training R2 Score': 0.9999167562153229, 'Testing R2 Score': 0.9553569212753874}
{'Algorithm': 'RandomForestRegressor', 'Training R2 Score': 0.9938750552668101, 'Testing R2 Score': 0.9587834615458357}
{'Algorithm': 'XGBRegressor', 'Training R2 Score': 0.9510500998398758, 'Testing R2 Score': 0.9469426793367377}


# Hyper parameter Tuning Using grid-search cv

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2,random_state=42)
param_grid_r = {'max_depth'      : [2, 5, 10, 20],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf' : [1, 2, 4],
              'max_features'     : ['sqrt', 'log2', None]}

grid_search_r = GridSearchCV(estimator=RandomForestRegressor(), param_grid=param_grid_r, cv=5, n_jobs=-1)
grid_search_r.fit(xtrain, ytrain)

In [None]:
grid_search_r.best_params_,grid_search_r.best_score_

In [None]:
xtrain, xtest, ytrain, ytest = train_test_split(x,y,test_size=0.2)

model = ExtraTreesRegressor().fit(xtrain, ytrain)

y_pred_train = model.predict(xtrain)

y_pred_test = model.predict(xtest)

r2_train = r2_score(ytrain, y_pred_train)

r2_test = r2_score(ytest, y_pred_test)

r2_train, r2_test

# with normal df
     

In [53]:
quantity_tons = float(input('Enter the Quantity Ton value :'))

customer = float(input('enter the cusomer value'))

country = float(input('Enter the country code :'))

status = float(input('Enter the  status :'))

item_type = float(input('Enter the item type value :'))

application = float(input('Enter the application value :'))

thickness = float(input('Enter the thickness value :'))

width = float(input('Enter the width value :'))

product_ref = float(input('Enter the product ref value :'))

item_date_day = float(input('Enter the item day :'))

item_date_month = float(input('Enter the item month:'))

item_date_year = float(input('Enter the item year :'))

delivery_date_day = float(input('Enter the delivery day :'))

delivery_date_month = float(input('Enter the delivery month :'))

delivery_date_year = float(input('Enter the delivery year :'))



Enter the Quantity Ton value :1.2563
enter the cusomer value25462
Enter the country code :24
Enter the  status :3
Enter the item type value :5
Enter the application value :2.35646
Enter the thickness value :2.2696
Enter the width value :1.26656
Enter the product ref value :454666
Enter the item day :13
Enter the item month:6
Enter the item year :2020
Enter the delivery day :1
Enter the delivery month :9
Enter the delivery year :2021


In [54]:
df.loc[10000]

quantity_tons          1.849617e+00
customer               3.016038e+07
country                7.800000e+01
item_type              5.000000e+00
application            3.713572e+00
thickness             -3.566749e-01
width                  7.129298e+00
product_ref            6.119930e+05
item_date_day          1.900000e+01
item_date_month        3.000000e+00
item_date_year         2.021000e+03
delivery_date_day      1.000000e+00
delivery_date_month    7.000000e+00
delivery_date_year     2.021000e+03
status                 2.000000e+00
selling_price          5.975038e+00
Name: 10000, dtype: float64

In [55]:
x = df.loc[1000]
x.values.tolist()

[4.571440618970477,
 30271289.0,
 28.0,
 5.0,
 2.70805020110221,
 1.791759469228055,
 7.313220387090301,
 1670798778.0,
 31.0,
 3.0,
 2021.0,
 1.0,
 9.0,
 2021.0,
 1.0,
 6.880384082186005]

In [56]:
selling_price = model.predict([[4.571440618970477,
 30271289.0,
 28.0,
 5.0,
 2.70805020110221,
 1.791759469228055,
 7.313220387090301,
 1670798778.0,
 31.0,
 3.0,
 2021.0,
 1.0,
 9.0,
 2021.0,
 1.0]])

import numpy as np

f'predicted selling price : {(np.exp(selling_price))}' f'actual price {np.exp(6.880384082186005)}'



'predicted selling price : [2.71828183]actual price 972.9999999999999'

# Download Pickle Model

In [57]:
with open('regression_model.pkl', 'wb') as f:
    pickle.dump(model, f)
     


In [59]:
with open('regression_model.pkl', 'rb') as f:
    model = pickle.load(f)
     

# Prediction

In [60]:
sp = [np.log(54.1511386169278),30156308,28,5,np.log(10),np.log(2),np.log(1500),1670798778,1,4,2021,1,7,2021,1]

In [61]:
sell = model.predict([sp])



In [62]:
np.exp(sell)

array([2.71828183])

In [63]:
# Sellling -- manufacturer year plays major role -- item year   , type of item -- item_type , application -- code
     