## Import necessary libraries

In [2]:
import sys
print(sys.version)

3.11.2 (tags/v3.11.2:878ead1, Feb  7 2023, 16:38:35) [MSC v.1934 64 bit (AMD64)]


In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import BaggingRegressor
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv(r"H:\Guvi\Project\Industrial Copper Model\Dataset\Copper_Set.csv")

In [5]:
df.drop(columns=['id', 'delivery date', 'material_ref'], axis=1, inplace=True)

In [6]:
df['item_date'] = df['item_date'].replace(19950000.0, 19950101.0)

In [7]:
df['item_date'] = pd.to_datetime(df['item_date'], format='%Y%m%d', errors='coerce')

In [8]:
## since I got error while running format correction, I did this changes 
df['quantity tons'] = df['quantity tons'].replace('e', np.nan)

In [9]:
df['quantity tons'] = pd.to_numeric(df['quantity tons'])

In [10]:
df['item_date'] = df['item_date'].fillna(df['item_date'].mode()[0])
df['quantity tons'] = df['quantity tons'].fillna(df['quantity tons'].median())
df['customer'] = df['customer'].fillna(df['customer'].mode()[0])
df['country'] = df['country'].fillna(df['country'].median())
df['status'] = df['status'].fillna(df['status'].mode()[0])
df['application'] = df['application'].fillna(df['application'].median())
df['thickness'] = df['thickness'].fillna(df['thickness'].median())
df['selling_price'] = df['selling_price'].fillna(df['selling_price'].median())

In [11]:
## Since some values contains garbage value('0') at the start so removing with left strip technique
# df['material_ref'] = df['material_ref'].str.lstrip('0')    

In [12]:
# ## Used random sampling imputation for filling nan values since data is missing at random and also considering there are many categorical values
# def impute_nan_material(df, feature):
#     random_sample = df[feature].dropna().sample(df[feature].isnull().sum(),random_state=0)
#     random_sample.index = df[df[feature].isnull()].index
#     df.loc[df[feature].isnull(), feature] = random_sample

# impute_nan_material(df, 'material_ref')

In [13]:
q1 = df['quantity tons'].quantile(0.25)
print('q1', q1)
q3 = df['quantity tons'].quantile(0.75)
print('q3', q3)
median = df['quantity tons'].median()
print('median', median)
iqr = q3 - q1
print('iqr', iqr)
lower_bound = q1 - 3 * iqr
print('lower_bound', lower_bound)
upper_bound = q3 + 3 * iqr
print('upper_bound', upper_bound)

q1 10.9703737835989
q3 67.1603846825272
median 30.3646354529424
iqr 56.19001089892829
lower_bound -157.599658913186
upper_bound 235.7304173793121


In [14]:
##since weight cannot be negative but the values seems to be within range so i tried removing minus sign from the values to see how it works.
df['quantity tons'] = np.where(df['quantity tons']<-1000, q1, df['quantity tons'])
df['quantity tons'] = np.where(df['quantity tons']<0,df['quantity tons'].abs() , df['quantity tons'])

In [15]:
df['quantity tons'] = np.where(df['quantity tons']>1.000000e+05,upper_bound , df['quantity tons'])

In [16]:
df['quantity tons']  = np.log1p(df['quantity tons'])

In [17]:
q1 = df['thickness'].quantile(0.25)
print('q1', q1)
q3 = df['thickness'].quantile(0.75)
print('q3', q3)
median = df['thickness'].median()
print('median', median)
iqr = q3 - q1
print('iqr', iqr)
lower_bound = q1 - 3 * iqr
print('lower_bound', lower_bound)
upper_bound = q3 + 3 * iqr
print('upper_bound', upper_bound)

q1 0.7
q3 3.0
median 1.5
iqr 2.3
lower_bound -6.199999999999999
upper_bound 9.899999999999999


In [18]:
df['thickness'] = np.where(df['thickness']>250, upper_bound, df['thickness'])

In [19]:
df['thickness'] = np.log(df['thickness'])

In [20]:
q1 = df['selling_price'].quantile(0.25)
print('q1', q1)
q3 = df['selling_price'].quantile(0.75)
print('q3', q3)
median = df['selling_price'].median()
print('median', median)
iqr = q3 - q1
print('iqr', iqr)
lower_bound = q1 - 3 * iqr
print('lower_bound', lower_bound)
upper_bound = q3 + 3 * iqr
print('upper_bound', upper_bound)

q1 669.0
q3 953.0
median 812.0
iqr 284.0
lower_bound -183.0
upper_bound 1805.0


In [21]:
df['selling_price'] = np.where(df['selling_price']<=0, q1, df['selling_price'])

In [22]:
df['selling_price'] = np.where(df['selling_price']>1e4, upper_bound, df['selling_price'])

In [23]:
df['selling_price'] = (df['selling_price'])**(1/2)

In [24]:
## frequency or count encoding is done for status, item type, material_ref
for feature in ['status', 'item type']:
    feature_map = df[feature].value_counts().to_dict()
    df[feature] = df[feature].map(feature_map)

In [25]:
df['item_date'] = df['item_date'].astype(str)

In [26]:
df['item_delivery_year'] = df['item_date'].apply(lambda x : x.split('-')[0])
df['item_delivery_month'] = df['item_date'].apply(lambda x : x.split('-')[1])
df['item_delivery_date'] = df['item_date'].apply(lambda x : x.split('-')[2])

In [27]:
df.drop(columns='item_date', axis=1,inplace=True)

In [28]:
df['item_delivery_year'] = df['item_delivery_year'].astype(int)
df['item_delivery_month'] = df['item_delivery_month'].astype(int)
df['item_delivery_date'] = df['item_delivery_date'].astype(int)

In [29]:
df['item_delivery_year'].describe()

count    181673.000000
mean       2020.390085
std           0.491424
min        1995.000000
25%        2020.000000
50%        2020.000000
75%        2021.000000
max        2021.000000
Name: item_delivery_year, dtype: float64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181673 entries, 0 to 181672
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   quantity tons        181673 non-null  float64
 1   customer             181673 non-null  float64
 2   country              181673 non-null  float64
 3   status               181673 non-null  int64  
 4   item type            181673 non-null  int64  
 5   application          181673 non-null  float64
 6   thickness            181673 non-null  float64
 7   width                181673 non-null  float64
 8   product_ref          181673 non-null  int64  
 9   selling_price        181673 non-null  float64
 10  item_delivery_year   181673 non-null  int32  
 11  item_delivery_month  181673 non-null  int32  
 12  item_delivery_date   181673 non-null  int32  
dtypes: float64(7), int32(3), int64(3)
memory usage: 15.9 MB


In [31]:
## Assigning features into input(X) and ouput(y)
X = df.drop(labels=['selling_price'], axis=1)
y = df['selling_price']

In [32]:
## Splitting dataset for testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [33]:
model = BaggingRegressor(n_estimators=30, random_state=42)

In [34]:
model.fit(X_train, y_train)

In [35]:
y_pred = model.predict(X_test)

In [36]:
model_r2_score = r2_score(y_test,y_pred)
print(model_r2_score)

0.9626909788123424


In [37]:
model_mse = mean_squared_error(y_test,y_pred)
model_mae = mean_absolute_error(y_test,y_pred)
model_rmse = np.sqrt(model_mse)
print(model_mse)
print(model_mae)
print(model_rmse)

0.8123043690796415
0.4807016983110685
0.9012792958232434


In [38]:
Adjusted_r2_score = (1 - (1-model_r2_score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))
Adjusted_r2_score

0.962681118700149

### New prediction

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181673 entries, 0 to 181672
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   quantity tons        181673 non-null  float64
 1   customer             181673 non-null  float64
 2   country              181673 non-null  float64
 3   status               181673 non-null  int64  
 4   item type            181673 non-null  int64  
 5   application          181673 non-null  float64
 6   thickness            181673 non-null  float64
 7   width                181673 non-null  float64
 8   product_ref          181673 non-null  int64  
 9   selling_price        181673 non-null  float64
 10  item_delivery_year   181673 non-null  int32  
 11  item_delivery_month  181673 non-null  int32  
 12  item_delivery_date   181673 non-null  int32  
dtypes: float64(7), int32(3), int64(3)
memory usage: 15.9 MB


In [39]:
status = {'Won': 116012, 'Lost': 34438, 'Not lost for AM': 19573, 'Revised': 4276, 'To be approved': 4170, 'Draft': 3140, 'Offered': 53, 'Offerable': 10, 'Wonderful': 1}

In [41]:
item_type = {'W': 105615, 'S': 69236, 'PL': 5660, 'Others': 610, 'WI': 524, 'IPL': 27, 'SLAWR': 1}

In [46]:
country = [ 28.,  25.,  30.,  32.,  38.,  78.,  27.,  77., 113.,  79.,  26., 39.,  40.,  84.,  80., 107.,  89.]

In [47]:
application = [10., 41., 28., 59., 15.,  4., 38., 56., 42., 26., 27., 19., 20., 66., 29., 22., 40., 25., 67., 79.,
               3., 99.,  2.,  5., 39., 69., 70., 65., 58., 68.]

In [48]:
product_ref = [1670798778, 1668701718,     628377,     640665,     611993, 1668701376,  164141591, 1671863738, 1332077137,     640405,
              1693867550, 1665572374, 1282007633, 1668701698,     628117, 1690738206,     628112,     640400, 1671876026,
              164336407, 164337175, 1668701725, 1665572032,     611728, 1721130331, 1693867563, 611733, 1690738219, 
               1722207579,  929423819, 1665584320, 1665584662, 1665584642]

In [45]:
df['customer'].describe()

count    1.816730e+05
mean     3.051221e+07
std      2.433375e+07
min      1.245800e+04
25%      3.019688e+07
50%      3.020524e+07
75%      3.028042e+07
max      2.147484e+09
Name: customer, dtype: float64

In [50]:
df.columns

Index(['quantity tons', 'customer', 'country', 'status', 'item type',
       'application', 'thickness', 'width', 'product_ref', 'selling_price',
       'item_delivery_year', 'item_delivery_month', 'item_delivery_date'],
      dtype='object')

In [48]:
new_value = np.array([500, 250000, 32., 19573, 69236, 56., 15, 1540, 1668701376, 2003, 5, 25]).reshape(1,-1)

In [49]:
new_value

array([[5.00000000e+02, 2.50000000e+05, 3.20000000e+01, 1.95730000e+04,
        6.92360000e+04, 5.60000000e+01, 1.50000000e+01, 1.54000000e+03,
        1.66870138e+09, 2.00300000e+03, 5.00000000e+00, 2.50000000e+01]])

In [50]:
selling_price = (model.predict(new_value))**2

In [51]:
round(selling_price[0])

864

In [52]:
X_test

Unnamed: 0,quantity tons,customer,country,status,item type,application,thickness,width,product_ref,item_delivery_year,item_delivery_month,item_delivery_date
46308,2.125729,30160005.0,78.0,116012,105615,41.0,-0.755023,1246.0,611993,2021,2,2
29893,4.614464,30394817.0,78.0,116012,105615,10.0,-0.693147,1250.0,611993,2021,3,1
159082,4.654407,30161289.0,26.0,116012,69236,10.0,-0.287682,1300.0,1668701718,2020,8,12
108098,3.515408,30203192.0,27.0,34438,69236,15.0,-0.356675,1250.0,628377,2020,11,6
2960,2.344968,30153510.0,30.0,116012,105615,41.0,-0.765718,1260.0,611993,2021,3,29
...,...,...,...,...,...,...,...,...,...,...,...,...
135201,3.627290,30287099.0,32.0,34438,105615,41.0,-0.597837,1220.0,611993,2020,9,29
138613,1.472248,30205825.0,25.0,116012,105615,10.0,-0.223144,1250.0,1332077137,2020,9,24
110531,5.345906,30201306.0,26.0,19573,69236,28.0,-0.916291,1210.0,628377,2020,11,3
173592,4.360523,30155824.0,25.0,116012,105615,10.0,-0.040822,1250.0,611993,2020,7,13


In [53]:
x_test_26545 = np.array(X_test.loc[26545]).reshape(1,-1)

In [54]:
selling_price = (model.predict(x_test_26545))**2

In [57]:
round(selling_price[0])

1001

In [58]:
round(y_test.loc[26545]**2)

1001

In [41]:
import pickle

In [42]:
pickle.dump(model, open('copper_reg_model.pkl','wb'))

In [None]:
copper_model = pickle.load(open('copper_reg_model.pkl', 'rb'))

In [None]:
copper_model.predict(X_test[2].reshape(1,-1))

In [None]:
pickle.dump(scaler,open('transformation_copper_df.pkl','wb'))

In [43]:
import bz2file as bz2

In [44]:
def compressed_pickle(filename, data):
    with bz2.BZ2File(filename + '.pbz2', 'w') as f:
        pickle.dump(data, f)

In [45]:
compressed_pickle('copper_reg_model', model)

In [1]:
def decompress_pickle(file):
    data = bz2.BZ2File(file, 'rb')
    data = pickle.load(data)
    return data

In [39]:
import joblib
joblib.dump(model, 'copper_reg_model',compress=9)

['copper_reg_model']

In [40]:
uncompressed_model = joblib.load('copper_reg_model')

In [64]:
sp = (uncompressed_model.predict(x_test_26545))**2

In [66]:
round(sp[0])

1001

In [49]:
import gzip, pickle
with gzip.open('finalized_model.pkl.gz', 'wb') as ofp:
    pickle.dump(model, ofp)

In [50]:
with gzip.open('finalized_model.pkl.gz', 'rb') as ifp:
    uncompressed_model1 = pickle.load(ifp)

In [69]:
uncompressed_model1.predict(X_test[2].reshape(1,-1))

array([21.68635201])

In [67]:
fns_input = [np.log(quantity_tons), customer, selected_country, status[selected_status],
                    item_type[selected_item_type], selected_application, np.log(thickness), width, selected_product_ref,
                    item_year, item_month, item_date]

NameError: name 'quantity_tons' is not defined