## Import necessary libraries

In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.metrics import r2_score
from sklearn.ensemble import BaggingRegressor
import warnings
warnings.filterwarnings('ignore')

In [4]:
df = pd.read_csv(r"H:\Projects\End_to_End_Industrial_Copper_Modelling\Databook\Copper_Set.csv")

In [5]:
df.drop(columns=['id', 'delivery date', 'material_ref'], axis=1, inplace=True)

In [6]:
df['item_date'] = df['item_date'].replace(19950000.0, 19950101.0)

In [7]:
df['item_date'] = pd.to_datetime(df['item_date'], format='%Y%m%d', errors='coerce')

In [8]:
## since I got error while running format correction, I did this changes 
df['quantity tons'] = df['quantity tons'].replace('e', np.nan)

In [9]:
df['quantity tons'] = pd.to_numeric(df['quantity tons'])

In [10]:
df['item_date'] = df['item_date'].fillna(df['item_date'].mode()[0])
df['quantity tons'] = df['quantity tons'].fillna(df['quantity tons'].median())
df['customer'] = df['customer'].fillna(df['customer'].mode()[0])
df['country'] = df['country'].fillna(df['country'].median())
df['status'] = df['status'].fillna(df['status'].mode()[0])
df['application'] = df['application'].fillna(df['application'].median())
df['thickness'] = df['thickness'].fillna(df['thickness'].median())
df['selling_price'] = df['selling_price'].fillna(df['selling_price'].median())

In [11]:
q1 = df['quantity tons'].quantile(0.25)
print('q1', q1)
q3 = df['quantity tons'].quantile(0.75)
print('q3', q3)
median = df['quantity tons'].median()
print('median', median)
iqr = q3 - q1
print('iqr', iqr)
lower_bound = q1 - 3 * iqr
print('lower_bound', lower_bound)
upper_bound = q3 + 3 * iqr
print('upper_bound', upper_bound)

q1 10.9703737835989
q3 67.1603846825272
median 30.3646354529424
iqr 56.19001089892829
lower_bound -157.599658913186
upper_bound 235.7304173793121


In [12]:
##since weight cannot be negative but the values seems to be within range so i tried removing minus sign from the values to see how it works.
df['quantity tons'] = np.where(df['quantity tons']<-1000, q1, df['quantity tons'])
df['quantity tons'] = np.where(df['quantity tons']<0,df['quantity tons'].abs() , df['quantity tons'])

In [13]:
df['quantity tons'] = np.where(df['quantity tons']>1.000000e+05,upper_bound , df['quantity tons'])

In [14]:
df['quantity_tons']  = np.log1p(df['quantity tons'])

In [15]:
df.drop(columns='quantity tons', axis=1, inplace=True)

In [16]:
q1 = df['thickness'].quantile(0.25)
print('q1', q1)
q3 = df['thickness'].quantile(0.75)
print('q3', q3)
median = df['thickness'].median()
print('median', median)
iqr = q3 - q1
print('iqr', iqr)
lower_bound = q1 - 3 * iqr
print('lower_bound', lower_bound)
upper_bound = q3 + 3 * iqr
print('upper_bound', upper_bound)

q1 0.7
q3 3.0
median 1.5
iqr 2.3
lower_bound -6.199999999999999
upper_bound 9.899999999999999


In [17]:
df['thickness'] = np.where(df['thickness']>250, upper_bound, df['thickness'])

In [18]:
df['thickness'] = np.log1p(df['thickness'])

In [19]:
q1 = df['selling_price'].quantile(0.25)
print('q1', q1)
q3 = df['selling_price'].quantile(0.75)
print('q3', q3)
median = df['selling_price'].median()
print('median', median)
iqr = q3 - q1
print('iqr', iqr)
lower_bound = q1 - 3 * iqr
print('lower_bound', lower_bound)
upper_bound = q3 + 3 * iqr
print('upper_bound', upper_bound)

q1 669.0
q3 953.0
median 812.0
iqr 284.0
lower_bound -183.0
upper_bound 1805.0


In [20]:
df['selling_price'] = np.where(df['selling_price']<=0, q1, df['selling_price'])

In [21]:
df['selling_price'] = np.where(df['selling_price']>1e4, upper_bound, df['selling_price'])

In [22]:
df['selling_price'] = (df['selling_price'])**(1/2)

In [23]:
## frequency or count encoding is done for status, item type, material_ref
for feature in ['status', 'item type']:
    feature_map = df[feature].value_counts().to_dict()
    df[feature] = df[feature].map(feature_map)

In [24]:
df = df.rename(columns={'item type': 'item_type'})

In [25]:
df['item_date'] = df['item_date'].astype(str)

In [26]:
df['item_delivery_year'] = df['item_date'].apply(lambda x : x.split('-')[0])
df['item_delivery_month'] = df['item_date'].apply(lambda x : x.split('-')[1])
df['item_delivery_date'] = df['item_date'].apply(lambda x : x.split('-')[2])

In [27]:
df.drop(columns='item_date', axis=1,inplace=True)

In [28]:
df['item_delivery_year'] = df['item_delivery_year'].astype(int)
df['item_delivery_month'] = df['item_delivery_month'].astype(int)
df['item_delivery_date'] = df['item_delivery_date'].astype(int)

In [29]:
df['item_delivery_year'].describe()

count    181673.000000
mean       2020.390085
std           0.491424
min        1995.000000
25%        2020.000000
50%        2020.000000
75%        2021.000000
max        2021.000000
Name: item_delivery_year, dtype: float64

In [30]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181673 entries, 0 to 181672
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   customer             181673 non-null  float64
 1   country              181673 non-null  float64
 2   status               181673 non-null  int64  
 3   item_type            181673 non-null  int64  
 4   application          181673 non-null  float64
 5   thickness            181673 non-null  float64
 6   width                181673 non-null  float64
 7   product_ref          181673 non-null  int64  
 8   selling_price        181673 non-null  float64
 9   quantity_tons        181673 non-null  float64
 10  item_delivery_year   181673 non-null  int32  
 11  item_delivery_month  181673 non-null  int32  
 12  item_delivery_date   181673 non-null  int32  
dtypes: float64(7), int32(3), int64(3)
memory usage: 15.9 MB


In [31]:
df['item_type'].value_counts()

item_type
105615    105615
69236      69236
5660        5660
610          610
524          524
27            27
1              1
Name: count, dtype: int64

In [32]:
## Assigning features into input(X) and ouput(y)
X = df.drop(labels=['selling_price'], axis=1)
y = df['selling_price']

In [33]:
## Splitting dataset for testing and training
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [32]:
df

Unnamed: 0,customer,country,status,item_type,application,thickness,width,product_ref,selling_price,quantity_tons,item_delivery_year,item_delivery_month,item_delivery_date
0,30156308.0,28.0,116012,105615,10.0,1.098612,1500.0,1670798778,29.223278,4.010077,2021,4,1
1,30202938.0,25.0,116012,105615,41.0,0.587787,1210.0,1668701718,32.357379,6.645123,2021,4,1
2,30153963.0,30.0,116012,524,28.0,0.322083,952.0,628377,25.383656,5.958755,2021,4,1
3,30349574.0,32.0,116012,69236,59.0,1.193922,1317.0,1668701718,27.712813,5.315229,2021,4,1
4,30211560.0,28.0,116012,105615,10.0,1.609438,2000.0,640665,24.020824,6.667626,2021,4,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
181668,30200854.0,25.0,116012,105615,41.0,0.672944,1220.0,164141591,24.310492,4.639402,2020,7,2
181669,30200854.0,25.0,116012,105615,41.0,0.667829,1500.0,164141591,24.269322,5.342748,2020,7,2
181670,30200854.0,25.0,116012,105615,41.0,0.536493,1250.0,164141591,24.879711,1.655480,2020,7,2
181671,30200854.0,25.0,116012,105615,41.0,0.615186,1250.0,164141591,24.515301,2.482435,2020,7,2


In [34]:
X_train.columns

Index(['customer', 'country', 'status', 'item_type', 'application',
       'thickness', 'width', 'product_ref', 'quantity_tons',
       'item_delivery_year', 'item_delivery_month', 'item_delivery_date'],
      dtype='object')

In [35]:
model = BaggingRegressor(n_estimators=30, random_state=42)

In [36]:
model.fit(X_train, y_train)

In [37]:
y_pred = model.predict(X_test)

In [38]:
model_r2_score = r2_score(y_test,y_pred)
print(model_r2_score)

0.9626754619373032


In [39]:
model_mse = mean_squared_error(y_test,y_pred)
model_mae = mean_absolute_error(y_test,y_pred)
model_rmse = np.sqrt(model_mse)
print(model_mse)
print(model_mae)
print(model_rmse)

0.8126422076234451
0.4808075608894946
0.9014666980113271


In [40]:
Adjusted_r2_score = (1 - (1-model_r2_score)*(len(y_test)-1)/(len(y_test)-X_test.shape[1]-1))
Adjusted_r2_score

0.9626655977242752

### New prediction

In [68]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 181673 entries, 0 to 181672
Data columns (total 13 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   quantity tons        181673 non-null  float64
 1   customer             181673 non-null  float64
 2   country              181673 non-null  float64
 3   status               181673 non-null  int64  
 4   item type            181673 non-null  int64  
 5   application          181673 non-null  float64
 6   thickness            181673 non-null  float64
 7   width                181673 non-null  float64
 8   product_ref          181673 non-null  int64  
 9   selling_price        181673 non-null  float64
 10  item_delivery_year   181673 non-null  int32  
 11  item_delivery_month  181673 non-null  int32  
 12  item_delivery_date   181673 non-null  int32  
dtypes: float64(7), int32(3), int64(3)
memory usage: 15.9 MB


In [None]:
fns_input = [np.log1p(float(quantity_tons)), float(customer), float(selected_country), int(status[selected_status]),
                    int(item_type[selected_item_type]), float(selected_application), float(np.log1p(thickness)), float(width), int(selected_product_ref),
                    int(item_year), int(item_month), int(item_date)]

In [39]:
status = {'Won': 116012, 'Lost': 34438, 'Not lost for AM': 19573, 'Revised': 4276, 'To be approved': 4170, 'Draft': 3140, 'Offered': 53, 'Offerable': 10, 'Wonderful': 1}

In [41]:
item_type = {'W': 105615, 'S': 69236, 'PL': 5660, 'Others': 610, 'WI': 524, 'IPL': 27, 'SLAWR': 1}

In [46]:
country = [ 28.,  25.,  30.,  32.,  38.,  78.,  27.,  77., 113.,  79.,  26., 39.,  40.,  84.,  80., 107.,  89.]

In [47]:
application = [10., 41., 28., 59., 15.,  4., 38., 56., 42., 26., 27., 19., 20., 66., 29., 22., 40., 25., 67., 79.,
               3., 99.,  2.,  5., 39., 69., 70., 65., 58., 68.]

In [48]:
product_ref = [1670798778, 1668701718,     628377,     640665,     611993, 1668701376,  164141591, 1671863738, 1332077137,     640405,
              1693867550, 1665572374, 1282007633, 1668701698,     628117, 1690738206,     628112,     640400, 1671876026,
              164336407, 164337175, 1668701725, 1665572032,     611728, 1721130331, 1693867563, 611733, 1690738219, 
               1722207579,  929423819, 1665584320, 1665584662, 1665584642]

In [45]:
df['customer'].describe()

count    1.816730e+05
mean     3.051221e+07
std      2.433375e+07
min      1.245800e+04
25%      3.019688e+07
50%      3.020524e+07
75%      3.028042e+07
max      2.147484e+09
Name: customer, dtype: float64

In [45]:
X_train.columns

Index(['customer', 'country', 'status', 'item_type', 'application',
       'thickness', 'width', 'product_ref', 'quantity_tons',
       'item_delivery_year', 'item_delivery_month', 'item_delivery_date'],
      dtype='object')

In [48]:
new_value = np.array([500, 250000, 32., 19573, 69236, 56., 15, 1540, 1668701376, 2003, 5, 25]).reshape(1,-1)

In [49]:
new_value

array([[5.00000000e+02, 2.50000000e+05, 3.20000000e+01, 1.95730000e+04,
        6.92360000e+04, 5.60000000e+01, 1.50000000e+01, 1.54000000e+03,
        1.66870138e+09, 2.00300000e+03, 5.00000000e+00, 2.50000000e+01]])

In [50]:
selling_price = (model.predict(new_value))**2

In [51]:
round(selling_price[0])

864

In [39]:
X_test

Unnamed: 0,quantity tons,customer,country,status,item type,application,thickness,width,product_ref,item_delivery_year,item_delivery_month,item_delivery_date
46308,2.238474,30160005.0,78.0,116012,105615,41.0,0.385262,1246.0,611993,2021,2,2
29893,4.624323,30394817.0,78.0,116012,105615,10.0,0.405465,1250.0,611993,2021,3,1
159082,4.663881,30161289.0,26.0,116012,69236,10.0,0.559616,1300.0,1668701718,2020,8,12
108098,3.544710,30203192.0,27.0,34438,69236,15.0,0.530628,1250.0,628377,2020,11,6
2960,2.436498,30153510.0,30.0,116012,105615,41.0,0.381855,1260.0,611993,2021,3,29
...,...,...,...,...,...,...,...,...,...,...,...,...
135201,3.653531,30287099.0,32.0,34438,105615,41.0,0.438255,1220.0,611993,2020,9,29
138613,1.678782,30205825.0,25.0,116012,105615,10.0,0.587787,1250.0,1332077137,2020,9,24
110531,5.350663,30201306.0,26.0,19573,69236,28.0,0.336472,1210.0,628377,2020,11,3
173592,4.373214,30155824.0,25.0,116012,105615,10.0,0.672944,1250.0,611993,2020,7,13


In [41]:
y_test

46308     34.785054
29893     34.655447
159082    27.073973
108098    27.313001
2960      35.905431
            ...    
135201    30.066593
138613    26.191602
110531    25.787594
173592    29.103264
26545     31.638584
Name: selling_price, Length: 45419, dtype: float64

In [43]:
x_test_26545 = np.array(X_test.loc[26545]).reshape(1,-1)

In [54]:
selling_price = (model.predict(x_test_26545))**2

In [57]:
round(selling_price[0])

1001

In [58]:
round(y_test.loc[26545]**2)

1001

In [41]:
import pickle

In [42]:
pickle.dump(model, open('copper_reg_model.pkl','wb'))

In [None]:
copper_model = pickle.load(open('copper_reg_model.pkl', 'rb'))

In [None]:
copper_model.predict(X_test[2].reshape(1,-1))

In [None]:
pickle.dump(scaler,open('transformation_copper_df.pkl','wb'))

In [43]:
import bz2file as bz2

In [44]:
def compressed_pickle(filename, data):
    with bz2.BZ2File(filename + '.pbz2', 'w') as f:
        pickle.dump(data, f)

In [45]:
compressed_pickle('copper_reg_model', model)

In [1]:
def decompress_pickle(file):
    data = bz2.BZ2File(file, 'rb')
    data = pickle.load(data)
    return data

In [41]:
import joblib
joblib.dump(model, 'copper_reg_model.joblib',compress=9)

['copper_reg_model.joblib']

In [42]:
uncompressed_model = joblib.load('copper_reg_model.joblib')

In [44]:
sp = (uncompressed_model.predict(x_test_26545))**2

In [45]:
round(sp[0])

1001

In [49]:
import gzip, pickle
with gzip.open('finalized_model.pkl.gz', 'wb') as ofp:
    pickle.dump(model, ofp)

In [50]:
with gzip.open('finalized_model.pkl.gz', 'rb') as ifp:
    uncompressed_model1 = pickle.load(ifp)

In [69]:
uncompressed_model1.predict(X_test[2].reshape(1,-1))

array([21.68635201])

In [67]:
fns_input = [np.log(quantity_tons), customer, selected_country, status[selected_status],
                    item_type[selected_item_type], selected_application, np.log(thickness), width, selected_product_ref,
                    item_year, item_month, item_date]

NameError: name 'quantity_tons' is not defined

In [9]:
import pkg_resources

package_name = 'numpy'  # Replace with the name of the package you want to check

# Get the version of the specific package
try:
    version = pkg_resources.get_distribution(package_name).version
    print(f"{package_name} - {version}")
except pkg_resources.DistributionNotFound:
    print(f"{package_name} is not installed.")

numpy - 1.26.1


In [5]:
import pkg_resources

# Get a list of installed packages and their versions
installed_packages = {pkg.key: pkg.version for pkg in pkg_resources.working_set}

# Print the list of packages and their versions
for package, version in installed_packages.items():
    print(f"{package} - {version}")


multidict - 6.0.4
async-timeout - 4.0.2
aiohttp - 3.8.5
crc32c - 2.3.post0
yarl - 1.9.2
frozenlist - 1.4.0
aiosignal - 1.3.1
codeium-jupyter - 1.1.21
regex - 2023.5.5
xyzservices - 2023.5.0
tifffile - 2023.4.12
pytz - 2023.3
tzdata - 2023.3
certifi - 2022.12.7
pywin32 - 306
setuptools - 65.5.0
cryptography - 40.0.2
pyzmq - 25.0.2
pip - 23.3.1
attrs - 23.1.0
packaging - 23.1
argon2-cffi - 21.3.0
argon2-cffi-bindings - 21.2.0
isoduration - 20.11.0
rich - 13.3.5
pyarrow - 12.0.0
pillow - 9.5.0
ipython - 8.12.0
tenacity - 8.2.2
jupyter-client - 8.2.0
thinc - 8.1.10
click - 8.1.3
mysql-connector-python - 8.0.32
ipywidgets - 8.0.6
nbconvert - 7.3.1
lab - 7.3
ipykernel - 6.22.0
jupyter-console - 6.6.3
importlib-metadata - 6.6.0
notebook - 6.5.4
tornado - 6.3.1
smart-open - 6.3.0
bleach - 6.0.0
pyyaml - 6.0
plotly - 5.14.1
psutil - 5.9.5
traitlets - 5.9.0
nbformat - 5.8.0
qtconsole - 5.4.2
cachetools - 5.3.0
jupyter-core - 5.3.0
decorator - 5.1.1
smmap - 5.0.0
tqdm - 4.65.0
fonttools - 4.39.3


In [None]:
python -m pip install --upgrade pip

In [5]:
!pip --version

pip 23.3 from C:\Python\Python310\lib\site-packages\pip (python 3.10)



In [1]:
!pip freeze > installed_packages.txt

In [2]:
!pip uninstall -y -r installed_packages.txt

ERROR: You must give at least one requirement to uninstall (see "pip help uninstall")


In [4]:
import pkg_resources
import subprocess

# Get a list of installed packages
installed_packages = {pkg.key for pkg in pkg_resources.working_set}

# Exclude 'pip' from the list of installed packages
if 'pip' in installed_packages:
    installed_packages.remove('pip')

# Uninstall each package one by one
for package in installed_packages:
    subprocess.check_call(['pip', 'uninstall', '-y', package])
