In [None]:
import seaborn as sns
import pandas as pd 
import numpy as np
import matplotlib.pyplot as plt
import pickle 
from sklearn.tree import DecisionTreeRegressor,DecisionTreeClassifier
from sklearn.preprocessing import OneHotEncoder,StandardScaler,LabelBinarizer
from sklearn.metrics import confusion_matrix,accuracy_score,r2_score,mean_squared_error,classification_report,roc_curve,auc
from sklearn.model_selection import GridSearchCV ,train_test_split
# Read the raw copper dataset from disk, then show its column labels.
csv_path = "D:/copperman/copperman.csv"
df = pd.read_csv(csv_path)
df.columns


In [None]:
# Report the (rows, columns) shape followed by the total number of cells.
for dimension in (df.shape, df.size):
    print(dimension)

In [None]:
# Replace the space-separated column labels with snake_case names (in place).
renamed_columns = {
    "quantity tons": "tons_quantity",
    "item type": "item_type",
    "delivery date": "delivery_date",
}
df.rename(columns=renamed_columns, inplace=True)


In [None]:
# Count the missing values in every column of df.
df.isnull().sum()

In [None]:

# Replace missing country codes with 0 so the column can be cast to int.
# The result is assigned back explicitly: the previous line used `==` (a
# comparison) and only worked through the side effect of a chained in-place
# fillna, which does not modify df when pandas Copy-on-Write is active.
df["country"] = df["country"].fillna(0)
df["country"] = df["country"].astype(int)
df.info()

In [None]:
# List the current column labels of df.
df.columns

In [None]:
# Display the item_type column.
df["item_type"]

In [None]:
# Coerce the numeric columns; entries that cannot be parsed become NaN.
numeric_columns = [
    "tons_quantity", "customer", "country", "application",
    "thickness", "width", "product_ref",
]
for column in numeric_columns:
    df[column] = pd.to_numeric(df[column], errors='coerce')

# Parse the YYYYMMDD date columns; invalid values become NaT and only the
# calendar date is kept.
for column in ("item_date", "delivery_date"):
    parsed = pd.to_datetime(df[column], format='%Y%m%d', errors='coerce')
    df[column] = parsed.dt.date

# Strip leading zeros from the material reference strings.
df["material_ref"] = df["material_ref"].str.lstrip('0')


In [None]:
# Count the missing values in every column of df.
df.isnull().sum()

In [None]:
# Summarise missing values per column, the frame's shape and its dtypes.
missing_values_count = df.isna().sum()
print(missing_values_count, df.shape, sep="\n")
df.info()

In [None]:
# Label missing material references as 'unknown', then drop every row that
# still has a missing value in any column.
# The fill is assigned back instead of using a chained `inplace=True` call on
# the column selection, which is deprecated in pandas 2.x and leaves df
# unchanged when Copy-on-Write is active.
df['material_ref'] = df['material_ref'].fillna('unknown')
df = df.dropna()

In [None]:
# Show the per-column missing counts and the frame's shape.
null_count = df.isna().sum()
print(null_count, df.shape, sep="\n")

In [None]:
# Skewness of the tons_quantity column.
df["tons_quantity"].skew()

In [None]:
# Plot the distribution of each numeric feature, one figure per column.
# `sns.distplot` is deprecated and scheduled for removal; `histplot` with a
# density histogram plus KDE (cut=3) is seaborn's documented equivalent.
for column in ['thickness', 'width', 'selling_price',
               'tons_quantity', 'country', 'application']:
    sns.histplot(df[column], kde=True, stat="density", kde_kws=dict(cut=3))
    plt.title(f"Distribution of {column}")
    plt.show()


In [None]:
# Non-positive values cannot be log-transformed, so they are replaced with NaN.
# Each count is printed before masking. numpy is already imported as np in the
# notebook's first cell, so the in-cell import was removed.
negsell = df['selling_price'] <= 0
print(negsell.sum())
df.loc[negsell, 'selling_price'] = np.nan

negtons = df['tons_quantity'] <= 0
print(negtons.sum())
df.loc[negtons, 'tons_quantity'] = np.nan

# The thickness mask used to be computed and counted but never applied; apply
# it like the other two columns so np.log never receives a non-positive value.
negth = df['thickness'] <= 0
print(negth.sum())
df.loc[negth, 'thickness'] = np.nan

In [None]:
# Show the per-column missing counts, drop incomplete rows in place, then show
# the counts again.
missing_before = df.isna().sum()
print(missing_before)
df.dropna(axis=0, how="any", inplace=True)
missing_after = df.isnull().sum()
print(missing_after)

In [None]:
# Number of rows currently in df.
len(df)

In [None]:
# Keep only strictly positive quantities (everything else becomes NaN) using a
# vectorised mask instead of a per-row Python lambda.
df['tons_quantity'] = df['tons_quantity'].where(df['tons_quantity'] > 0)
# Apply log transformation
df['tons_quantity_log'] = np.log(df['tons_quantity'])
# histplot replaces the deprecated sns.distplot (density histogram + KDE).
sns.histplot(df['tons_quantity_log'].dropna(), kde=True, stat="density",
             kde_kws=dict(cut=3))
plt.show()

In [None]:
# Natural log of thickness, then its distribution.
df['thickness_log'] = np.log(df['thickness'])
# histplot replaces the deprecated sns.distplot (density histogram + KDE).
sns.histplot(df['thickness_log'].dropna(), kde=True, stat="density",
             kde_kws=dict(cut=3))
plt.show()

In [None]:
# Natural log of the selling price, then its distribution.
# np.log does NOT handle zero or negative prices (it yields -inf / NaN); this
# relies on non-positive prices having been removed in the earlier cells.
df['selling_price_log'] = np.log(df['selling_price'])
# histplot replaces the deprecated sns.distplot (density histogram + KDE).
sns.histplot(df['selling_price_log'], kde=True, stat="density",
             kde_kws=dict(cut=3))
plt.show()

In [None]:
# Pairwise correlation of the numeric features and the log target, drawn as an
# annotated heatmap.
correlation_columns = [
    'tons_quantity_log', 'application', 'thickness_log', 'width',
    'selling_price_log', 'country', 'customer', 'product_ref',
]
x = df[correlation_columns].corr()
sns.heatmap(x, annot=True, cmap='plasma')

In [None]:
# Keep only the rows that have a log selling price.
df_cleaned = df[df['selling_price_log'].notna()]

# Feature frame (every column except the target) and the regression target.
y = df_cleaned['selling_price_log']
X = df_cleaned.drop(columns='selling_price_log')

In [None]:
# One-hot encode the two categorical columns. fit_transform already fits the
# encoder, so the separate fit() calls were redundant and have been dropped.
ohe = OneHotEncoder(handle_unknown='ignore')
X_ohe = ohe.fit_transform(X[['item_type']]).toarray()
ohe2 = OneHotEncoder(handle_unknown='ignore')
X_be = ohe2.fit_transform(X[['status']]).toarray()

# Independent features after encoding: numeric columns followed by the one-hot
# blocks for item_type and status. X stays unscaled from here on.
X = np.concatenate((X[['tons_quantity_log',
                       'application',
                       'thickness_log',
                       'width', 'country',
                       'customer',
                       'product_ref']].values,
                    X_ohe, X_be), axis=1)

# Split first, then fit the scaler on the training rows only, so the test
# rows' statistics do not leak into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)


In [None]:

# Decision tree regressor; seeded so that the 'sqrt'/'log2' feature
# subsampling and tie-breaking are reproducible between runs.
dtr = DecisionTreeRegressor(random_state=42)
# Hyperparameter grid for the search.
# max_features='auto' was deprecated in scikit-learn 1.1 and removed in 1.3,
# which makes every fit that uses it fail. For a regressor it meant "use all
# features", which is exactly what None selects.
param_grid = {'max_depth': [3, 5, 9, 17],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1, 2, 4],
              'max_features': [None, 'sqrt', 'log2']}

In [None]:
# Exhaustive 5-fold search over param_grid; fit() returns the fitted search.
grid_search = GridSearchCV(dtr, param_grid, cv=5).fit(X_train, y_train)
print("Best hyperparameters:", grid_search.best_params_)

# Keep the refitted best estimator and predict the held-out rows with it.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

In [None]:
# Evaluation metrics on the held-out split (targets are log selling prices).
mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
print('Mean squared error:', mse)
print('R-squared:', r2)

In [None]:
# Distinct values present in the item_type column.
df["item_type"].unique()

In [None]:
# Summary statistics of the width column.
df["width"].describe()

In [None]:
# One hand-written observation: seven numeric values followed by two string
# values. Mixing numbers and strings makes numpy build a string-typed array.
sample_values = [
    np.log(40), 99, np.log(2),
    5, 28, 30202938,
    1670798778, 'PL', '',
]
sample = np.array([sample_values])
sample

In [None]:

# Encode the two categorical entries of `sample` (columns 7 and 8) with the
# fitted encoders; categories unseen during fitting become all-zero rows
# because the encoders were built with handle_unknown='ignore'.
new_sample_ohe = ohe.transform(sample[:, [7]]).toarray()
new_sample_be = ohe2.transform(sample[:, [8]]).toarray()

# `sample` mixes numbers and strings, so numpy stores every entry as text.
# Cast the seven numeric columns back to float explicitly; otherwise a string
# array is handed to the scaler, which recent scikit-learn versions reject.
numeric_part = sample[:, [0, 1, 2, 3, 4, 5, 6]].astype(float)
new_sample = np.concatenate((numeric_part,
                             new_sample_ohe, new_sample_be), axis=1)
sc_sample1 = scaler.transform(new_sample)
new_pred = best_model.predict(sc_sample1)
# The model predicts log(price); exponentiate to get back to the price scale.
print('Predicted selling price:', np.exp(new_pred))

In [None]:
# Disabled export: the triple-quoted string below holds code that would pickle
# best_model, scaler, ohe and ohe2 to .pkl files in the working directory.
# Remove the surrounding quotes to run it.
'''
with open('decisiontreemodel.pkl', 'wb') as file:
    pickle.dump(best_model, file)
with open('standardscaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('onehotencoder.pkl', 'wb') as f:
    pickle.dump(ohe, f)
with open('onehotencoder2.pkl', 'wb') as f:
    pickle.dump(ohe2, f)'''

In [None]:
# Row count of the cleaned frame, followed by its first three rows.
print(df_cleaned.shape[0])
df_cleaned.head(n=3)

In [None]:
# Keep only the rows whose status is 'Won' or 'Lost', then report how many
# rows remain.
decided_mask = df_cleaned['status'].isin(['Won', 'Lost'])
dfc = df_cleaned.loc[decided_mask]
dfc.shape[0]

In [None]:
# Count the missing values in every column of dfc.
dfc.isna().sum()

In [None]:
# Optional CSV export, left disabled.
# NOTE(review): `df_4` is not defined in any visible cell — confirm which frame
# should be written before enabling this line.
#df_4.to_csv("D:/copperman/dataframe_coppermodel.csv",index=False)
# Preview the first rows of dfc.
dfc.head()

In [None]:
# Classification target (status) and the feature columns used to predict it.
Y = dfc['status']
X = dfc[['tons_quantity_log', 'selling_price_log',
         'item_type', 'application', 'thickness_log',
         'width', 'country', 'customer', 'product_ref']]

# Encode the categorical inputs. fit_transform already fits, so the separate
# fit() calls that preceded it were redundant and have been removed.
# Note: this rebinds `ohe`, replacing the encoder fitted for the regression.
ohe = OneHotEncoder(handle_unknown='ignore')
X_ohe = ohe.fit_transform(X[['item_type']]).toarray()

# LabelBinarizer orders classes alphabetically, so 'Lost' -> 0 and 'Won' -> 1;
# y is a column vector of those labels.
be = LabelBinarizer()
y = be.fit_transform(Y)

In [None]:
# Feature matrix after encoding: numeric columns followed by the one-hot
# item_type block. X stays unscaled from here on.
X = np.concatenate((X[['tons_quantity_log', 'selling_price_log', 'application',
                       'thickness_log', 'width', 'country',
                       'customer', 'product_ref']].values, X_ohe), axis=1)

# Split first, then fit the scaler on the training rows only, so the test
# rows' statistics do not leak into the preprocessing.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=18)
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

# Decision tree classifier, seeded so repeated runs build the same tree.
dtc = DecisionTreeClassifier(random_state=18)
dtc.fit(X_train, y_train)
y_pred = dtc.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy}")
cm = confusion_matrix(y_test, y_pred)
print(f"Confusion Matrix:\n{cm}")

In [None]:
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

# ROC curve and AUC.
# A ROC curve needs a continuous score per sample; feeding it the hard 0/1
# predictions collapses the curve to a single operating point. Use the
# predicted probability of the positive class (label 1) instead.
y_score = dtc.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(y_test, y_score)
roc_auc = auc(fpr, tpr)
plt.plot(fpr, tpr, label='ROC curve (area = %0.2f)' % roc_auc)
plt.plot([0, 1], [0, 1], 'k--')  # chance-level diagonal
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])  # rates never exceed 1; leave a small margin on top
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Receiver operating characteristic')
plt.legend(loc="lower right")  # keep the legend clear of the curve

plt.show()

In [None]:
# Predict the status for a new sample.
# Column order: tons_quantity_log, selling_price_log, application,
# thickness_log, width, country, customer, product_ref, item_type
new_sample = np.array([[np.log(700), np.log(956), 10, np.log(2), 1500, 28.0, 30202938, 1670798778, 'W']])
new_sample_ohe = ohe.transform(new_sample[:, [8]]).toarray()

# The trailing string makes numpy store every entry as text, so cast the eight
# numeric columns back to float before concatenating and scaling; otherwise a
# string array reaches the scaler, which recent scikit-learn versions reject.
numeric_part = new_sample[:, [0, 1, 2, 3, 4, 5, 6, 7]].astype(float)
new_sample = np.concatenate((numeric_part, new_sample_ohe), axis=1)
new_sample = scaler.transform(new_sample)
new_pred = dtc.predict(new_sample)

# Exactly one row was predicted; label 1 stands for 'Won' and 0 for 'Lost'.
if new_pred[0] == 1:
    print('The status is: Won')
else:
    print('The status is: Lost')

In [None]:
# Disabled export: the triple-quoted string below holds code that would pickle
# dtc, scaler and ohe to .pkl files in the working directory. Remove the
# surrounding quotes to run it.
'''
# Saving the model
import pickle
with open('classificationmodel.pkl', 'wb') as file:
    pickle.dump(dtc, file)
with open('classificationscaler.pkl', 'wb') as f:
    pickle.dump(scaler, f)
with open('classification_encoder_t.pkl', 'wb') as f:
    pickle.dump(ohe, f)'''