### Importing Libraries


In [2]:
import joblib
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import (
    accuracy_score,
    auc,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
    roc_auc_score,
    roc_curve,
)
from sklearn.model_selection import (
    GridSearchCV,
    RandomizedSearchCV,
    StratifiedKFold,
    cross_val_score,
    train_test_split,
)
from sklearn.preprocessing import MinMaxScaler, OneHotEncoder
from sklearn.tree import plot_tree

### Data collection

In [4]:
# Path of the dataset CSV, relative to the notebook's working directory.
datapath = r"3d-printing-model/dataset/dataset.csv"

# Read the CSV into the frame that every later cell works on.
DataFrame = pd.read_csv(datapath)

FileNotFoundError: [Errno 2] No such file or directory: '3d-printing-model/dataset/dataset.csv'

In [None]:
# Confirm the load and show the frame. pd.read_csv already returns a
# DataFrame, so it is displayed directly without being re-wrapped.
print("DataFrame loaded successfully.")
DataFrame

### Analyze the Data

In [None]:
# Show the first 5 rows as plain text, framed by separator rules.
print("First 5 rows of the DataFrame:")
first_five_rows = DataFrame.head(5)
first_five_rows_output = pd.DataFrame(
    first_five_rows.values,
    columns=first_five_rows.columns,
    index=first_five_rows.index,
)
for line in ("="*170, first_five_rows_output.to_string(), "="*170):
    print(line)



In [None]:
# Show the last 5 rows as plain text, framed by separator rules.
print("Last 5 rows of the DataFrame:")
last_rows = DataFrame.tail(5)
formatted_output = pd.DataFrame(
    last_rows.values,
    columns=last_rows.columns,
    index=last_rows.index,
)
for line in ("="*170, formatted_output.to_string(), "="*170):
    print(line)

In [None]:
# Report the dimensions of the DataFrame: column count first, then row count.
n_rows, n_cols = DataFrame.shape
print(f"The shape of the DataFrame is: {n_cols} columns")
print(f"The shape of the DataFrame is: {n_rows} rows")

In [None]:
# Structural overview of the DataFrame (columns, non-null counts, dtypes).
print("="*80)
print("DATAFRAME STRUCTURE OVERVIEW".center(80))
print("="*80)
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called for that side effect only; printing its return value would just
# append a stray "None" line.
DataFrame.info()


In [None]:
# Descriptive statistics of the DataFrame under a centred banner.
banner_width = 150
print("=" * banner_width)
print("STATISTICAL SUMMARY OF THE DATAFRAME".center(banner_width))
print("=" * banner_width)
summary = DataFrame.describe()
summary

### Data cleaning

In [None]:
## Count rows that are exact duplicates of an earlier row
banner_rule = "=" * 50
print(banner_rule)
print("DUPLICATE VALUES CHECK".center(50))
print(banner_rule)
duplicate_mask = DataFrame.duplicated()
duplicate_count = duplicate_mask.sum()
print(f"Number of duplicate rows in the DataFrame: {duplicate_count}")

#### There are no duplicated rows present in the  DataFrame

In [None]:
# Number of missing values per column
null_mask = DataFrame.isnull()
null_values = null_mask.sum()
banner_rule = "=" * 50
print(banner_rule)
print("Null values in each column".center(50))
print(banner_rule)
print(null_values)

###### From the above code snippet we can see that there are no null values in the DataFrame.

In [None]:
## Unique values in each column
# One width is used for both the rules and the title so the title is
# actually centred under the rules.
banner_width = 240
print("=" * banner_width)
print("Unique values in each column".center(banner_width))
print("=" * banner_width)
for col in DataFrame.columns:
    # Series.unique() lists values in order of first appearance, so the output
    # is the same on every run (iteration order of a set of strings is not).
    print(f" {col} : {DataFrame[col].unique().tolist()}")


###### Looking at the unique values of each feature above, there is no inconsistent data in the DataFrame.

In [None]:
## Separating categorical (object dtype) and numerical columns
numerical_cols = DataFrame.select_dtypes(include=['number']).columns
categorical_cols = DataFrame.select_dtypes(include=['object']).columns

for label, cols in (
    ("Categorical columns:", categorical_cols),
    ("Numerical columns:", numerical_cols),
):
    print(label, list(cols))

### Exploratory Data Analysis

#### Univariate Analysis

In [None]:
## Printing the dtype of every column
for col, dtype in DataFrame.dtypes.items():
    print(f"{col}: {dtype}")

In [None]:
# Category shares of the 'material' and 'infill_pattern' features as pie charts
material_counts = DataFrame['material'].value_counts()
infill_values = DataFrame['infill_pattern'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(9, 8))
pie_specs = (
    (material_counts, 'Material Distribution'),
    (infill_values, 'Infill Pattern Distribution'),
)
for pie_ax, (counts, pie_title) in zip(axes, pie_specs):
    pie_ax.pie(counts, labels=counts.index, autopct='%1.1f%%', startangle=90)
    pie_ax.set_title(pie_title)
plt.show()

# The same counts as text, below the charts.
print(material_counts)
print('\n', infill_values)

##### 1. In the infill pattern distribution, honeycomb and grid are almost equally distributed, at 51% and 49%.
##### 2. In the material distribution, PLA (Polylactic Acid) and ABS (Acrylonitrile Butadiene Styrene) account for 55% and 45%.

In [None]:
# Skewness of every numerical feature. Each column is already a Series, so
# .skew() is called on it directly; no generic `data` name is bound at
# notebook level.
for col in numerical_cols:
    print(f"Skewness of {col} is:", DataFrame[col].skew())

##### 1. bed_temperature, elongation and print_speed are right-skewed, because their skewness values lie between 0.5 and 1.
##### 2. As a rule of thumb, a skewness below 0 indicates a left-skewed distribution; among the features, infill_density is left-skewed.
##### 3. All the remaining features are approximately symmetric, because their skewness values lie between 0.1 and 0.5.

In [None]:

# Histograms with a KDE overlay for four features, laid out on a 2x2 grid
# (filled row by row in the order listed below).
fig, ax = plt.subplots(2, 2, figsize=(10,5))
hist_specs = [
    ('nozzle_temperature', "green"),
    ('bed_temperature', "blue"),
    ('elongation', "orange"),
    ('print_speed', "yellow"),
]
for panel, (feature, colour) in zip(ax.flat, hist_specs):
    sns.histplot(DataFrame[feature], ax=panel, kde=True, color=colour, edgecolor="black")
    panel.set_ylabel('Frequency')
plt.tight_layout()
plt.show()



### Distribution of Features:
##### 1. The histogram shows values mostly concentrated between 220–230°C with a few lower and higher values in nozzle temperature.
##### 2. Elongation Distribution peaks around 1–2%, with some higher values above 2.5%.
##### 3. bed temperature Values cluster around 65–75°C, with a long tail stretching toward 100°C.
##### 4. The print speed histogram shows clusters at 40–60 mm/s, 60–70 mm/s, and a small group around 110–120 mm/s.

In [None]:


# Side-by-side box plots: infill density / tension strength on the left,
# roughness / fan speed on the right.
fig, ax = plt.subplots(1, 2, figsize=(10,4))
left_panel, right_panel = ax

left_series = [DataFrame['infill_density'], DataFrame['tension_strenght']]
left_panel.boxplot(left_series, tick_labels=['Infill Density', 'Tension Strength'])
left_panel.set_title('Boxplot of Infill Density and Tension Strength')

right_series = [DataFrame['roughness'], DataFrame['fan_speed']]
right_panel.boxplot(right_series, tick_labels=['Roughness', 'Fan Speed'])
right_panel.set_title('Boxplot of Roughness and Fan Speed')

plt.tight_layout()
plt.show()



### Bivariate Analysis

In [None]:
# Grouped bar chart of material counts, split by infill pattern.
plt.figure(figsize=(5,5))
sns.countplot(data=DataFrame, x='material', hue='infill_pattern', palette='Set3')
plt.show()

# Contingency table of the same two features (rows: material, columns: infill pattern).
cross_tab = pd.crosstab(index=DataFrame['material'], columns=DataFrame['infill_pattern'])
print(cross_tab)


In [None]:
# Scatter plots against elongation, coloured by material:
# roughness on the left panel, tension strength on the right panel.
fig, axs = plt.subplots(1, 2, figsize=(9, 4))

sns.scatterplot(data=DataFrame, x='elongation', y='roughness', hue='material', ax=axs[0], palette='Set1')
# The title names the two variables that are actually plotted on this panel.
axs[0].set_title('Roughness vs Elongation by Material')

sns.scatterplot(data=DataFrame, x='elongation', y='tension_strenght', hue='material', ax=axs[1], palette='Set2')
axs[1].set_title('Tension Strength vs Elongation by Material')

plt.tight_layout()
plt.show()

In [None]:

# Box plots of four numerical features grouped by material, on a 2x2 grid.
# Each title names the column that is plotted on its panel.
fig, axes = plt.subplots(2, 2, figsize=(9,5))
panel_specs = [
    ('infill_density', 'Infill Density by Material'),
    ('tension_strenght', 'Tension Strength by Material'),
    ('roughness', 'Roughness by Material'),
    ('print_speed', 'Print Speed by Material'),
]
for panel, (feature, panel_title) in zip(axes.flat, panel_specs):
    sns.boxplot(data=DataFrame, x='material', y=feature, ax=panel)
    panel.set_title(panel_title)
plt.tight_layout()
plt.show()


In [None]:
# Count outliers per numerical column with the 1.5 * IQR rule:
# a value is an outlier if it lies below Q1 - 1.5*IQR or above Q3 + 1.5*IQR.
numeric_frame = DataFrame[numerical_cols]
Q1 = numeric_frame.quantile(0.25)
Q3 = numeric_frame.quantile(0.75)
IQR = Q3 - Q1
lower_fence = Q1 - 1.5 * IQR
upper_fence = Q3 + 1.5 * IQR
outliers = ((numeric_frame < lower_fence) | (numeric_frame > upper_fence)).sum()
print("Number of outliers in each numerical column:")
print(outliers)

# Box plots of the numerical columns, leaving out the features listed below.
excluded_cols = ['elongation','layer_height','wall_thickness','infill_pattern','material']
cols_to_plot = [col for col in numerical_cols if col not in excluded_cols]
plt.figure(figsize=(6,4))
sns.boxplot(data=DataFrame[cols_to_plot], palette='Set3')
plt.xticks(rotation=45)
plt.show()

    



##### 1. There are 2 outliers in the bed temperature feature, which can be ignored.
##### 2. There are 12 outliers in the print speed feature. These make up about 18% of the data and cannot be removed because the dataset is small.

#### Multivariate analysis

In [None]:
# Correlation matrix of the numerical features, annotated to two decimals.
corr_matrix = DataFrame[numerical_cols].corr()
plt.figure(figsize=(10,8))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', fmt=".2f")
plt.xticks(rotation=45)
plt.show()

In [None]:
# Pairwise relationships between the features, coloured by material.
# sns.pairplot is a figure-level function that creates its own figure, so no
# plt.figure() call precedes it (that would only leave an empty extra figure).
sns.pairplot(DataFrame, hue='material', palette='Set2')
plt.show()

## one-Hot Encoding

#### One-hot encoding is used for the categorical columns because neither infill pattern nor material is an ordinal feature. Label encoding is suitable for ordinal categorical columns.

In [None]:
# One-hot encode the categorical columns; drop_first=True keeps k-1 dummy
# columns per feature by removing the first level.
DataFrame_encoded = pd.get_dummies(
    DataFrame,
    columns=categorical_cols,
    drop_first=True,
)

In [None]:
# Preview the first rows of the encoded frame.
DataFrame_encoded.head()

### Data splitting

In [None]:

# Target: the 'material_pla' dummy column. Features: every other column.
target_col = 'material_pla'
y = DataFrame_encoded[target_col]
x = DataFrame_encoded.drop(columns=[target_col])

##### The DataFrame 'DataFrame_encoded' is split into two variables: x (the independent variables) and y (the dependent variable).
##### The dependent variable is the output of the dataset, and the independent variables are all the remaining features.

In [None]:
# Rank the features by random-forest importance, fitted on all rows of x / y.
# The seed is fixed so the ranking is the same on every run.
model = RandomForestClassifier(random_state=42)
model.fit(x, y)
imp = model.feature_importances_
title = x.columns
Top_features = pd.Series(imp, index=title).sort_values(ascending=False)

# Horizontal bar chart, most important feature at the top.
plt.figure(figsize=(5,4))
sns.barplot(x=Top_features.values, y=Top_features.index, color='skyblue')
plt.xlabel('Importance')
plt.ylabel('Features')
plt.title('Random Forest Feature Importances')
plt.show()

In [None]:
## Normalizing the data: rescale every feature with MinMaxScaler
# NOTE(review): the scaler is fitted on all rows of x, i.e. before the
# train/test split, so the test rows influence the scaling. Consider fitting
# on the training rows only.
scaler = MinMaxScaler()
x_scaled = scaler.fit_transform(x)
# Carry over x's index as well as its column names so the scaled frame stays
# label-aligned with y even if the index is not the default 0..n-1 range.
x_scaled = pd.DataFrame(x_scaled, columns=x.columns, index=x.index)
x_scaled.head()


In [None]:
# Hold out 40% of the rows for testing; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x_scaled,
    y,
    test_size=0.4,
    random_state=42,
)

### Model Building

#### Since the task is to classify the material as ABS or PLA, a classification algorithm such as RandomForestClassifier is used.
#### Random Forest: An ensemble of decision trees that predicts by majority vote (classification) or averaging (regression); used for accurate and stable predictions while reducing overfitting.

In [None]:


# Seeded random-forest instance and the hyper-parameter grid for the search.
rf_model = RandomForestClassifier(random_state=42)
param_grid = dict(
    n_estimators=[100, 200, 300, 400, 500],
    max_depth=[2, 6, 8, 10],
    min_samples_split=[2, 5, 7],
    min_samples_leaf=[1, 2, 4, 5],
    max_features=['sqrt', 'log2', None],
)



In [None]:
# Exhaustive search over param_grid, scored by accuracy with shuffled,
# stratified 5-fold cross-validation on the training rows.
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
rf = RandomForestClassifier(random_state=42)

search_settings = dict(
    estimator=rf,
    param_grid=param_grid,
    scoring='accuracy',
    cv=cv,
    n_jobs=-1,            # use all available cores
    verbose=1,
    error_score='raise',  # fail loudly if any fit errors out
)
grid_search = GridSearchCV(**search_settings)
grid_search.fit(x_train, y_train)

print(grid_search.best_params_)

# Test-set predictions, reused by the evaluation cells below.
y_pred = grid_search.predict(x_test)



In [None]:
# Accuracy on the held-out test rows and on the training rows.
y_pred_test = grid_search.predict(x_test)
y_pred_train = grid_search.predict(x_train)

for split_name, truth, predicted in (
    ("test", y_test, y_pred_test),
    ("train", y_train, y_pred_train),
):
    print(f"{split_name} accuracy:", accuracy_score(truth, predicted))

#### K-Fold Cross-Validation:
######  A method where data is split into K parts, the model is trained on K‑1 parts and tested on the remaining part, repeated K times.

######  It is used  to give a more reliable estimate of model performance than a single train/test split.

###### Compared to train/test accuracy: Train/test accuracy shows performance on one split only, which can be optimistic or misleading, while K-Fold CV averages results across folds, showing both expected accuracy and stability (via CV std).

In [None]:
# Stratified 5-fold cross-validation of the whole grid search on the full
# scaled dataset (StratifiedKFold comes from the notebook's import cell).
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# Passing the GridSearchCV object re-runs the entire search inside each fold.
scoe = cross_val_score(grid_search, x_scaled, y, cv=cv, scoring='accuracy')
mean_accuracy = scoe.mean()
accuracy_std = scoe.std()
print("Mean accuracy:", mean_accuracy)
print("Standard deviation of CV:", accuracy_std)

In [None]:
# Confusion matrix of the test-set predictions.
cm = confusion_matrix(y_test, y_pred)
print(cm)

# The target is the binary 'material_pla' dummy, so the matrix has two
# classes. Labels are sorted by confusion_matrix, which puts the negative
# class (not PLA, i.e. ABS per the material description) first.
class_names = ['ABS', 'PLA']

plt.figure(figsize=(6,4))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=class_names, yticklabels=class_names)
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.title("Confusion Matrix Heatmap")
plt.show()

In [None]:
# Per-class report followed by the headline scores for the test predictions.
print("classification report:")
print(classification_report(y_test,y_pred))

scalar_metrics = (
    ("Accuracy:", accuracy_score),
    ("precision score:", precision_score),
    ("F1 score:", f1_score),
    ("Recall score:", recall_score),
)
for metric_label, metric_fn in scalar_metrics:
    print(metric_label, metric_fn(y_test, y_pred))

#### Case 1 – Good Fit:

###### Train accuracy and Test accuracy are both high and similar values.

###### The CV mean is close to the train/test accuracy and the CV standard deviation is low (<=1), so the model generalizes well.

#### Case 2 – Overfitting:

###### Train accuracy is high, but test accuracy is noticeably lower.

###### The CV mean is much lower than the train accuracy; the CV standard deviation may be high (>=1) or low (<=1). The model memorizes the data and is unstable.

#### Case 3 – Underfitting:

###### Train and Test accuracy both low.

###### The CV mean and the train/test accuracy are all too low; the CV standard deviation may be high (>=1) or low (<=1). The model is too simple and cannot learn the patterns.

#### The test and train accuracy are 0.92 and the mean CV accuracy is 0.92, which are similar values, and the standard deviation is 0.04, which is <=1. So the model is stable.

In [None]:
# Choose one tree to plot: the first tree of the tuned forest. The fitted
# search object is `grid_search`; its refitted best model is exposed as
# best_estimator_.
estimator = grid_search.best_estimator_.estimators_[0]

# Plot the tree. Feature names come from the training frame's columns; class
# names are listed in ascending class order (0 = not PLA / ABS, 1 = PLA).
plt.figure(figsize=(20,10))
plot_tree(estimator,
          feature_names=list(x_train.columns),
          class_names=['ABS', 'PLA'],
          filled=True,
          rounded=True,
          fontsize=12)
plt.show()

In [None]:
# ROC curve of the tuned model on the test set.
# The curve is built from the predicted probability of the positive class
# (second column of predict_proba), not from hard 0/1 predictions, which
# would collapse it to a single operating point.
y_score = grid_search.predict_proba(x_test)[:, 1]
fpr, tpr, threshold = roc_curve(y_test, y_score)

roc_auc_DT = auc(fpr, tpr)

plt.plot(fpr, tpr, label='AUC = %0.2f' % roc_auc_DT)
# Diagonal reference line: the ROC of a random classifier.
plt.plot([0, 1], [0, 1], linestyle='--', color='grey', label='Chance')
plt.xlabel("fpr")
plt.ylabel("tpr")
plt.title("roc_curve")
plt.legend()
plt.show()

In [None]:
# Persist the fitted search object (it predicts with the tuned forest).
# The object fitted in this notebook is `grid_search`.
# NOTE: the model was trained on MinMax-scaled features, so new inputs must be
# transformed with the fitted `scaler` before calling predict.
file_name='3d_print_model.pkl'
joblib.dump(grid_search,file_name)